{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage_max": 1.8650365471839905, "advantage_mean": 5.4016712935922584e-08, "advantage_min": -0.9214624911546707, "advantage_std": 0.999835692346096, "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.20358379185199738, "kl": 0.0, "lambda_div_used": 0.5, "learning_rate": 2e-08, "loss": -0.0, "reward": -0.03908593417145312, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03908593417145312, "reward_after_std": 0.8219119422137737, "reward_before_mean": 0.4897647276520729, "reward_before_std": 0.8290339298546314, "reward_change_max": 0.0007017925381660461, "reward_change_mean": -0.5288506411015987, "reward_change_min": -1.0365500748157501, "reward_change_std": 0.4204680975526571, "reward_std": 0.8219119869172573, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "advantage_max": 1.8198039829730988, "advantage_mean": 5.4637592672435176e-08, "advantage_min": -0.9130084365606308, "advantage_std": 0.9997541680932045, "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.18365833163261414, "kl": 0.0, "lambda_div_used": 0.5, "learning_rate": 4e-08, "loss": -0.0, "reward": -0.21404163353145123, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.21404163353145123, "reward_after_std": 0.4922399129718542, "reward_before_mean": 0.27539755403995514, "reward_before_std": 0.42092561535537243, "reward_change_max": 0.001632794737815857, "reward_change_mean": -0.48943919129669666, "reward_change_min": -0.7970554456114769, "reward_change_std": 0.3251637788489461, "reward_std": 0.4922399166971445, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "advantage_max": 1.8429554402828217, "advantage_mean": 1.862645104822036e-08, "advantage_min": -0.8803069293498993, "advantage_std": 0.9997477829456329, "completion_length": 3368.1041870117188, "epoch": 0.0034285714285714284, "grad_norm": 0.16678079962730408, "kl": 4.012882709503174e-05, "lambda_div_used": 0.5, "learning_rate": 6e-08, "loss": 0.0, "reward": -0.48418260738253593, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.48418260738253593, "reward_after_std": 0.4452684707939625, "reward_before_mean": -0.2069278322160244, "reward_before_std": 0.4541137106716633, "reward_change_max": 0.0003810301423072815, "reward_change_mean": -0.2772547733038664, "reward_change_min": -0.554124977439642, "reward_change_std": 0.22407594323158264, "reward_std": 0.44526849314570427, "rewards/cosine_scaled_reward": -0.17638059332966805, "rewards/format_reward": 0.14583333395421505, "step": 3 }, { "advantage_max": 1.9580544233322144, "advantage_mean": 9.934108535780695e-09, "advantage_min": -0.7727478072047234, "advantage_std": 0.9998557344079018, "completion_length": 2326.5833892822266, "epoch": 0.004571428571428572, "grad_norm": 0.2263989895582199, "kl": 4.519522190093994e-05, "lambda_div_used": 0.5, "learning_rate": 8e-08, "loss": 0.0, "reward": -0.028838554164394736, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.028838554164394736, "reward_after_std": 0.938622172921896, "reward_before_mean": 0.45526737440377474, "reward_before_std": 0.8286880534142256, "reward_change_max": 0.0, "reward_change_mean": -0.4841059297323227, "reward_change_min": -0.8776258826255798, "reward_change_std": 0.32474724017083645, "reward_std": 0.9386221915483475, "rewards/cosine_scaled_reward": -0.074449656996876, "rewards/format_reward": 0.6041666697710752, "step": 4 }, { "advantage_max": 1.9359291642904282, "advantage_mean": 3.6011140291947186e-08, "advantage_min": -0.7737014293670654, "advantage_std": 0.999842680990696, "completion_length": 3186.0625610351562, "epoch": 0.005714285714285714, "grad_norm": 0.18710263073444366, "kl": 4.155933856964111e-05, "lambda_div_used": 0.5, "learning_rate": 1e-07, "loss": 0.0, "reward": -0.3549866806715727, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3549866806715727, "reward_after_std": 0.774379301816225, "reward_before_mean": -0.08489098830614239, "reward_before_std": 0.7537979669868946, "reward_change_max": 0.001534678041934967, "reward_change_mean": -0.270095681771636, "reward_change_min": -0.5602017678320408, "reward_change_std": 0.23432399705052376, "reward_std": 0.774379301816225, "rewards/cosine_scaled_reward": -0.2091121654957533, "rewards/format_reward": 0.3333333432674408, "step": 5 }, { "advantage_max": 1.863465204834938, "advantage_mean": 3.539025805743279e-08, "advantage_min": -0.84102962911129, "advantage_std": 0.9996843636035919, "completion_length": 3105.354217529297, "epoch": 0.006857142857142857, "grad_norm": 0.20564018189907074, "kl": 5.272030830383301e-05, "lambda_div_used": 0.5, "learning_rate": 1.2e-07, "loss": 0.0, "reward": -0.4135122634470463, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.4135122634470463, "reward_after_std": 0.5014001121744514, "reward_before_mean": -0.09564527939073741, "reward_before_std": 0.513142878189683, "reward_change_max": 0.0008734241127967834, "reward_change_mean": -0.3178669649641961, "reward_change_min": -0.6814537905156612, "reward_change_std": 0.26048805261962116, "reward_std": 0.501400119625032, "rewards/cosine_scaled_reward": -0.1728226412087679, "rewards/format_reward": 0.25000000186264515, "step": 6 }, { "advantage_max": 1.8490984290838242, "advantage_mean": 3.1044085080367267e-09, "advantage_min": -0.9447271376848221, "advantage_std": 0.9998549297451973, "completion_length": 3099.562530517578, "epoch": 0.008, "grad_norm": 0.15761414170265198, "kl": 2.193450927734375e-05, "lambda_div_used": 0.5, "learning_rate": 1.4e-07, "loss": 0.0, "reward": -0.014719070866703987, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.014719070866703987, "reward_after_std": 0.8509783893823624, "reward_before_mean": 0.5388575592078269, "reward_before_std": 0.9079956337809563, "reward_change_max": 0.0003746822476387024, "reward_change_mean": -0.5535766407847404, "reward_change_min": -1.1588828563690186, "reward_change_std": 0.46872530598193407, "reward_std": 0.850978396832943, "rewards/cosine_scaled_reward": -0.011821209453046322, "rewards/format_reward": 0.5625000223517418, "step": 7 }, { "advantage_max": 1.855475664138794, "advantage_mean": -2.2972623803241277e-08, "advantage_min": -0.9538509175181389, "advantage_std": 0.9998169988393784, "completion_length": 2739.895851135254, "epoch": 0.009142857142857144, "grad_norm": 0.16170156002044678, "kl": 2.1321699023246765e-05, "lambda_div_used": 0.5, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 0.041294334921985865, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.041294334921985865, "reward_after_std": 0.7754815965890884, "reward_before_mean": 0.6523661861720029, "reward_before_std": 0.7343525104224682, "reward_change_max": 0.001979641616344452, "reward_change_mean": -0.6110718548297882, "reward_change_min": -0.9722144268453121, "reward_change_std": 0.4230197472497821, "reward_std": 0.7754816301167011, "rewards/cosine_scaled_reward": 0.08659974206238985, "rewards/format_reward": 0.47916667722165585, "step": 8 }, { "advantage_max": 1.8438913971185684, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.866303063929081, "advantage_std": 0.9998209476470947, "completion_length": 3056.8125610351562, "epoch": 0.010285714285714285, "grad_norm": 0.21665655076503754, "kl": 4.3138861656188965e-05, "lambda_div_used": 0.5, "learning_rate": 1.8e-07, "loss": 0.0, "reward": -0.22506628464907408, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.22506628464907408, "reward_after_std": 0.80972820520401, "reward_before_mean": 0.15861139819025993, "reward_before_std": 0.8846573233604431, "reward_change_max": 0.0, "reward_change_mean": -0.3836776837706566, "reward_change_min": -0.9843891076743603, "reward_change_std": 0.40112666599452496, "reward_std": 0.8097282461822033, "rewards/cosine_scaled_reward": -0.10819429811090231, "rewards/format_reward": 0.37500001303851604, "step": 9 }, { "advantage_max": 1.8683208376169205, "advantage_mean": 5.339582997976322e-08, "advantage_min": -0.854301206767559, "advantage_std": 0.999813862144947, "completion_length": 2680.1041870117188, "epoch": 0.011428571428571429, "grad_norm": 0.20287226140499115, "kl": 2.043694257736206e-05, "lambda_div_used": 0.5, "learning_rate": 2e-07, "loss": 0.0, "reward": -0.23507352324668318, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.23507352324668318, "reward_after_std": 0.7843089625239372, "reward_before_mean": 0.1418246179819107, "reward_before_std": 0.8295328728854656, "reward_change_max": 0.0011120587587356567, "reward_change_mean": -0.3768981248140335, "reward_change_min": -0.8887458518147469, "reward_change_std": 0.3657720573246479, "reward_std": 0.7843089960515499, "rewards/cosine_scaled_reward": -0.12700436916202307, "rewards/format_reward": 0.3958333432674408, "step": 10 }, { "advantage_max": 1.929537832736969, "advantage_mean": 9.934107758624577e-09, "advantage_min": -0.7646144963800907, "advantage_std": 0.9998139664530754, "completion_length": 3314.9375, "epoch": 0.012571428571428572, "grad_norm": 0.18723739683628082, "kl": 3.269314765930176e-05, "lambda_div_used": 0.5, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": -0.3749279286712408, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3749279286712408, "reward_after_std": 0.8660313133150339, "reward_before_mean": -0.14815278816968203, "reward_before_std": 0.8909500073641539, "reward_change_max": 0.006466515362262726, "reward_change_mean": -0.22677514608949423, "reward_change_min": -0.5190893076360226, "reward_change_std": 0.23491209978237748, "reward_std": 0.8660313449800014, "rewards/cosine_scaled_reward": -0.17824306967668235, "rewards/format_reward": 0.20833333767950535, "step": 11 }, { "advantage_max": 1.8934268653392792, "advantage_mean": -5.587935947293232e-09, "advantage_min": -0.7801559269428253, "advantage_std": 0.9998533800244331, "completion_length": 2306.3333740234375, "epoch": 0.013714285714285714, "grad_norm": 0.22430002689361572, "kl": 3.651529550552368e-05, "lambda_div_used": 0.5, "learning_rate": 2.4e-07, "loss": 0.0, "reward": -0.05954301607562229, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05954301607562229, "reward_after_std": 0.8166232109069824, "reward_before_mean": 0.4518513362854719, "reward_before_std": 0.771789163351059, "reward_change_max": 0.002845950424671173, "reward_change_mean": -0.5113943256437778, "reward_change_min": -1.007496863603592, "reward_change_std": 0.3824256267398596, "reward_std": 0.8166232109069824, "rewards/cosine_scaled_reward": -0.09699101699516177, "rewards/format_reward": 0.6458333414047956, "step": 12 }, { "advantage_max": 1.8332414776086807, "advantage_mean": 2.3593506426333732e-08, "advantage_min": -0.9173097312450409, "advantage_std": 0.9997749254107475, "completion_length": 3007.687530517578, "epoch": 0.014857142857142857, "grad_norm": 0.19439151883125305, "kl": 3.3482909202575684e-05, "lambda_div_used": 0.5, "learning_rate": 2.6e-07, "loss": 0.0, "reward": -0.28582077845931053, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.28582077845931053, "reward_after_std": 0.6129563190042973, "reward_before_mean": 0.1077382080256939, "reward_before_std": 0.6512869298458099, "reward_change_max": 0.0, "reward_change_mean": -0.3935589976608753, "reward_change_min": -0.8552064150571823, "reward_change_std": 0.33744460716843605, "reward_std": 0.6129563301801682, "rewards/cosine_scaled_reward": -0.09196422435343266, "rewards/format_reward": 0.2916666716337204, "step": 13 }, { "advantage_max": 1.8766373842954636, "advantage_mean": -9.313225468599029e-09, "advantage_min": -0.8543065041303635, "advantage_std": 0.9998165741562843, "completion_length": 2791.041717529297, "epoch": 0.016, "grad_norm": 0.1748839169740677, "kl": 2.9399991035461426e-05, "lambda_div_used": 0.5, "learning_rate": 2.8e-07, "loss": 0.0, "reward": -0.18133868090808392, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18133868090808392, "reward_after_std": 0.7546257749199867, "reward_before_mean": 0.2495754323899746, "reward_before_std": 0.7416385095566511, "reward_change_max": 0.0003454461693763733, "reward_change_mean": -0.43091410771012306, "reward_change_min": -0.8854171372950077, "reward_change_std": 0.3447839133441448, "reward_std": 0.7546257749199867, "rewards/cosine_scaled_reward": -0.10437896568328142, "rewards/format_reward": 0.4583333469927311, "step": 14 }, { "advantage_max": 1.921839565038681, "advantage_mean": 5.3395828314428684e-08, "advantage_min": -0.8221309706568718, "advantage_std": 0.9998021274805069, "completion_length": 2747.1875343322754, "epoch": 0.017142857142857144, "grad_norm": 0.20771348476409912, "kl": 3.0741095542907715e-05, "lambda_div_used": 0.5, "learning_rate": 3e-07, "loss": 0.0, "reward": -0.11654636077582836, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11654636077582836, "reward_after_std": 0.6426295302808285, "reward_before_mean": 0.39872846752405167, "reward_before_std": 0.5288418848067522, "reward_change_max": 0.0006920620799064636, "reward_change_mean": -0.5152748012915254, "reward_change_min": -0.812650166451931, "reward_change_std": 0.3277115412056446, "reward_std": 0.6426295377314091, "rewards/cosine_scaled_reward": 0.0014475611969828606, "rewards/format_reward": 0.3958333358168602, "step": 15 }, { "advantage_max": 1.838592454791069, "advantage_mean": 3.601114018092488e-08, "advantage_min": -0.9086092934012413, "advantage_std": 0.9996505901217461, "completion_length": 3414.9583740234375, "epoch": 0.018285714285714287, "grad_norm": 0.18958210945129395, "kl": 3.1381845474243164e-05, "lambda_div_used": 0.5, "learning_rate": 3.2e-07, "loss": 0.0, "reward": -0.5179620869457722, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5179620869457722, "reward_after_std": 0.5234112944453955, "reward_before_mean": -0.28896366874687374, "reward_before_std": 0.58364612236619, "reward_change_max": 0.0011472925543785095, "reward_change_mean": -0.22899843472987413, "reward_change_min": -0.5847333557903767, "reward_change_std": 0.25637041311711073, "reward_std": 0.523411313071847, "rewards/cosine_scaled_reward": -0.20698183961212635, "rewards/format_reward": 0.12500000186264515, "step": 16 }, { "advantage_max": 1.8927734047174454, "advantage_mean": 2.2351741790771484e-08, "advantage_min": -0.8768943846225739, "advantage_std": 0.999830462038517, "completion_length": 2313.9167251586914, "epoch": 0.019428571428571427, "grad_norm": 0.2875772714614868, "kl": 4.1112303733825684e-05, "lambda_div_used": 0.5, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": -0.0218079574406147, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0218079574406147, "reward_after_std": 0.7882238104939461, "reward_before_mean": 0.5311932638287544, "reward_before_std": 0.7641869075596333, "reward_change_max": 0.001342780888080597, "reward_change_mean": -0.5530012445524335, "reward_change_min": -1.0787059292197227, "reward_change_std": 0.42368356697261333, "reward_std": 0.7882238253951073, "rewards/cosine_scaled_reward": -0.026070039719343185, "rewards/format_reward": 0.5833333469927311, "step": 17 }, { "advantage_max": 1.8699724674224854, "advantage_mean": 2.5456150465341665e-08, "advantage_min": -0.8736857995390892, "advantage_std": 0.9997463449835777, "completion_length": 3016.1875228881836, "epoch": 0.02057142857142857, "grad_norm": 0.1541830599308014, "kl": 2.8382986783981323e-05, "lambda_div_used": 0.5, "learning_rate": 3.6e-07, "loss": 0.0, "reward": -0.3072060104459524, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.3072060104459524, "reward_after_std": 0.5597724877297878, "reward_before_mean": 0.08646659925580025, "reward_before_std": 0.5770146455615759, "reward_change_max": 0.0017676278948783875, "reward_change_mean": -0.39367261063307524, "reward_change_min": -0.765678558498621, "reward_change_std": 0.3168019922450185, "reward_std": 0.5597724877297878, "rewards/cosine_scaled_reward": -0.11301671247929335, "rewards/format_reward": 0.31250000558793545, "step": 18 }, { "advantage_max": 1.8800882250070572, "advantage_mean": 2.4835267176115394e-09, "advantage_min": -0.8829206228256226, "advantage_std": 0.9998331665992737, "completion_length": 2785.3750381469727, "epoch": 0.021714285714285714, "grad_norm": 0.16840744018554688, "kl": 2.9861927032470703e-05, "lambda_div_used": 0.5, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 0.03168467991054058, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03168467991054058, "reward_after_std": 0.834330890327692, "reward_before_mean": 0.6134747294709086, "reward_before_std": 0.8129289895296097, "reward_change_max": 0.0006548240780830383, "reward_change_mean": -0.5817900514230132, "reward_change_min": -1.0407536514103413, "reward_change_std": 0.4286394249647856, "reward_std": 0.8343308977782726, "rewards/cosine_scaled_reward": 0.06715403066482395, "rewards/format_reward": 0.47916668094694614, "step": 19 }, { "advantage_max": 1.8762269914150238, "advantage_mean": 6.829699084054397e-09, "advantage_min": -0.7452137358486652, "advantage_std": 0.9998760744929314, "completion_length": 2365.3958740234375, "epoch": 0.022857142857142857, "grad_norm": 0.18150842189788818, "kl": 2.7011847123503685e-05, "lambda_div_used": 0.5, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.11636773869395256, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11636773869395256, "reward_after_std": 0.9510068409144878, "reward_before_mean": 0.731377505348064, "reward_before_std": 0.9209488183259964, "reward_change_max": 0.0, "reward_change_mean": -0.6150098070502281, "reward_change_min": -1.2754083350300789, "reward_change_std": 0.5005144346505404, "reward_std": 0.9510068707168102, "rewards/cosine_scaled_reward": 0.011522093118401244, "rewards/format_reward": 0.7083333358168602, "step": 20 }, { "advantage_max": 1.9191124886274338, "advantage_mean": -3.228584943837376e-08, "advantage_min": -0.741646058857441, "advantage_std": 0.9997679218649864, "completion_length": 2737.666717529297, "epoch": 0.024, "grad_norm": 0.18668660521507263, "kl": 4.217773675918579e-05, "lambda_div_used": 0.5, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": -0.14399441180285066, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.14399441180285066, "reward_after_std": 0.8878922425210476, "reward_before_mean": 0.270708542317152, "reward_before_std": 0.8766209781169891, "reward_change_max": 0.0014019683003425598, "reward_change_mean": -0.41470295563340187, "reward_change_min": -0.8328092768788338, "reward_change_std": 0.3357093087397516, "reward_std": 0.8878922797739506, "rewards/cosine_scaled_reward": -0.09381241840310395, "rewards/format_reward": 0.4583333395421505, "step": 21 }, { "advantage_max": 1.9325433522462845, "advantage_mean": 6.364037852257809e-09, "advantage_min": -0.8162704780697823, "advantage_std": 0.9998474344611168, "completion_length": 1529.2292137145996, "epoch": 0.025142857142857144, "grad_norm": 0.3153725266456604, "kl": 5.3122639656066895e-05, "lambda_div_used": 0.5, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.18219375910121016, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18219375910121016, "reward_after_std": 0.7688724808394909, "reward_before_mean": 0.9053211729042232, "reward_before_std": 0.6001508925110102, "reward_change_max": 6.76959753036499e-05, "reward_change_mean": -0.7231274656951427, "reward_change_min": -1.1781792268157005, "reward_change_std": 0.445758156478405, "reward_std": 0.7688724920153618, "rewards/cosine_scaled_reward": 0.025577264837920666, "rewards/format_reward": 0.8541666753590107, "step": 22 }, { "advantage_max": 1.9284197837114334, "advantage_mean": 1.3038516710750514e-08, "advantage_min": -0.799706406891346, "advantage_std": 0.9998190701007843, "completion_length": 2694.687545776367, "epoch": 0.026285714285714287, "grad_norm": 0.23257306218147278, "kl": 2.7127563953399658e-05, "lambda_div_used": 0.5, "learning_rate": 4.6e-07, "loss": 0.0, "reward": -0.232659702480305, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.232659702480305, "reward_after_std": 0.7457499727606773, "reward_before_mean": 0.14918443333590403, "reward_before_std": 0.7016965411603451, "reward_change_max": 0.0005003586411476135, "reward_change_mean": -0.38184412755072117, "reward_change_min": -0.7390884645283222, "reward_change_std": 0.2957865409553051, "reward_std": 0.7457499802112579, "rewards/cosine_scaled_reward": -0.14415779197588563, "rewards/format_reward": 0.43750000186264515, "step": 23 }, { "advantage_max": 1.9204401075839996, "advantage_mean": 1.6453365864199654e-08, "advantage_min": -0.8407022133469582, "advantage_std": 0.9998438656330109, "completion_length": 3135.1250610351562, "epoch": 0.027428571428571427, "grad_norm": 0.16080714762210846, "kl": 3.184378147125244e-05, "lambda_div_used": 0.5, "learning_rate": 4.8e-07, "loss": 0.0, "reward": -0.15408260421827435, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15408260421827435, "reward_after_std": 0.9264589920639992, "reward_before_mean": 0.24006994348019361, "reward_before_std": 0.9225658029317856, "reward_change_max": 0.0, "reward_change_mean": -0.3941525584086776, "reward_change_min": -0.8041125237941742, "reward_change_std": 0.3279961505904794, "reward_std": 0.9264590125530958, "rewards/cosine_scaled_reward": -0.0778816994279623, "rewards/format_reward": 0.39583334513008595, "step": 24 }, { "advantage_max": 1.858852818608284, "advantage_mean": 6.0147916647323996e-09, "advantage_min": -0.8925201445817947, "advantage_std": 0.9998224526643753, "completion_length": 2680.000015258789, "epoch": 0.02857142857142857, "grad_norm": 0.18977677822113037, "kl": 3.3307820558547974e-05, "lambda_div_used": 0.5, "learning_rate": 5e-07, "loss": 0.0, "reward": -0.08414146304130554, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08414146304130554, "reward_after_std": 0.7848474644124508, "reward_before_mean": 0.42510347068309784, "reward_before_std": 0.8524695411324501, "reward_change_max": 0.003357619047164917, "reward_change_mean": -0.5092449325602502, "reward_change_min": -1.0072326622903347, "reward_change_std": 0.4247382814064622, "reward_std": 0.7848474718630314, "rewards/cosine_scaled_reward": -0.01661492884159088, "rewards/format_reward": 0.45833334885537624, "step": 25 }, { "advantage_max": 1.9238365292549133, "advantage_mean": 3.539025994481193e-08, "advantage_min": -0.8145479038357735, "advantage_std": 0.9997957423329353, "completion_length": 2942.8333740234375, "epoch": 0.029714285714285714, "grad_norm": 0.16458716988563538, "kl": 2.6823952794075012e-05, "lambda_div_used": 0.5, "learning_rate": 5.2e-07, "loss": 0.0, "reward": -0.12822037562727928, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.12822037562727928, "reward_after_std": 0.657489363104105, "reward_before_mean": 0.37513636983931065, "reward_before_std": 0.5635189693421125, "reward_change_max": 0.0009020194411277771, "reward_change_mean": -0.5033567762002349, "reward_change_min": -0.8012861534953117, "reward_change_std": 0.3210837971419096, "reward_std": 0.6574893668293953, "rewards/cosine_scaled_reward": -0.0728484783321619, "rewards/format_reward": 0.520833333954215, "step": 26 }, { "advantage_max": 1.8898457139730453, "advantage_mean": 3.282912131030713e-08, "advantage_min": -0.8456361517310143, "advantage_std": 0.9998165890574455, "completion_length": 3154.312545776367, "epoch": 0.030857142857142857, "grad_norm": 0.18873968720436096, "kl": 2.8768088668584824e-05, "lambda_div_used": 0.5, "learning_rate": 5.4e-07, "loss": 0.0, "reward": -0.19466266850940883, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19466266850940883, "reward_after_std": 0.7585207261145115, "reward_before_mean": 0.22541327867656946, "reward_before_std": 0.755299074575305, "reward_change_max": 0.0006344020366668701, "reward_change_mean": -0.4200759269297123, "reward_change_min": -0.8561866730451584, "reward_change_std": 0.3406699551269412, "reward_std": 0.7585207633674145, "rewards/cosine_scaled_reward": -0.053960046730935574, "rewards/format_reward": 0.33333333767950535, "step": 27 }, { "advantage_max": 1.8525895327329636, "advantage_mean": 2.2972624136308184e-08, "advantage_min": -0.943960890173912, "advantage_std": 0.9997684359550476, "completion_length": 2793.0833587646484, "epoch": 0.032, "grad_norm": 0.20687302947044373, "kl": 3.4846365451812744e-05, "lambda_div_used": 0.5, "learning_rate": 5.6e-07, "loss": 0.0, "reward": -0.07525549922138453, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07525549922138453, "reward_after_std": 0.7445814274251461, "reward_before_mean": 0.447348415851593, "reward_before_std": 0.7081887386739254, "reward_change_max": 0.0002209991216659546, "reward_change_mean": -0.5226039234548807, "reward_change_min": -0.8660258539021015, "reward_change_std": 0.3674051659181714, "reward_std": 0.7445814423263073, "rewards/cosine_scaled_reward": -0.005492456257343292, "rewards/format_reward": 0.45833334885537624, "step": 28 }, { "advantage_max": 1.9153321534395218, "advantage_mean": 4.5324366704235786e-08, "advantage_min": -0.7868571989238262, "advantage_std": 0.9998080059885979, "completion_length": 3342.854248046875, "epoch": 0.03314285714285714, "grad_norm": 0.22355221211910248, "kl": 3.172457218170166e-05, "lambda_div_used": 0.5, "learning_rate": 5.8e-07, "loss": 0.0, "reward": -0.4529417622834444, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4529417622834444, "reward_after_std": 0.6748885735869408, "reward_before_mean": -0.2338971405988559, "reward_before_std": 0.6635385602712631, "reward_change_max": 0.002843029797077179, "reward_change_mean": -0.21904463577084243, "reward_change_min": -0.4862074926495552, "reward_change_std": 0.20299789123237133, "reward_std": 0.674888588488102, "rewards/cosine_scaled_reward": -0.22111523617058992, "rewards/format_reward": 0.20833333767950535, "step": 29 }, { "advantage_max": 1.8778761476278305, "advantage_mean": -1.2417634476236117e-08, "advantage_min": -0.8202168717980385, "advantage_std": 0.9998758137226105, "completion_length": 2829.6041870117188, "epoch": 0.03428571428571429, "grad_norm": 0.21529708802700043, "kl": 2.3565255105495453e-05, "lambda_div_used": 0.5, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.029207328334450722, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.029207328334450722, "reward_after_std": 1.0521993562579155, "reward_before_mean": 0.5473756925202906, "reward_before_std": 1.10447221621871, "reward_change_max": 0.0031651705503463745, "reward_change_mean": -0.5181683599948883, "reward_change_min": -1.1787523217499256, "reward_change_std": 0.47795812133699656, "reward_std": 1.0521993860602379, "rewards/cosine_scaled_reward": 0.023687828797847033, "rewards/format_reward": 0.5000000074505806, "step": 30 }, { "advantage_max": 1.8974759131669998, "advantage_mean": 5.463759289447978e-08, "advantage_min": -0.7851004675030708, "advantage_std": 0.9997349232435226, "completion_length": 3072.645835876465, "epoch": 0.03542857142857143, "grad_norm": 0.16128847002983093, "kl": 2.508610486984253e-05, "lambda_div_used": 0.5, "learning_rate": 6.2e-07, "loss": 0.0, "reward": -0.4339814521372318, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4339814521372318, "reward_after_std": 0.6195205058902502, "reward_before_mean": -0.17191330716013908, "reward_before_std": 0.6293110642582178, "reward_change_max": 0.0012357085943222046, "reward_change_mean": -0.26206816267222166, "reward_change_min": -0.6218099929392338, "reward_change_std": 0.24361183121800423, "reward_std": 0.6195205226540565, "rewards/cosine_scaled_reward": -0.200539980083704, "rewards/format_reward": 0.2291666716337204, "step": 31 }, { "advantage_max": 1.8547977358102798, "advantage_mean": 3.725291408684939e-09, "advantage_min": -0.9400533139705658, "advantage_std": 0.9998045861721039, "completion_length": 3167.854217529297, "epoch": 0.036571428571428574, "grad_norm": 0.16295260190963745, "kl": 2.5317072868347168e-05, "lambda_div_used": 0.5, "learning_rate": 6.4e-07, "loss": 0.0, "reward": -0.15984043339267373, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15984043339267373, "reward_after_std": 0.6901145968586206, "reward_before_mean": 0.31627153418958187, "reward_before_std": 0.7256025932729244, "reward_change_max": 0.0024649351835250854, "reward_change_mean": -0.4761119820177555, "reward_change_min": -0.9442967176437378, "reward_change_std": 0.3847173471003771, "reward_std": 0.6901146098971367, "rewards/cosine_scaled_reward": -0.05019756080582738, "rewards/format_reward": 0.4166666753590107, "step": 32 }, { "advantage_max": 1.8590810298919678, "advantage_mean": 8.071461832237503e-09, "advantage_min": -0.8207400739192963, "advantage_std": 0.9998291581869125, "completion_length": 3435.3959045410156, "epoch": 0.037714285714285714, "grad_norm": 0.1503303498029709, "kl": 3.756582736968994e-05, "lambda_div_used": 0.5, "learning_rate": 6.6e-07, "loss": 0.0, "reward": -0.2504210639744997, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2504210639744997, "reward_after_std": 0.8694077096879482, "reward_before_mean": 0.08852381771430373, "reward_before_std": 0.9110365584492683, "reward_change_max": 0.0003108680248260498, "reward_change_mean": -0.3389448709785938, "reward_change_min": -0.7708389461040497, "reward_change_std": 0.32376272417604923, "reward_std": 0.8694077171385288, "rewards/cosine_scaled_reward": -0.07032142765820026, "rewards/format_reward": 0.2291666716337204, "step": 33 }, { "advantage_max": 1.8898741006851196, "advantage_mean": 1.1175870862079051e-07, "advantage_min": -0.8010011166334152, "advantage_std": 0.9996843114495277, "completion_length": 2295.7083587646484, "epoch": 0.038857142857142854, "grad_norm": 0.2735695242881775, "kl": 3.103911876678467e-05, "lambda_div_used": 0.5, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": -0.04704808286624029, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.04704808286624029, "reward_after_std": 0.8338280580937862, "reward_before_mean": 0.4671720117330551, "reward_before_std": 0.7878899574279785, "reward_change_max": 0.00018334388732910156, "reward_change_mean": -0.5142201161943376, "reward_change_min": -0.9473587274551392, "reward_change_std": 0.3720804797485471, "reward_std": 0.8338280990719795, "rewards/cosine_scaled_reward": -0.03724732855334878, "rewards/format_reward": 0.5416666716337204, "step": 34 }, { "advantage_max": 1.829305186867714, "advantage_mean": 2.6077032810878364e-08, "advantage_min": -0.8712577894330025, "advantage_std": 0.9998411163687706, "completion_length": 2964.6458435058594, "epoch": 0.04, "grad_norm": 0.20366409420967102, "kl": 4.675239324569702e-05, "lambda_div_used": 0.5, "learning_rate": 7e-07, "loss": 0.0, "reward": -0.009542571380734444, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.009542571380734444, "reward_after_std": 0.9506381116807461, "reward_before_mean": 0.514573335647583, "reward_before_std": 1.050828494131565, "reward_change_max": 0.00032019615173339844, "reward_change_mean": -0.5241158995777369, "reward_change_min": -1.2276764325797558, "reward_change_std": 0.5117328846827149, "reward_std": 0.9506381563842297, "rewards/cosine_scaled_reward": 0.05936999386176467, "rewards/format_reward": 0.3958333358168602, "step": 35 }, { "advantage_max": 1.8876190781593323, "advantage_mean": 9.344269968902807e-08, "advantage_min": -0.8404295146465302, "advantage_std": 0.9997332692146301, "completion_length": 3368.375030517578, "epoch": 0.04114285714285714, "grad_norm": 0.158416748046875, "kl": 3.793835639953613e-05, "lambda_div_used": 0.5, "learning_rate": 7.2e-07, "loss": 0.0, "reward": -0.518017141148448, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.518017141148448, "reward_after_std": 0.5544664487242699, "reward_before_mean": -0.3090785853564739, "reward_before_std": 0.5590341556817293, "reward_change_max": 0.0015523284673690796, "reward_change_mean": -0.20893854810856283, "reward_change_min": -0.4174285866320133, "reward_change_std": 0.1820876558776945, "reward_std": 0.5544664710760117, "rewards/cosine_scaled_reward": -0.25870596151798964, "rewards/format_reward": 0.2083333395421505, "step": 36 }, { "advantage_max": 1.832179456949234, "advantage_mean": 9.390836297473726e-08, "advantage_min": -0.9252327382564545, "advantage_std": 0.9997186735272408, "completion_length": 3271.250015258789, "epoch": 0.04228571428571429, "grad_norm": 0.15664541721343994, "kl": 2.317875623703003e-05, "lambda_div_used": 0.5, "learning_rate": 7.4e-07, "loss": 0.0, "reward": -0.5315404608845711, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5315404608845711, "reward_after_std": 0.40248375572264194, "reward_before_mean": -0.27842903579585254, "reward_before_std": 0.40684795938432217, "reward_change_max": 0.0006852373480796814, "reward_change_mean": -0.2531114090234041, "reward_change_min": -0.4775106720626354, "reward_change_std": 0.20638726092875004, "reward_std": 0.40248377062380314, "rewards/cosine_scaled_reward": -0.2433811966329813, "rewards/format_reward": 0.20833333395421505, "step": 37 }, { "advantage_max": 1.8640215396881104, "advantage_mean": 9.126961608707518e-08, "advantage_min": -0.9243824407458305, "advantage_std": 0.999757893383503, "completion_length": 3210.7916870117188, "epoch": 0.04342857142857143, "grad_norm": 0.16199976205825806, "kl": 2.1124258637428284e-05, "lambda_div_used": 0.5, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": -0.39931502752006054, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.39931502752006054, "reward_after_std": 0.5454818941652775, "reward_before_mean": -0.0851539708673954, "reward_before_std": 0.545851357281208, "reward_change_max": 0.0036991089582443237, "reward_change_mean": -0.3141610324382782, "reward_change_min": -0.6185585148632526, "reward_change_std": 0.2539242282509804, "reward_std": 0.545481912791729, "rewards/cosine_scaled_reward": -0.13632699789013714, "rewards/format_reward": 0.1875, "step": 38 }, { "advantage_max": 1.8490605801343918, "advantage_mean": 4.6566132061443e-09, "advantage_min": -0.9282248802483082, "advantage_std": 0.99977096170187, "completion_length": 2869.5625228881836, "epoch": 0.044571428571428574, "grad_norm": 0.1589706391096115, "kl": 1.77919864654541e-05, "lambda_div_used": 0.5, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": -0.13201994262635708, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13201994262635708, "reward_after_std": 0.5264497548341751, "reward_before_mean": 0.4156036972999573, "reward_before_std": 0.45706650079227984, "reward_change_max": 0.0018109232187271118, "reward_change_mean": -0.5476236194372177, "reward_change_min": -0.8542361706495285, "reward_change_std": 0.35812338441610336, "reward_std": 0.526449766010046, "rewards/cosine_scaled_reward": -0.03178148064762354, "rewards/format_reward": 0.4791666753590107, "step": 39 }, { "advantage_max": 1.890770971775055, "advantage_mean": 3.911554902202852e-08, "advantage_min": -0.9282963797450066, "advantage_std": 0.9997633919119835, "completion_length": 2680.6250610351562, "epoch": 0.045714285714285714, "grad_norm": 0.1795395016670227, "kl": 3.270059823989868e-05, "lambda_div_used": 0.5, "learning_rate": 8e-07, "loss": 0.0, "reward": -0.15905895363539457, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15905895363539457, "reward_after_std": 0.5205851662904024, "reward_before_mean": 0.3587095569819212, "reward_before_std": 0.41430299170315266, "reward_change_max": 0.0008555799722671509, "reward_change_mean": -0.5177684919908643, "reward_change_min": -0.8109701350331306, "reward_change_std": 0.32442333083599806, "reward_std": 0.5205851849168539, "rewards/cosine_scaled_reward": -0.07064523361623287, "rewards/format_reward": 0.5000000093132257, "step": 40 }, { "advantage_max": 1.934581384062767, "advantage_mean": 6.5192582443529545e-09, "advantage_min": -0.7122581228613853, "advantage_std": 0.9998366460204124, "completion_length": 3073.5833740234375, "epoch": 0.046857142857142854, "grad_norm": 0.16301335394382477, "kl": 2.850499004125595e-05, "lambda_div_used": 0.5, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": -0.3758960599079728, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3758960599079728, "reward_after_std": 0.7847613207995892, "reward_before_mean": -0.12845315597951412, "reward_before_std": 0.7612991891801357, "reward_change_max": 0.0017989054322242737, "reward_change_mean": -0.24744290485978127, "reward_change_min": -0.5457473620772362, "reward_change_std": 0.22195276990532875, "reward_std": 0.7847613506019115, "rewards/cosine_scaled_reward": -0.22047658078372478, "rewards/format_reward": 0.31250000558793545, "step": 41 }, { "advantage_max": 1.9205241799354553, "advantage_mean": 2.483526917451684e-08, "advantage_min": -0.6978275701403618, "advantage_std": 0.999718151986599, "completion_length": 2810.895866394043, "epoch": 0.048, "grad_norm": 0.22402982413768768, "kl": 4.731118679046631e-05, "lambda_div_used": 0.5, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": -0.4632724979892373, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.4632724979892373, "reward_after_std": 0.550530057400465, "reward_before_mean": -0.20958980754949152, "reward_before_std": 0.535234902985394, "reward_change_max": 0.002125099301338196, "reward_change_mean": -0.25368270359467715, "reward_change_min": -0.5563341490924358, "reward_change_std": 0.21589954826049507, "reward_std": 0.5505300872027874, "rewards/cosine_scaled_reward": -0.27146157110109925, "rewards/format_reward": 0.3333333358168602, "step": 42 }, { "advantage_max": 1.9519257545471191, "advantage_mean": 1.055498932700516e-08, "advantage_min": -0.7456157505512238, "advantage_std": 0.9998450949788094, "completion_length": 3029.416717529297, "epoch": 0.04914285714285714, "grad_norm": 0.20059436559677124, "kl": 2.3126602172851562e-05, "lambda_div_used": 0.5, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": -0.2882131487131119, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2882131487131119, "reward_after_std": 0.8363650441169739, "reward_before_mean": 0.01781909167766571, "reward_before_std": 0.8098427765071392, "reward_change_max": 0.002855464816093445, "reward_change_mean": -0.30603223550133407, "reward_change_min": -0.7128670252859592, "reward_change_std": 0.2696162755601108, "reward_std": 0.8363650590181351, "rewards/cosine_scaled_reward": -0.14734045788645744, "rewards/format_reward": 0.31250000558793545, "step": 43 }, { "advantage_max": 1.921295627951622, "advantage_mean": 3.3527614351491764e-08, "advantage_min": -0.7192764729261398, "advantage_std": 0.9997594803571701, "completion_length": 2610.000030517578, "epoch": 0.05028571428571429, "grad_norm": 0.2608076333999634, "kl": 0.0001121722161769867, "lambda_div_used": 0.5, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": -0.2315717376768589, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2315717376768589, "reward_after_std": 0.7365182116627693, "reward_before_mean": 0.15951215103268623, "reward_before_std": 0.7295054513961077, "reward_change_max": 0.0021981820464134216, "reward_change_mean": -0.39108387008309364, "reward_change_min": -0.790928453207016, "reward_change_std": 0.3149452833458781, "reward_std": 0.7365182191133499, "rewards/cosine_scaled_reward": -0.1285772593691945, "rewards/format_reward": 0.41666667349636555, "step": 44 }, { "advantage_max": 1.8415375053882599, "advantage_mean": 7.078051655895479e-08, "advantage_min": -0.8002776876091957, "advantage_std": 0.9997911676764488, "completion_length": 3401.375, "epoch": 0.05142857142857143, "grad_norm": 0.15118639171123505, "kl": 4.731118679046631e-05, "lambda_div_used": 0.5, "learning_rate": 9e-07, "loss": 0.0, "reward": -0.3574506975710392, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3574506975710392, "reward_after_std": 0.7208534479141235, "reward_before_mean": -0.06056522950530052, "reward_before_std": 0.7741351164877415, "reward_change_max": 0.0, "reward_change_mean": -0.29688544757664204, "reward_change_min": -0.7843009307980537, "reward_change_std": 0.30331660620868206, "reward_std": 0.7208534702658653, "rewards/cosine_scaled_reward": -0.12403261481085792, "rewards/format_reward": 0.18750000558793545, "step": 45 }, { "advantage_max": 1.8736757040023804, "advantage_mean": -6.6744791915596124e-09, "advantage_min": -0.9290287122130394, "advantage_std": 0.9997085630893707, "completion_length": 3181.1875, "epoch": 0.052571428571428575, "grad_norm": 0.21306857466697693, "kl": 7.120147347450256e-05, "lambda_div_used": 0.5, "learning_rate": 9.2e-07, "loss": 0.0, "reward": -0.5937137454748154, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5937137454748154, "reward_after_std": 0.3615282103419304, "reward_before_mean": -0.3823595102876425, "reward_before_std": 0.37264155969023705, "reward_change_max": 0.00016885995864868164, "reward_change_mean": -0.21135425474494696, "reward_change_min": -0.4077131114900112, "reward_change_std": 0.17300888849422336, "reward_std": 0.3615282140672207, "rewards/cosine_scaled_reward": -0.26409642212092876, "rewards/format_reward": 0.1458333395421505, "step": 46 }, { "advantage_max": 1.8681392222642899, "advantage_mean": 2.5611371856637533e-08, "advantage_min": -0.8132197260856628, "advantage_std": 0.9998373538255692, "completion_length": 2791.8333892822266, "epoch": 0.053714285714285714, "grad_norm": 0.2283652275800705, "kl": 7.794797420501709e-05, "lambda_div_used": 0.5, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": -0.02218475693371147, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.02218475693371147, "reward_after_std": 0.8718568906188011, "reward_before_mean": 0.511185884475708, "reward_before_std": 0.9081144295632839, "reward_change_max": 0.00019782781600952148, "reward_change_mean": -0.5333706503733993, "reward_change_min": -1.1346662230789661, "reward_change_std": 0.4650296289473772, "reward_std": 0.8718568943440914, "rewards/cosine_scaled_reward": 0.005592945963144302, "rewards/format_reward": 0.5, "step": 47 }, { "advantage_max": 1.8958362936973572, "advantage_mean": 1.4280280513645494e-08, "advantage_min": -0.7991106733679771, "advantage_std": 0.9998115226626396, "completion_length": 2759.770866394043, "epoch": 0.054857142857142854, "grad_norm": 0.30551856756210327, "kl": 0.00015142187476158142, "lambda_div_used": 0.5, "learning_rate": 9.6e-07, "loss": 0.0, "reward": -0.21847828477621078, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.21847828477621078, "reward_after_std": 0.776937173679471, "reward_before_mean": 0.16552765760570765, "reward_before_std": 0.7491930667310953, "reward_change_max": 0.0014309212565422058, "reward_change_mean": -0.3840059507638216, "reward_change_min": -0.7702530585229397, "reward_change_std": 0.29957136139273643, "reward_std": 0.7769371904432774, "rewards/cosine_scaled_reward": -0.08390284143388271, "rewards/format_reward": 0.3333333358168602, "step": 48 }, { "advantage_max": 1.8815445601940155, "advantage_mean": 1.179675274132208e-08, "advantage_min": -0.8456504121422768, "advantage_std": 0.9998609572649002, "completion_length": 2267.3125381469727, "epoch": 0.056, "grad_norm": 0.24697095155715942, "kl": 0.00010534748435020447, "lambda_div_used": 0.5, "learning_rate": 9.8e-07, "loss": 0.0, "reward": -0.055962367448955774, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.055962367448955774, "reward_after_std": 0.9708420261740685, "reward_before_mean": 0.4173060655593872, "reward_before_std": 1.0131547879427671, "reward_change_max": 0.003098607063293457, "reward_change_mean": -0.4732684250921011, "reward_change_min": -1.1020850576460361, "reward_change_std": 0.44488269835710526, "reward_std": 0.9708420485258102, "rewards/cosine_scaled_reward": -0.07259697344852611, "rewards/format_reward": 0.5625000167638063, "step": 49 }, { "advantage_max": 1.8893340080976486, "advantage_mean": 1.8005570812107408e-08, "advantage_min": -0.8202431090176105, "advantage_std": 0.9997889772057533, "completion_length": 2962.0625076293945, "epoch": 0.05714285714285714, "grad_norm": 0.1640988290309906, "kl": 0.00016727298498153687, "lambda_div_used": 0.5, "learning_rate": 1e-06, "loss": 0.0, "reward": -0.09585870243608952, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09585870243608952, "reward_after_std": 0.7277687638998032, "reward_before_mean": 0.41637253458611667, "reward_before_std": 0.6608343506231904, "reward_change_max": 0.0, "reward_change_mean": -0.512231232598424, "reward_change_min": -0.9351947791874409, "reward_change_std": 0.3637898899614811, "reward_std": 0.727768812328577, "rewards/cosine_scaled_reward": 0.041519587859511375, "rewards/format_reward": 0.3333333358168602, "step": 50 }, { "advantage_max": 1.8653928488492966, "advantage_mean": 5.86733245322435e-08, "advantage_min": -0.9568943232297897, "advantage_std": 0.9997608214616776, "completion_length": 2327.958381652832, "epoch": 0.05828571428571429, "grad_norm": 0.24649883806705475, "kl": 0.00038273632526397705, "lambda_div_used": 0.5, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": -0.17880022956524044, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.17880022956524044, "reward_after_std": 0.6422858815640211, "reward_before_mean": 0.2922116946429014, "reward_before_std": 0.6047020759433508, "reward_change_max": 0.001704975962638855, "reward_change_mean": -0.4710119031369686, "reward_change_min": -0.8810345865786076, "reward_change_std": 0.34449261892586946, "reward_std": 0.6422859076410532, "rewards/cosine_scaled_reward": -0.10389416851103306, "rewards/format_reward": 0.5000000055879354, "step": 51 }, { "advantage_max": 1.823257952928543, "advantage_mean": -1.1796753018877837e-08, "advantage_min": -0.8818978518247604, "advantage_std": 0.9998767971992493, "completion_length": 2949.541717529297, "epoch": 0.05942857142857143, "grad_norm": 0.18382064998149872, "kl": 0.0005578286945819855, "lambda_div_used": 0.5, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": 0.14336604299023747, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14336604299023747, "reward_after_std": 1.1285717599093914, "reward_before_mean": 0.7415338456630707, "reward_before_std": 1.2546439096331596, "reward_change_max": 0.0019373148679733276, "reward_change_mean": -0.5981677994132042, "reward_change_min": -1.3573957942426205, "reward_change_std": 0.5901675391942263, "reward_std": 1.1285717971622944, "rewards/cosine_scaled_reward": 0.1311835777014494, "rewards/format_reward": 0.4791666753590107, "step": 52 }, { "advantage_max": 1.8426322937011719, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.9017970934510231, "advantage_std": 0.9998707324266434, "completion_length": 2719.041717529297, "epoch": 0.060571428571428575, "grad_norm": 0.18095125257968903, "kl": 0.000291973352432251, "lambda_div_used": 0.5, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": 0.19375211838632822, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.19375211838632822, "reward_after_std": 1.0046247728168964, "reward_before_mean": 0.8646276481449604, "reward_before_std": 1.0044534653425217, "reward_change_max": 0.005661614239215851, "reward_change_mean": -0.6708755232393742, "reward_change_min": -1.3468510583043098, "reward_change_std": 0.5442535653710365, "reward_std": 1.0046248212456703, "rewards/cosine_scaled_reward": 0.11981380297220312, "rewards/format_reward": 0.6250000111758709, "step": 53 }, { "advantage_max": 1.838746502995491, "advantage_mean": -8.07146127712599e-09, "advantage_min": -0.8344198316335678, "advantage_std": 0.9998571202158928, "completion_length": 2901.416717529297, "epoch": 0.061714285714285715, "grad_norm": 0.1705823391675949, "kl": 8.270889520645142e-05, "lambda_div_used": 0.5, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.13633309258148074, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13633309258148074, "reward_after_std": 0.9937352724373341, "reward_before_mean": 0.7676731869578362, "reward_before_std": 1.0381547771394253, "reward_change_max": 0.001669757068157196, "reward_change_mean": -0.6313401181250811, "reward_change_min": -1.3544641695916653, "reward_change_std": 0.5637638978660107, "reward_std": 0.9937352761626244, "rewards/cosine_scaled_reward": 0.14425326080527157, "rewards/format_reward": 0.4791666753590107, "step": 54 }, { "advantage_max": 1.8439057767391205, "advantage_mean": -6.829699583654758e-09, "advantage_min": -0.8918868899345398, "advantage_std": 0.9998131170868874, "completion_length": 3011.3333740234375, "epoch": 0.06285714285714286, "grad_norm": 0.15338917076587677, "kl": 0.0003665238618850708, "lambda_div_used": 0.5, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": -0.14612246677279472, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14612246677279472, "reward_after_std": 0.7486055754125118, "reward_before_mean": 0.31836483208462596, "reward_before_std": 0.7372772246599197, "reward_change_max": 0.0, "reward_change_mean": -0.46448732912540436, "reward_change_min": -0.9766540713608265, "reward_change_std": 0.39013639837503433, "reward_std": 0.7486055865883827, "rewards/cosine_scaled_reward": -0.028317579999566078, "rewards/format_reward": 0.3750000074505806, "step": 55 }, { "advantage_max": 1.8368095457553864, "advantage_mean": 2.7318797446440612e-08, "advantage_min": -0.9046461880207062, "advantage_std": 0.9997836649417877, "completion_length": 2900.312515258789, "epoch": 0.064, "grad_norm": 0.17617428302764893, "kl": 0.0005377233028411865, "lambda_div_used": 0.5, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": -0.13353440910577774, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13353440910577774, "reward_after_std": 0.6982099693268538, "reward_before_mean": 0.3613228676840663, "reward_before_std": 0.7062958665192127, "reward_change_max": 0.0013622492551803589, "reward_change_mean": -0.4948572628200054, "reward_change_min": -0.8601287193596363, "reward_change_std": 0.3771468782797456, "reward_std": 0.698210010305047, "rewards/cosine_scaled_reward": -0.04850525222718716, "rewards/format_reward": 0.45833334140479565, "step": 56 }, { "advantage_max": 1.885016068816185, "advantage_mean": 4.2219957807621e-08, "advantage_min": -0.8030908405780792, "advantage_std": 0.9997961819171906, "completion_length": 3282.9584045410156, "epoch": 0.06514285714285714, "grad_norm": 0.1289624273777008, "kl": 0.0001293867826461792, "lambda_div_used": 0.5, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": -0.23884075693786144, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.23884075693786144, "reward_after_std": 0.7863927576690912, "reward_before_mean": 0.13652504794299603, "reward_before_std": 0.8171067088842392, "reward_change_max": 0.0025532394647598267, "reward_change_mean": -0.3753657881170511, "reward_change_min": -0.8432562984526157, "reward_change_std": 0.3470982797443867, "reward_std": 0.78639280423522, "rewards/cosine_scaled_reward": -0.12965414859354496, "rewards/format_reward": 0.3958333395421505, "step": 57 }, { "advantage_max": 1.8578455299139023, "advantage_mean": 2.793967834868738e-08, "advantage_min": -0.8778782561421394, "advantage_std": 0.9998427554965019, "completion_length": 2379.1250228881836, "epoch": 0.06628571428571428, "grad_norm": 0.27240949869155884, "kl": 0.0014230608940124512, "lambda_div_used": 0.5, "learning_rate": 9.992983438818915e-07, "loss": 0.0001, "reward": -0.005995278595946729, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.005995278595946729, "reward_after_std": 0.8489437401294708, "reward_before_mean": 0.5475127007812262, "reward_before_std": 0.8708108924329281, "reward_change_max": 0.0017568692564964294, "reward_change_mean": -0.5535079715773463, "reward_change_min": -1.082301240414381, "reward_change_std": 0.4453202374279499, "reward_std": 0.8489437624812126, "rewards/cosine_scaled_reward": -0.049160322174429893, "rewards/format_reward": 0.645833345130086, "step": 58 }, { "advantage_max": 1.861334353685379, "advantage_mean": -1.179675268581093e-08, "advantage_min": -0.8184168860316277, "advantage_std": 0.9997863620519638, "completion_length": 2806.333351135254, "epoch": 0.06742857142857143, "grad_norm": 0.15535277128219604, "kl": 0.00013153068721294403, "lambda_div_used": 0.5, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": -0.2553709470666945, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2553709470666945, "reward_after_std": 0.6489655673503876, "reward_before_mean": 0.14917359128594398, "reward_before_std": 0.641713622957468, "reward_change_max": 0.0031897202134132385, "reward_change_mean": -0.4045445565134287, "reward_change_min": -0.8240447789430618, "reward_change_std": 0.3207780700176954, "reward_std": 0.6489655859768391, "rewards/cosine_scaled_reward": -0.09207986295223236, "rewards/format_reward": 0.3333333358168602, "step": 59 }, { "advantage_max": 1.911918118596077, "advantage_mean": 1.5543122344752192e-15, "advantage_min": -0.7607595697045326, "advantage_std": 0.9998063743114471, "completion_length": 2909.041702270508, "epoch": 0.06857142857142857, "grad_norm": 0.1574772149324417, "kl": 0.0002549951896071434, "lambda_div_used": 0.5, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": -0.24616998185229022, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.24616998185229022, "reward_after_std": 0.7582415752112865, "reward_before_mean": 0.12705273227766156, "reward_before_std": 0.7256910298019648, "reward_change_max": 0.0013196319341659546, "reward_change_mean": -0.3732227301225066, "reward_change_min": -0.6842942871153355, "reward_change_std": 0.2910933867096901, "reward_std": 0.7582416199147701, "rewards/cosine_scaled_reward": -0.13439030945301056, "rewards/format_reward": 0.39583333767950535, "step": 60 }, { "advantage_max": 1.8177462965250015, "advantage_mean": -1.241764135961887e-09, "advantage_min": -1.0356914550065994, "advantage_std": 0.9997951909899712, "completion_length": 2884.5834045410156, "epoch": 0.06971428571428571, "grad_norm": 0.1754026859998703, "kl": 0.00044707953929901123, "lambda_div_used": 0.5, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": -0.10852715838700533, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.10852715838700533, "reward_after_std": 0.7251477800309658, "reward_before_mean": 0.400545597076416, "reward_before_std": 0.7691365052014589, "reward_change_max": 0.0020439624786376953, "reward_change_mean": -0.5090727712959051, "reward_change_min": -0.9695079140365124, "reward_change_std": 0.41014579124748707, "reward_std": 0.725147807970643, "rewards/cosine_scaled_reward": -0.049727211240679026, "rewards/format_reward": 0.5000000111758709, "step": 61 }, { "advantage_max": 1.9007128179073334, "advantage_mean": 1.707424779340272e-08, "advantage_min": -0.7632358595728874, "advantage_std": 0.999834880232811, "completion_length": 2444.4583435058594, "epoch": 0.07085714285714285, "grad_norm": 0.20853900909423828, "kl": 0.0012095272541046143, "lambda_div_used": 0.5, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": 0.08043081499636173, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.08043081499636173, "reward_after_std": 0.8052340261638165, "reward_before_mean": 0.7084651309996843, "reward_before_std": 0.6865715570747852, "reward_change_max": 0.0, "reward_change_mean": -0.6280343104153872, "reward_change_min": -1.0744778029620647, "reward_change_std": 0.4058781899511814, "reward_std": 0.8052340373396873, "rewards/cosine_scaled_reward": 0.05214923154562712, "rewards/format_reward": 0.6041666697710752, "step": 62 }, { "advantage_max": 1.8943010717630386, "advantage_mean": -2.235174201281609e-08, "advantage_min": -0.8820522204041481, "advantage_std": 0.9998508021235466, "completion_length": 2203.541717529297, "epoch": 0.072, "grad_norm": 0.25720417499542236, "kl": 0.0009733438491821289, "lambda_div_used": 0.5, "learning_rate": 9.981479793771866e-07, "loss": 0.0, "reward": 0.106345753534697, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.106345753534697, "reward_after_std": 0.8436989672482014, "reward_before_mean": 0.7488799318671227, "reward_before_std": 0.7643895111978054, "reward_change_max": 0.00014873594045639038, "reward_change_mean": -0.6425342075526714, "reward_change_min": -1.0930566787719727, "reward_change_std": 0.43817601166665554, "reward_std": 0.843698974698782, "rewards/cosine_scaled_reward": 0.041106633958406746, "rewards/format_reward": 0.6666666772216558, "step": 63 }, { "advantage_max": 1.9071584939956665, "advantage_mean": 5.494803312355856e-08, "advantage_min": -0.8514663949608803, "advantage_std": 0.9998283535242081, "completion_length": 2759.000045776367, "epoch": 0.07314285714285715, "grad_norm": 0.1668003350496292, "kl": 0.00036591291427612305, "lambda_div_used": 0.5, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": 0.03588845953345299, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.03588845953345299, "reward_after_std": 0.8736652098596096, "reward_before_mean": 0.6065978556871414, "reward_before_std": 0.8280804753303528, "reward_change_max": 0.0005163624882698059, "reward_change_mean": -0.5707093523815274, "reward_change_min": -1.060088012367487, "reward_change_std": 0.4052878515794873, "reward_std": 0.8736652284860611, "rewards/cosine_scaled_reward": 0.05329891119617969, "rewards/format_reward": 0.5000000111758709, "step": 64 }, { "advantage_max": 1.9266762882471085, "advantage_mean": 6.208816460961941e-09, "advantage_min": -0.7601385116577148, "advantage_std": 0.9997923299670219, "completion_length": 2736.000015258789, "epoch": 0.07428571428571429, "grad_norm": 0.19557268917560577, "kl": 0.0006786584854125977, "lambda_div_used": 0.5, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": -0.2076890431344509, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2076890431344509, "reward_after_std": 0.6389360800385475, "reward_before_mean": 0.2348735888954252, "reward_before_std": 0.5559763349592686, "reward_change_max": 0.001060768961906433, "reward_change_mean": -0.4425626490265131, "reward_change_min": -0.8092819452285767, "reward_change_std": 0.32220305129885674, "reward_std": 0.6389361061155796, "rewards/cosine_scaled_reward": -0.10131320543587208, "rewards/format_reward": 0.43750000186264515, "step": 65 }, { "advantage_max": 1.9095334112644196, "advantage_mean": 2.452482830705982e-08, "advantage_min": -0.7046549618244171, "advantage_std": 0.9998053535819054, "completion_length": 2092.9583473205566, "epoch": 0.07542857142857143, "grad_norm": 0.24275831878185272, "kl": 0.0019240379333496094, "lambda_div_used": 0.5, "learning_rate": 9.971955636222684e-07, "loss": 0.0001, "reward": -0.10312021151185036, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.10312021151185036, "reward_after_std": 0.705897755920887, "reward_before_mean": 0.4020507410168648, "reward_before_std": 0.6098583452403545, "reward_change_max": 0.002116262912750244, "reward_change_mean": -0.5051709450781345, "reward_change_min": -0.9165816679596901, "reward_change_std": 0.3466827627271414, "reward_std": 0.7058977633714676, "rewards/cosine_scaled_reward": -0.05939129926264286, "rewards/format_reward": 0.520833333954215, "step": 66 }, { "advantage_max": 1.8870718479156494, "advantage_mean": 4.594524827261637e-08, "advantage_min": -0.7717634811997414, "advantage_std": 0.9997425973415375, "completion_length": 3376.8333435058594, "epoch": 0.07657142857142857, "grad_norm": 0.13818876445293427, "kl": 0.0005912370979785919, "lambda_div_used": 0.5, "learning_rate": 9.968344786479415e-07, "loss": 0.0, "reward": -0.5356982201337814, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5356982201337814, "reward_after_std": 0.5143481884151697, "reward_before_mean": -0.32520947977900505, "reward_before_std": 0.5225167237222195, "reward_change_max": 0.0011303573846817017, "reward_change_mean": -0.21048873430117965, "reward_change_min": -0.5026979595422745, "reward_change_std": 0.20696008298546076, "reward_std": 0.5143481958657503, "rewards/cosine_scaled_reward": -0.23552141524851322, "rewards/format_reward": 0.1458333395421505, "step": 67 }, { "advantage_max": 1.8678310364484787, "advantage_mean": -2.8871000701258254e-08, "advantage_min": -0.8665808811783791, "advantage_std": 0.9998535141348839, "completion_length": 2133.0417098999023, "epoch": 0.07771428571428571, "grad_norm": 0.22691960632801056, "kl": 0.001744091510772705, "lambda_div_used": 0.5, "learning_rate": 9.964516155915151e-07, "loss": 0.0001, "reward": -0.007227955153211951, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.007227955153211951, "reward_after_std": 0.9634606316685677, "reward_before_mean": 0.5006781555712223, "reward_before_std": 0.9779879823327065, "reward_change_max": 0.0024183765053749084, "reward_change_mean": -0.5079061184078455, "reward_change_min": -1.039794061332941, "reward_change_std": 0.4307109545916319, "reward_std": 0.9634606577455997, "rewards/cosine_scaled_reward": -0.05174426478333771, "rewards/format_reward": 0.604166679084301, "step": 68 }, { "advantage_max": 1.8920985758304596, "advantage_mean": 1.6142924885720333e-08, "advantage_min": -0.8405609056353569, "advantage_std": 0.9997691139578819, "completion_length": 2647.4166870117188, "epoch": 0.07885714285714286, "grad_norm": 0.27245891094207764, "kl": 0.0019397735595703125, "lambda_div_used": 0.5, "learning_rate": 9.960469931131936e-07, "loss": 0.0001, "reward": -0.3990805000066757, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3990805000066757, "reward_after_std": 0.5694083124399185, "reward_before_mean": -0.10020093433558941, "reward_before_std": 0.5409816838800907, "reward_change_max": 0.0004583820700645447, "reward_change_mean": -0.2988795740529895, "reward_change_min": -0.5763977244496346, "reward_change_std": 0.22476927004754543, "reward_std": 0.5694083385169506, "rewards/cosine_scaled_reward": -0.22718380577862263, "rewards/format_reward": 0.3541666753590107, "step": 69 }, { "advantage_max": 1.9086698144674301, "advantage_mean": 2.856055980604566e-08, "advantage_min": -0.8275108188390732, "advantage_std": 0.9997518509626389, "completion_length": 3076.375045776367, "epoch": 0.08, "grad_norm": 0.16705681383609772, "kl": 0.0011624768376350403, "lambda_div_used": 0.5, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": -0.4171713124960661, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.4171713124960661, "reward_after_std": 0.5041129477322102, "reward_before_mean": -0.10847730562090874, "reward_before_std": 0.46587206050753593, "reward_change_max": 0.0018154531717300415, "reward_change_mean": -0.30869400314986706, "reward_change_min": -0.5475729629397392, "reward_change_std": 0.22376791667193174, "reward_std": 0.5041129551827908, "rewards/cosine_scaled_reward": -0.20007198816165328, "rewards/format_reward": 0.29166667349636555, "step": 70 }, { "advantage_max": 1.8870573192834854, "advantage_mean": 1.1796752186210568e-08, "advantage_min": -0.7760294862091541, "advantage_std": 0.9997637048363686, "completion_length": 2732.1458740234375, "epoch": 0.08114285714285714, "grad_norm": 0.498686820268631, "kl": 0.007267266511917114, "lambda_div_used": 0.5, "learning_rate": 9.951725498333448e-07, "loss": 0.0003, "reward": -0.17455044807866216, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17455044807866216, "reward_after_std": 0.5208050534129143, "reward_before_mean": 0.3330681398510933, "reward_before_std": 0.3784195650368929, "reward_change_max": 0.0, "reward_change_mean": -0.5076185669749975, "reward_change_min": -0.7843321524560452, "reward_change_std": 0.31202565133571625, "reward_std": 0.5208050757646561, "rewards/cosine_scaled_reward": -0.02096594963222742, "rewards/format_reward": 0.3750000037252903, "step": 71 }, { "advantage_max": 1.8693189769983292, "advantage_mean": 2.6697914656814703e-08, "advantage_min": -0.8475881665945053, "advantage_std": 0.9998268261551857, "completion_length": 3021.166702270508, "epoch": 0.08228571428571428, "grad_norm": 0.2537727355957031, "kl": 0.0018652677536010742, "lambda_div_used": 0.5, "learning_rate": 9.947027716509488e-07, "loss": 0.0001, "reward": -0.30589374899864197, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.30589374899864197, "reward_after_std": 0.7303144559264183, "reward_before_mean": 0.0258680060505867, "reward_before_std": 0.7414015308022499, "reward_change_max": 0.0, "reward_change_mean": -0.33176175132393837, "reward_change_min": -0.7698645628988743, "reward_change_std": 0.3043969329446554, "reward_std": 0.7303144857287407, "rewards/cosine_scaled_reward": -0.1433160022716038, "rewards/format_reward": 0.3125000111758709, "step": 72 }, { "advantage_max": 1.87188358604908, "advantage_mean": 4.3461721443982526e-08, "advantage_min": -0.8683692142367363, "advantage_std": 0.99978356808424, "completion_length": 3466.187530517578, "epoch": 0.08342857142857144, "grad_norm": 0.15660125017166138, "kl": 0.00022289901971817017, "lambda_div_used": 0.5, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "reward": -0.3592198118567467, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3592198118567467, "reward_after_std": 0.6919657550752163, "reward_before_mean": -0.05832542385905981, "reward_before_std": 0.70926533639431, "reward_change_max": 0.0, "reward_change_mean": -0.3008943935856223, "reward_change_min": -0.6622928902506828, "reward_change_std": 0.27009566500782967, "reward_std": 0.6919657699763775, "rewards/cosine_scaled_reward": -0.14374604681506753, "rewards/format_reward": 0.22916666977107525, "step": 73 }, { "advantage_max": 1.937007024884224, "advantage_mean": 6.705522959116195e-08, "advantage_min": -0.726312592625618, "advantage_std": 0.9997691810131073, "completion_length": 3224.0000610351562, "epoch": 0.08457142857142858, "grad_norm": 0.16150522232055664, "kl": 0.0010072067379951477, "lambda_div_used": 0.5, "learning_rate": 9.93698216681727e-07, "loss": 0.0, "reward": -0.30428778287023306, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.30428778287023306, "reward_after_std": 0.693829420953989, "reward_before_mean": 0.036474743857979774, "reward_before_std": 0.6149825137108564, "reward_change_max": 0.0007465705275535583, "reward_change_mean": -0.3407625099644065, "reward_change_min": -0.6090046390891075, "reward_change_std": 0.23605277249589562, "reward_std": 0.6938294619321823, "rewards/cosine_scaled_reward": -0.08592929691076279, "rewards/format_reward": 0.2083333358168602, "step": 74 }, { "advantage_max": 1.9063157737255096, "advantage_mean": 1.3969838397187573e-08, "advantage_min": -0.7801737226545811, "advantage_std": 0.9998098164796829, "completion_length": 2924.5000610351562, "epoch": 0.08571428571428572, "grad_norm": 0.17792584002017975, "kl": 0.0008277297019958496, "lambda_div_used": 0.5, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "reward": -0.09559960942715406, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09559960942715406, "reward_after_std": 0.7040151692926884, "reward_before_mean": 0.4155034478753805, "reward_before_std": 0.5951447263360023, "reward_change_max": 0.00082358717918396, "reward_change_mean": -0.5111030461266637, "reward_change_min": -0.8385867662727833, "reward_change_std": 0.33908967301249504, "reward_std": 0.7040151953697205, "rewards/cosine_scaled_reward": -0.0005816244520246983, "rewards/format_reward": 0.4166666679084301, "step": 75 }, { "advantage_max": 1.898203819990158, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.8630974441766739, "advantage_std": 0.9997747763991356, "completion_length": 2809.229232788086, "epoch": 0.08685714285714285, "grad_norm": 0.1862030029296875, "kl": 0.0001981779932975769, "lambda_div_used": 0.5, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": -0.25100927520543337, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.25100927520543337, "reward_after_std": 0.5281644500792027, "reward_before_mean": 0.1934042815119028, "reward_before_std": 0.47337512113153934, "reward_change_max": 0.0, "reward_change_mean": -0.4444135669618845, "reward_change_min": -0.7713957540690899, "reward_change_std": 0.29939418844878674, "reward_std": 0.5281644649803638, "rewards/cosine_scaled_reward": -0.15329785831272602, "rewards/format_reward": 0.5000000093132257, "step": 76 }, { "advantage_max": 1.8327363729476929, "advantage_mean": 2.107117447192053e-08, "advantage_min": -1.0422530099749565, "advantage_std": 0.9997787699103355, "completion_length": 3040.812530517578, "epoch": 0.088, "grad_norm": 0.15248002111911774, "kl": 0.00022713467478752136, "lambda_div_used": 0.5, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": -0.3516826815903187, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3516826815903187, "reward_after_std": 0.5596727095544338, "reward_before_mean": 0.0043606823310256, "reward_before_std": 0.6001443788409233, "reward_change_max": 0.0011972561478614807, "reward_change_mean": -0.3560433629900217, "reward_change_min": -0.7085311934351921, "reward_change_std": 0.30133811570703983, "reward_std": 0.5596727319061756, "rewards/cosine_scaled_reward": -0.14365299977362156, "rewards/format_reward": 0.29166667722165585, "step": 77 }, { "advantage_max": 1.8811450600624084, "advantage_mean": 4.967053979232361e-08, "advantage_min": -0.7257413119077682, "advantage_std": 0.9998095110058784, "completion_length": 3252.8958587646484, "epoch": 0.08914285714285715, "grad_norm": 0.17430712282657623, "kl": 0.00013465189840644598, "lambda_div_used": 0.5, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "reward": -0.3133370358264074, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3133370358264074, "reward_after_std": 0.8261595070362091, "reward_before_mean": -0.015162130817770958, "reward_before_std": 0.8511116541922092, "reward_change_max": 0.0, "reward_change_mean": -0.2981749000027776, "reward_change_min": -0.7532649748027325, "reward_change_std": 0.295009003020823, "reward_std": 0.8261595349758863, "rewards/cosine_scaled_reward": -0.12216439709300175, "rewards/format_reward": 0.2291666753590107, "step": 78 }, { "advantage_max": 1.9250884354114532, "advantage_mean": 1.4280279958533981e-08, "advantage_min": -0.7625341862440109, "advantage_std": 0.9998122826218605, "completion_length": 2306.250015258789, "epoch": 0.09028571428571429, "grad_norm": 0.21141712367534637, "kl": 0.0023859739303588867, "lambda_div_used": 0.5, "learning_rate": 9.908088623197048e-07, "loss": 0.0001, "reward": -0.12412842572666705, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.12412842572666705, "reward_after_std": 0.7300403695553541, "reward_before_mean": 0.35858380049467087, "reward_before_std": 0.6866056565195322, "reward_change_max": 0.001516088843345642, "reward_change_mean": -0.48271221574395895, "reward_change_min": -0.8789361789822578, "reward_change_std": 0.3496302356943488, "reward_std": 0.7300403844565153, "rewards/cosine_scaled_reward": -0.12279144860804081, "rewards/format_reward": 0.6041666679084301, "step": 79 }, { "advantage_max": 1.9084865599870682, "advantage_mean": 4.594524782852716e-08, "advantage_min": -0.7381228767335415, "advantage_std": 0.999783106148243, "completion_length": 3179.9375610351562, "epoch": 0.09142857142857143, "grad_norm": 0.17955073714256287, "kl": 0.0006044581532478333, "lambda_div_used": 0.5, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": -0.27829574840143323, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.27829574840143323, "reward_after_std": 0.7623252347111702, "reward_before_mean": 0.07243560440838337, "reward_before_std": 0.7849831110797822, "reward_change_max": 0.0006520003080368042, "reward_change_mean": -0.3507313448935747, "reward_change_min": -0.913384735584259, "reward_change_std": 0.35589488223195076, "reward_std": 0.7623252794146538, "rewards/cosine_scaled_reward": -0.14086553594097495, "rewards/format_reward": 0.3541666679084301, "step": 80 }, { "advantage_max": 1.874961331486702, "advantage_mean": 2.980232349791834e-08, "advantage_min": -0.926294356584549, "advantage_std": 0.9997445642948151, "completion_length": 3036.6667098999023, "epoch": 0.09257142857142857, "grad_norm": 0.2611143887042999, "kl": 0.0016833841800689697, "lambda_div_used": 0.5, "learning_rate": 9.895025252503755e-07, "loss": 0.0001, "reward": -0.3727427292615175, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.3727427292615175, "reward_after_std": 0.48480862006545067, "reward_before_mean": -0.014494793489575386, "reward_before_std": 0.469564750790596, "reward_change_max": 0.001546420156955719, "reward_change_mean": -0.3582479301840067, "reward_change_min": -0.637864962220192, "reward_change_std": 0.26254395116120577, "reward_std": 0.48480862379074097, "rewards/cosine_scaled_reward": -0.15308072790503502, "rewards/format_reward": 0.29166667349636555, "step": 81 }, { "advantage_max": 1.9086092114448547, "advantage_mean": -1.862645193639878e-08, "advantage_min": -0.8542907983064651, "advantage_std": 0.9998050406575203, "completion_length": 2752.7083892822266, "epoch": 0.09371428571428571, "grad_norm": 0.19600576162338257, "kl": 0.00047707557678222656, "lambda_div_used": 0.5, "learning_rate": 9.888172094375033e-07, "loss": 0.0, "reward": -0.07845883443951607, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07845883443951607, "reward_after_std": 0.7290177270770073, "reward_before_mean": 0.4407756347209215, "reward_before_std": 0.6415084861218929, "reward_change_max": 0.0006015673279762268, "reward_change_mean": -0.5192344691604376, "reward_change_min": -0.8588821068406105, "reward_change_std": 0.34626587107777596, "reward_std": 0.7290177717804909, "rewards/cosine_scaled_reward": 0.03288781363517046, "rewards/format_reward": 0.3750000037252903, "step": 82 }, { "advantage_max": 1.8362404704093933, "advantage_mean": 2.669791410170319e-08, "advantage_min": -0.8356914222240448, "advantage_std": 0.9998195692896843, "completion_length": 2632.9375076293945, "epoch": 0.09485714285714286, "grad_norm": 0.1994089037179947, "kl": 0.0006115809082984924, "lambda_div_used": 0.5, "learning_rate": 9.881105062929221e-07, "loss": 0.0, "reward": -0.14021942391991615, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14021942391991615, "reward_after_std": 0.721308272331953, "reward_before_mean": 0.3346644751727581, "reward_before_std": 0.710454810410738, "reward_change_max": 0.0016935467720031738, "reward_change_mean": -0.4748838823288679, "reward_change_min": -0.9975110255181789, "reward_change_std": 0.3862521070986986, "reward_std": 0.7213082872331142, "rewards/cosine_scaled_reward": -0.04100109916180372, "rewards/format_reward": 0.41666667349636555, "step": 83 }, { "advantage_max": 1.8866354674100876, "advantage_mean": 1.6763806787167823e-08, "advantage_min": -0.8430695235729218, "advantage_std": 0.9998267069458961, "completion_length": 2886.812530517578, "epoch": 0.096, "grad_norm": 0.1975167691707611, "kl": 0.00033015012741088867, "lambda_div_used": 0.5, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": -0.05239881947636604, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05239881947636604, "reward_after_std": 0.8544594086706638, "reward_before_mean": 0.45468202233314514, "reward_before_std": 0.8344392105937004, "reward_change_max": 0.00045930594205856323, "reward_change_mean": -0.5070808418095112, "reward_change_min": -1.0089579410851002, "reward_change_std": 0.39830892719328403, "reward_std": 0.8544594198465347, "rewards/cosine_scaled_reward": -0.0018256474286317825, "rewards/format_reward": 0.45833334885537624, "step": 84 }, { "advantage_max": 1.8973801732063293, "advantage_mean": 2.1109978542988017e-08, "advantage_min": -0.8326839506626129, "advantage_std": 0.9998261779546738, "completion_length": 3018.3541870117188, "epoch": 0.09714285714285714, "grad_norm": 0.15980154275894165, "kl": 0.0002526193857192993, "lambda_div_used": 0.5, "learning_rate": 9.866330768241983e-07, "loss": 0.0, "reward": -0.1795702837407589, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1795702837407589, "reward_after_std": 0.8903343379497528, "reward_before_mean": 0.20754530653357506, "reward_before_std": 0.8969676159322262, "reward_change_max": 8.164346218109131e-05, "reward_change_mean": -0.3871155809611082, "reward_change_min": -0.9163610003888607, "reward_change_std": 0.3585868049412966, "reward_std": 0.8903343491256237, "rewards/cosine_scaled_reward": -0.0941440174356103, "rewards/format_reward": 0.3958333469927311, "step": 85 }, { "advantage_max": 1.9123211801052094, "advantage_mean": 4.1599076183729267e-08, "advantage_min": -0.7952956557273865, "advantage_std": 0.9997774809598923, "completion_length": 2792.125030517578, "epoch": 0.09828571428571428, "grad_norm": 0.20445193350315094, "kl": 0.0007953643798828125, "lambda_div_used": 0.5, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "reward": -0.12261645495891571, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12261645495891571, "reward_after_std": 0.6959991101175547, "reward_before_mean": 0.3699899148195982, "reward_before_std": 0.6057371087372303, "reward_change_max": 0.0005937442183494568, "reward_change_mean": -0.4926063437014818, "reward_change_min": -0.8540405631065369, "reward_change_std": 0.3248249962925911, "reward_std": 0.6959991175681353, "rewards/cosine_scaled_reward": -0.023338390979915857, "rewards/format_reward": 0.4166666679084301, "step": 86 }, { "advantage_max": 1.8586776107549667, "advantage_mean": 2.6077032810878364e-08, "advantage_min": -0.8346791416406631, "advantage_std": 0.9998296871781349, "completion_length": 2592.5625762939453, "epoch": 0.09942857142857142, "grad_norm": 0.24278458952903748, "kl": 0.0009903311729431152, "lambda_div_used": 0.5, "learning_rate": 9.850705248720068e-07, "loss": 0.0, "reward": -0.18643693253397942, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.18643693253397942, "reward_after_std": 0.7580613270401955, "reward_before_mean": 0.2400309145450592, "reward_before_std": 0.7840206138789654, "reward_change_max": 0.002095058560371399, "reward_change_mean": -0.4264678508043289, "reward_change_min": -0.8813573159277439, "reward_change_std": 0.3639759235084057, "reward_std": 0.7580613419413567, "rewards/cosine_scaled_reward": -0.1299845464527607, "rewards/format_reward": 0.5000000074505806, "step": 87 }, { "advantage_max": 1.9035822749137878, "advantage_mean": 9.93410831373609e-09, "advantage_min": -0.8028503619134426, "advantage_std": 0.9998452365398407, "completion_length": 2750.729248046875, "epoch": 0.10057142857142858, "grad_norm": 0.22822467982769012, "kl": 0.0008462667465209961, "lambda_div_used": 0.5, "learning_rate": 9.8425742251254e-07, "loss": 0.0, "reward": -0.1496799192391336, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1496799192391336, "reward_after_std": 0.8370900675654411, "reward_before_mean": 0.27654790971428156, "reward_before_std": 0.8350924924015999, "reward_change_max": 0.001572735607624054, "reward_change_mean": -0.42622782615944743, "reward_change_min": -0.7948319055140018, "reward_change_std": 0.33608314860612154, "reward_std": 0.8370900899171829, "rewards/cosine_scaled_reward": -0.11172605864703655, "rewards/format_reward": 0.5000000093132257, "step": 88 }, { "advantage_max": 1.8878425657749176, "advantage_mean": 3.476937759927523e-08, "advantage_min": -0.8848367482423782, "advantage_std": 0.999799333512783, "completion_length": 2829.854248046875, "epoch": 0.10171428571428572, "grad_norm": 0.1981220543384552, "kl": 0.0010972023010253906, "lambda_div_used": 0.5, "learning_rate": 9.83423155058946e-07, "loss": 0.0, "reward": -0.20833131205290556, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.20833131205290556, "reward_after_std": 0.756609233096242, "reward_before_mean": 0.1950739398598671, "reward_before_std": 0.7492302563041449, "reward_change_max": 0.0, "reward_change_mean": -0.40340525284409523, "reward_change_min": -0.8476431369781494, "reward_change_std": 0.33259575068950653, "reward_std": 0.7566092498600483, "rewards/cosine_scaled_reward": -0.100379703566432, "rewards/format_reward": 0.3958333395421505, "step": 89 }, { "advantage_max": 1.9099785536527634, "advantage_mean": 2.359350637082258e-08, "advantage_min": -0.7695377096533775, "advantage_std": 0.9997612684965134, "completion_length": 2356.7916946411133, "epoch": 0.10285714285714286, "grad_norm": 0.3218241333961487, "kl": 0.001089632511138916, "lambda_div_used": 0.5, "learning_rate": 9.825677631722435e-07, "loss": 0.0, "reward": -0.2230790453031659, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2230790453031659, "reward_after_std": 0.6019329931586981, "reward_before_mean": 0.21309048682451248, "reward_before_std": 0.5367362890392542, "reward_change_max": 0.0002956688404083252, "reward_change_mean": -0.43616954190656543, "reward_change_min": -0.7830835692584515, "reward_change_std": 0.30072727892547846, "reward_std": 0.6019330080598593, "rewards/cosine_scaled_reward": -0.1747047562384978, "rewards/format_reward": 0.5625, "step": 90 }, { "advantage_max": 1.8672983944416046, "advantage_mean": 1.0554989493538613e-08, "advantage_min": -0.8877752497792244, "advantage_std": 0.9998028874397278, "completion_length": 3059.791702270508, "epoch": 0.104, "grad_norm": 0.19541896879673004, "kl": 0.0007580630481243134, "lambda_div_used": 0.5, "learning_rate": 9.816912885430258e-07, "loss": 0.0, "reward": -0.25269458070397377, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.25269458070397377, "reward_after_std": 0.7357797585427761, "reward_before_mean": 0.12885426357388496, "reward_before_std": 0.7820154465734959, "reward_change_max": 0.0023800507187843323, "reward_change_mean": -0.3815488638356328, "reward_change_min": -0.803812101483345, "reward_change_std": 0.35552883241325617, "reward_std": 0.7357797957956791, "rewards/cosine_scaled_reward": -0.10223953444801737, "rewards/format_reward": 0.3333333469927311, "step": 91 }, { "advantage_max": 1.8726870566606522, "advantage_mean": 2.1730860888524717e-08, "advantage_min": -0.9580079317092896, "advantage_std": 0.9998205602169037, "completion_length": 2739.8541870117188, "epoch": 0.10514285714285715, "grad_norm": 0.1873386800289154, "kl": 0.0021462738513946533, "lambda_div_used": 0.5, "learning_rate": 9.807937738894303e-07, "loss": 0.0001, "reward": -0.14901877380907536, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.14901877380907536, "reward_after_std": 0.7104939520359039, "reward_before_mean": 0.3285065218806267, "reward_before_std": 0.7255417667329311, "reward_change_max": 0.001958779990673065, "reward_change_mean": -0.4775253050029278, "reward_change_min": -0.9158918745815754, "reward_change_std": 0.37975638918578625, "reward_std": 0.7104939669370651, "rewards/cosine_scaled_reward": -0.08574674651026726, "rewards/format_reward": 0.5000000149011612, "step": 92 }, { "advantage_max": 1.8517653942108154, "advantage_mean": 1.2728075243773063e-07, "advantage_min": -0.9585303515195847, "advantage_std": 0.9996733292937279, "completion_length": 3399.229217529297, "epoch": 0.10628571428571429, "grad_norm": 0.20226261019706726, "kl": 0.0012685954570770264, "lambda_div_used": 0.5, "learning_rate": 9.798752629550546e-07, "loss": 0.0001, "reward": -0.5985056199133396, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5985056199133396, "reward_after_std": 0.3514649849385023, "reward_before_mean": -0.3890782119706273, "reward_before_std": 0.3508653249591589, "reward_change_max": 0.0008473768830299377, "reward_change_mean": -0.2094273860566318, "reward_change_min": -0.3923211321234703, "reward_change_std": 0.16505926102399826, "reward_std": 0.3514649923890829, "rewards/cosine_scaled_reward": -0.23620576970279217, "rewards/format_reward": 0.0833333358168602, "step": 93 }, { "advantage_max": 1.9095520675182343, "advantage_mean": 5.960464588561365e-08, "advantage_min": -0.7972255274653435, "advantage_std": 0.9998055398464203, "completion_length": 3063.3541870117188, "epoch": 0.10742857142857143, "grad_norm": 0.17396514117717743, "kl": 0.0015499591827392578, "lambda_div_used": 0.5, "learning_rate": 9.78935800506826e-07, "loss": 0.0001, "reward": -0.24387888237833977, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.24387888237833977, "reward_after_std": 0.699499323964119, "reward_before_mean": 0.14552057534456253, "reward_before_std": 0.6129138586111367, "reward_change_max": 0.0001614391803741455, "reward_change_mean": -0.38939943816512823, "reward_change_min": -0.6290842294692993, "reward_change_std": 0.2671583485789597, "reward_std": 0.6994993314146996, "rewards/cosine_scaled_reward": -0.07307304441928864, "rewards/format_reward": 0.2916666716337204, "step": 94 }, { "advantage_max": 1.883233681321144, "advantage_mean": 5.743155395698807e-09, "advantage_min": -0.8746126741170883, "advantage_std": 0.9997252598404884, "completion_length": 3426.5833740234375, "epoch": 0.10857142857142857, "grad_norm": 0.20511451363563538, "kl": 0.0005182921886444092, "lambda_div_used": 0.5, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "reward": -0.5034449929371476, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5034449929371476, "reward_after_std": 0.5185970328748226, "reward_before_mean": -0.26567897759377956, "reward_before_std": 0.5346523299813271, "reward_change_max": 0.0007946863770484924, "reward_change_mean": -0.2377660358324647, "reward_change_min": -0.5584822706878185, "reward_change_std": 0.2266387245617807, "reward_std": 0.518597049638629, "rewards/cosine_scaled_reward": -0.22658948972821236, "rewards/format_reward": 0.1875000074505806, "step": 95 }, { "advantage_max": 1.919153854250908, "advantage_mean": 2.405916721404111e-08, "advantage_min": -0.8224275931715965, "advantage_std": 0.9998463094234467, "completion_length": 2780.291702270508, "epoch": 0.10971428571428571, "grad_norm": 0.18453562259674072, "kl": 0.001372743397951126, "lambda_div_used": 0.5, "learning_rate": 9.769942052400235e-07, "loss": 0.0001, "reward": -0.05859323777258396, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05859323777258396, "reward_after_std": 0.9193606749176979, "reward_before_mean": 0.41439007595181465, "reward_before_std": 0.8898796737194061, "reward_change_max": 0.0017773956060409546, "reward_change_mean": -0.47298329696059227, "reward_change_min": -0.8093426078557968, "reward_change_std": 0.3369905035942793, "reward_std": 0.9193606898188591, "rewards/cosine_scaled_reward": -0.011554960161447525, "rewards/format_reward": 0.43750000186264515, "step": 96 }, { "advantage_max": 1.860081598162651, "advantage_mean": 5.0912303484196286e-08, "advantage_min": -0.9762744233012199, "advantage_std": 0.9997392669320107, "completion_length": 2987.9375610351562, "epoch": 0.11085714285714286, "grad_norm": 0.20188908278942108, "kl": 0.0014988183975219727, "lambda_div_used": 0.5, "learning_rate": 9.759921670520634e-07, "loss": 0.0001, "reward": -0.10808135382831097, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.10808135382831097, "reward_after_std": 0.5511326994746923, "reward_before_mean": 0.44958585873246193, "reward_before_std": 0.4742696601897478, "reward_change_max": 0.00031263381242752075, "reward_change_mean": -0.557667214423418, "reward_change_min": -0.9108167290687561, "reward_change_std": 0.3545882785692811, "reward_std": 0.5511327031999826, "rewards/cosine_scaled_reward": 0.01645958609879017, "rewards/format_reward": 0.41666667722165585, "step": 97 }, { "advantage_max": 1.924290120601654, "advantage_mean": 2.7318796669284495e-08, "advantage_min": -0.8337865993380547, "advantage_std": 0.99977807700634, "completion_length": 2615.604202270508, "epoch": 0.112, "grad_norm": 0.19144760072231293, "kl": 0.0005843713879585266, "lambda_div_used": 0.5, "learning_rate": 9.749693666068663e-07, "loss": 0.0, "reward": -0.19257944263517857, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.19257944263517857, "reward_after_std": 0.6021185424178839, "reward_before_mean": 0.26773499604314566, "reward_before_std": 0.48827750235795975, "reward_change_max": 0.0028692707419395447, "reward_change_mean": -0.46031447034329176, "reward_change_min": -0.7352127730846405, "reward_change_std": 0.28656440041959286, "reward_std": 0.6021185610443354, "rewards/cosine_scaled_reward": -0.12654916709288955, "rewards/format_reward": 0.5208333432674408, "step": 98 }, { "advantage_max": 1.8842637687921524, "advantage_mean": -4.967053546245381e-09, "advantage_min": -0.8146334141492844, "advantage_std": 0.999771386384964, "completion_length": 2808.9791717529297, "epoch": 0.11314285714285714, "grad_norm": 0.19145610928535461, "kl": 0.0007163286209106445, "lambda_div_used": 0.5, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "reward": -0.25064975768327713, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.25064975768327713, "reward_after_std": 0.558937631547451, "reward_before_mean": 0.18323423340916634, "reward_before_std": 0.4840637035667896, "reward_change_max": 0.000568389892578125, "reward_change_mean": -0.43388404604047537, "reward_change_min": -0.7998382151126862, "reward_change_std": 0.29928822815418243, "reward_std": 0.5589376464486122, "rewards/cosine_scaled_reward": -0.0646328553557396, "rewards/format_reward": 0.3125, "step": 99 }, { "advantage_max": 1.883774295449257, "advantage_mean": 3.849466767569254e-08, "advantage_min": -0.8500948920845985, "advantage_std": 0.9998322054743767, "completion_length": 2592.9792098999023, "epoch": 0.11428571428571428, "grad_norm": 0.18909044563770294, "kl": 0.0011529922485351562, "lambda_div_used": 0.5, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "reward": -0.0791124738752842, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0791124738752842, "reward_after_std": 0.7873836122453213, "reward_before_mean": 0.4297027445281856, "reward_before_std": 0.7936761677265167, "reward_change_max": 0.000196017324924469, "reward_change_mean": -0.5088152242824435, "reward_change_min": -0.9765620827674866, "reward_change_std": 0.3931038361042738, "reward_std": 0.7873836234211922, "rewards/cosine_scaled_reward": -0.035148635506629944, "rewards/format_reward": 0.5000000093132257, "step": 100 }, { "advantage_max": 1.8153242319822311, "advantage_mean": 5.029142047252577e-08, "advantage_min": -0.938971072435379, "advantage_std": 0.9997990876436234, "completion_length": 2627.000045776367, "epoch": 0.11542857142857142, "grad_norm": 0.2275507003068924, "kl": 0.0010591745376586914, "lambda_div_used": 0.5, "learning_rate": 9.717768952713511e-07, "loss": 0.0, "reward": -0.19237697310745716, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19237697310745716, "reward_after_std": 0.6066364720463753, "reward_before_mean": 0.2787788547575474, "reward_before_std": 0.5884393192827702, "reward_change_max": 0.0011661723256111145, "reward_change_mean": -0.4711558297276497, "reward_change_min": -0.8935237973928452, "reward_change_std": 0.3549905549734831, "reward_std": 0.6066364757716656, "rewards/cosine_scaled_reward": -0.08977724611759186, "rewards/format_reward": 0.4583333395421505, "step": 101 }, { "advantage_max": 1.9111991822719574, "advantage_mean": 0.0, "advantage_min": -0.8480004072189331, "advantage_std": 0.9998131468892097, "completion_length": 2199.3958892822266, "epoch": 0.11657142857142858, "grad_norm": 0.28151506185531616, "kl": 0.002061605453491211, "lambda_div_used": 0.5, "learning_rate": 9.706715543782064e-07, "loss": 0.0001, "reward": -0.10260325577110052, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.10260325577110052, "reward_after_std": 0.7042684890329838, "reward_before_mean": 0.40410364978015423, "reward_before_std": 0.6136751640588045, "reward_change_max": 0.00042323023080825806, "reward_change_mean": -0.506706902757287, "reward_change_min": -0.8719289116561413, "reward_change_std": 0.3393041845411062, "reward_std": 0.7042685002088547, "rewards/cosine_scaled_reward": -0.14169819233939052, "rewards/format_reward": 0.687500013038516, "step": 102 }, { "advantage_max": 1.8461199253797531, "advantage_mean": 2.359350548264416e-08, "advantage_min": -0.8268741890788078, "advantage_std": 0.999856062233448, "completion_length": 2750.125057220459, "epoch": 0.11771428571428572, "grad_norm": 0.274533748626709, "kl": 0.0018423646688461304, "lambda_div_used": 0.5, "learning_rate": 9.695457105469804e-07, "loss": 0.0001, "reward": -0.1559738339856267, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.1559738339856267, "reward_after_std": 0.8337625414133072, "reward_before_mean": 0.27372268959879875, "reward_before_std": 0.9095275402069092, "reward_change_max": 0.0012431740760803223, "reward_change_mean": -0.42969651333987713, "reward_change_min": -0.9942761585116386, "reward_change_std": 0.422477787360549, "reward_std": 0.8337625414133072, "rewards/cosine_scaled_reward": -0.09230532869696617, "rewards/format_reward": 0.4583333432674408, "step": 103 }, { "advantage_max": 1.8895713835954666, "advantage_mean": 4.5324367148324995e-08, "advantage_min": -0.8674125224351883, "advantage_std": 0.9997276961803436, "completion_length": 2726.8958435058594, "epoch": 0.11885714285714286, "grad_norm": 0.2888650894165039, "kl": 0.0068280696868896484, "lambda_div_used": 0.5, "learning_rate": 9.683994186497132e-07, "loss": 0.0003, "reward": -0.2802076867665164, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2802076867665164, "reward_after_std": 0.45476071164011955, "reward_before_mean": 0.161182364448905, "reward_before_std": 0.399234589189291, "reward_change_max": 0.00044030696153640747, "reward_change_mean": -0.44139003101736307, "reward_change_min": -0.7332473956048489, "reward_change_std": 0.2831938583403826, "reward_std": 0.45476071909070015, "rewards/cosine_scaled_reward": -0.1173255043104291, "rewards/format_reward": 0.39583333395421505, "step": 104 }, { "advantage_max": 1.8745701760053635, "advantage_mean": 1.707424779340272e-08, "advantage_min": -0.8500468209385872, "advantage_std": 0.9998556524515152, "completion_length": 2416.8333587646484, "epoch": 0.12, "grad_norm": 0.21496868133544922, "kl": 0.0012444257736206055, "lambda_div_used": 0.5, "learning_rate": 9.672327345550543e-07, "loss": 0.0, "reward": 0.03997505363076925, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03997505363076925, "reward_after_std": 0.9438597373664379, "reward_before_mean": 0.5930667854845524, "reward_before_std": 0.9317218028008938, "reward_change_max": 0.0017218366265296936, "reward_change_mean": -0.5530917001888156, "reward_change_min": -1.0486153699457645, "reward_change_std": 0.429962957277894, "reward_std": 0.9438597410917282, "rewards/cosine_scaled_reward": 0.036116703413426876, "rewards/format_reward": 0.5208333414047956, "step": 105 }, { "advantage_max": 1.9127518236637115, "advantage_mean": -1.241763913917282e-09, "advantage_min": -0.7981174066662788, "advantage_std": 0.9998517706990242, "completion_length": 2194.812545776367, "epoch": 0.12114285714285715, "grad_norm": 0.21088606119155884, "kl": 0.004343807697296143, "lambda_div_used": 0.5, "learning_rate": 9.66045715125541e-07, "loss": 0.0002, "reward": 0.23806806560605764, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23806806560605764, "reward_after_std": 0.8330020643770695, "reward_before_mean": 0.9928077161312103, "reward_before_std": 0.6945897105615586, "reward_change_max": 0.00025169551372528076, "reward_change_mean": -0.7547396793961525, "reward_change_min": -1.2395295053720474, "reward_change_std": 0.4977700933814049, "reward_std": 0.8330021128058434, "rewards/cosine_scaled_reward": 0.15265384782105684, "rewards/format_reward": 0.6875000111758709, "step": 106 }, { "advantage_max": 1.8897164016962051, "advantage_mean": 3.973643103449831e-08, "advantage_min": -0.8593711704015732, "advantage_std": 0.9998006448149681, "completion_length": 2883.812515258789, "epoch": 0.12228571428571429, "grad_norm": 0.21747052669525146, "kl": 0.0013761520385742188, "lambda_div_used": 0.5, "learning_rate": 9.648384182148252e-07, "loss": 0.0001, "reward": -0.13875769823789597, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13875769823789597, "reward_after_std": 0.6199061535298824, "reward_before_mean": 0.37559359334409237, "reward_before_std": 0.5839899033308029, "reward_change_max": 0.0, "reward_change_mean": -0.5143513064831495, "reward_change_min": -0.8711799457669258, "reward_change_std": 0.36477554962038994, "reward_std": 0.6199061721563339, "rewards/cosine_scaled_reward": -0.062203213572502136, "rewards/format_reward": 0.5000000074505806, "step": 107 }, { "advantage_max": 1.8762633353471756, "advantage_mean": 2.483527827834564e-09, "advantage_min": -0.8940647393465042, "advantage_std": 0.999830886721611, "completion_length": 2561.500030517578, "epoch": 0.12342857142857143, "grad_norm": 0.23256301879882812, "kl": 0.00116729736328125, "lambda_div_used": 0.5, "learning_rate": 9.636109026648554e-07, "loss": 0.0, "reward": 0.07393225142732263, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.07393225142732263, "reward_after_std": 0.7953523583710194, "reward_before_mean": 0.7025719881057739, "reward_before_std": 0.7550234757363796, "reward_change_max": 0.0, "reward_change_mean": -0.6286397371441126, "reward_change_min": -1.0530790612101555, "reward_change_std": 0.43232874386012554, "reward_std": 0.7953523695468903, "rewards/cosine_scaled_reward": 0.059619318693876266, "rewards/format_reward": 0.5833333507180214, "step": 108 }, { "advantage_max": 1.9162142127752304, "advantage_mean": 1.7384688799637615e-08, "advantage_min": -0.7551117762923241, "advantage_std": 0.9998111501336098, "completion_length": 3017.2500762939453, "epoch": 0.12457142857142857, "grad_norm": 0.2104494720697403, "kl": 0.0007964372634887695, "lambda_div_used": 0.5, "learning_rate": 9.623632283030077e-07, "loss": 0.0, "reward": -0.177821128629148, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.177821128629148, "reward_after_std": 0.7017932273447514, "reward_before_mean": 0.26592331333085895, "reward_before_std": 0.6155474036931992, "reward_change_max": 0.000747285783290863, "reward_change_mean": -0.44374443776905537, "reward_change_min": -0.8128639236092567, "reward_change_std": 0.3069191947579384, "reward_std": 0.7017932571470737, "rewards/cosine_scaled_reward": -0.03370502591133118, "rewards/format_reward": 0.3333333395421505, "step": 109 }, { "advantage_max": 1.960287183523178, "advantage_mean": 1.862645193639878e-08, "advantage_min": -0.7720007188618183, "advantage_std": 0.9998722448945045, "completion_length": 2521.1458740234375, "epoch": 0.12571428571428572, "grad_norm": 0.26947009563446045, "kl": 0.0013470649719238281, "lambda_div_used": 0.5, "learning_rate": 9.610954559391704e-07, "loss": 0.0001, "reward": -0.05872512166388333, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05872512166388333, "reward_after_std": 1.0253377817571163, "reward_before_mean": 0.3726255651563406, "reward_before_std": 0.9357779249548912, "reward_change_max": 0.0007335245609283447, "reward_change_mean": -0.4313506744801998, "reward_change_min": -0.8192849457263947, "reward_change_std": 0.31888777762651443, "reward_std": 1.0253378376364708, "rewards/cosine_scaled_reward": -0.07410389324650168, "rewards/format_reward": 0.5208333469927311, "step": 110 }, { "advantage_max": 1.8416922390460968, "advantage_mean": 2.483526972962835e-08, "advantage_min": -0.9220654144883156, "advantage_std": 0.9998046904802322, "completion_length": 2686.2083587646484, "epoch": 0.12685714285714286, "grad_norm": 0.22409984469413757, "kl": 0.0019087791442871094, "lambda_div_used": 0.5, "learning_rate": 9.598076473627796e-07, "loss": 0.0001, "reward": -0.12221940292511135, "reward_advantage_correlation": 0.9999999999999994, "reward_after_mean": -0.12221940292511135, "reward_after_std": 0.7193337231874466, "reward_before_mean": 0.3737189192324877, "reward_before_std": 0.7402253746986389, "reward_change_max": 0.0015523731708526611, "reward_change_mean": -0.4959383327513933, "reward_change_min": -1.015930712223053, "reward_change_std": 0.4064535070210695, "reward_std": 0.7193337418138981, "rewards/cosine_scaled_reward": -0.04230721411295235, "rewards/format_reward": 0.4583333544433117, "step": 111 }, { "advantage_max": 1.9122427105903625, "advantage_mean": 5.277494635747004e-09, "advantage_min": -0.794686496257782, "advantage_std": 0.9998061284422874, "completion_length": 2904.125045776367, "epoch": 0.128, "grad_norm": 0.19855168461799622, "kl": 0.0010465309023857117, "lambda_div_used": 0.5, "learning_rate": 9.58499865339809e-07, "loss": 0.0, "reward": -0.17366264760494232, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17366264760494232, "reward_after_std": 0.734580010175705, "reward_before_mean": 0.26212383183883503, "reward_before_std": 0.6679606847465038, "reward_change_max": 0.0, "reward_change_mean": -0.4357864987105131, "reward_change_min": -0.8571107387542725, "reward_change_std": 0.3221443220973015, "reward_std": 0.7345800176262856, "rewards/cosine_scaled_reward": -0.035604753997176886, "rewards/format_reward": 0.33333334140479565, "step": 112 }, { "advantage_max": 1.8980989009141922, "advantage_mean": 4.967054101356894e-09, "advantage_min": -0.860293336212635, "advantage_std": 0.9998377710580826, "completion_length": 2528.7708740234375, "epoch": 0.12914285714285714, "grad_norm": 0.32358023524284363, "kl": 0.0017886161804199219, "lambda_div_used": 0.5, "learning_rate": 9.571721736097088e-07, "loss": 0.0001, "reward": -0.22631652595009655, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22631652595009655, "reward_after_std": 0.7506244815886021, "reward_before_mean": 0.1606158297508955, "reward_before_std": 0.7282139845192432, "reward_change_max": 0.0006383880972862244, "reward_change_mean": -0.3869323618710041, "reward_change_min": -0.7756957449018955, "reward_change_std": 0.29775483161211014, "reward_std": 0.7506244964897633, "rewards/cosine_scaled_reward": -0.15927542932331562, "rewards/format_reward": 0.4791666753590107, "step": 113 }, { "advantage_max": 1.9110632240772247, "advantage_mean": 2.6542694486764162e-08, "advantage_min": -0.771735567599535, "advantage_std": 0.9997791200876236, "completion_length": 2564.1666870117188, "epoch": 0.13028571428571428, "grad_norm": 0.20324108004570007, "kl": 0.0022649765014648438, "lambda_div_used": 0.5, "learning_rate": 9.55824636882301e-07, "loss": 0.0001, "reward": -0.28347105346620083, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.28347105346620083, "reward_after_std": 0.6658044643700123, "reward_before_mean": 0.08999151457101107, "reward_before_std": 0.6686059813946486, "reward_change_max": 0.0010268613696098328, "reward_change_mean": -0.37346256989985704, "reward_change_min": -0.8506891429424286, "reward_change_std": 0.32815066166222095, "reward_std": 0.665804473683238, "rewards/cosine_scaled_reward": -0.19458758272230625, "rewards/format_reward": 0.4791666753590107, "step": 114 }, { "advantage_max": 1.866062507033348, "advantage_mean": 2.2351741513215728e-08, "advantage_min": -0.8335398361086845, "advantage_std": 0.999809741973877, "completion_length": 2849.583366394043, "epoch": 0.13142857142857142, "grad_norm": 0.176808163523674, "kl": 0.002234935760498047, "lambda_div_used": 0.5, "learning_rate": 9.54457320834625e-07, "loss": 0.0001, "reward": -0.14792929776012897, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14792929776012897, "reward_after_std": 0.7658665478229523, "reward_before_mean": 0.31146786167664686, "reward_before_std": 0.7670408114790916, "reward_change_max": 0.0006662756204605103, "reward_change_mean": -0.45939717441797256, "reward_change_min": -0.956385251134634, "reward_change_std": 0.38262984342873096, "reward_std": 0.7658665888011456, "rewards/cosine_scaled_reward": -0.052599404007196426, "rewards/format_reward": 0.4166666679084301, "step": 115 }, { "advantage_max": 1.8941835165023804, "advantage_mean": 7.202228013980516e-08, "advantage_min": -0.8339600563049316, "advantage_std": 0.999761626124382, "completion_length": 3221.0625, "epoch": 0.13257142857142856, "grad_norm": 0.18061460554599762, "kl": 0.0020515918731689453, "lambda_div_used": 0.5, "learning_rate": 9.530702921077358e-07, "loss": 0.0001, "reward": -0.42343843169510365, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.42343843169510365, "reward_after_std": 0.6135209687054157, "reward_before_mean": -0.15599404089152813, "reward_before_std": 0.6033586822450161, "reward_change_max": 0.0025794655084609985, "reward_change_mean": -0.2674443628638983, "reward_change_min": -0.5305989198386669, "reward_change_std": 0.21838054060935974, "reward_std": 0.6135209947824478, "rewards/cosine_scaled_reward": -0.1717470269650221, "rewards/format_reward": 0.18750000186264515, "step": 116 }, { "advantage_max": 1.8966728001832962, "advantage_mean": -1.7384688688615313e-08, "advantage_min": -0.7671744376420975, "advantage_std": 0.9998082593083382, "completion_length": 3019.7291870117188, "epoch": 0.1337142857142857, "grad_norm": 0.20431950688362122, "kl": 0.0031223297119140625, "lambda_div_used": 0.5, "learning_rate": 9.516636183034564e-07, "loss": 0.0001, "reward": -0.38849915959872305, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.38849915959872305, "reward_after_std": 0.7110750675201416, "reward_before_mean": -0.1264194727409631, "reward_before_std": 0.6984617970883846, "reward_change_max": 0.0021105334162712097, "reward_change_mean": -0.26207969430834055, "reward_change_min": -0.6101348102092743, "reward_change_std": 0.24454447254538536, "reward_std": 0.7110750749707222, "rewards/cosine_scaled_reward": -0.1986264120787382, "rewards/format_reward": 0.2708333358168602, "step": 117 }, { "advantage_max": 1.8363988399505615, "advantage_mean": 2.949188337986186e-08, "advantage_min": -0.863525852560997, "advantage_std": 0.999841958284378, "completion_length": 2949.437515258789, "epoch": 0.13485714285714287, "grad_norm": 0.18025439977645874, "kl": 0.001627206802368164, "lambda_div_used": 0.5, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": 0.07480202615261078, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07480202615261078, "reward_after_std": 0.920928593724966, "reward_before_mean": 0.6708409860730171, "reward_before_std": 0.9098874349147081, "reward_change_max": 0.0014739260077476501, "reward_change_mean": -0.5960389524698257, "reward_change_min": -1.1508560217916965, "reward_change_std": 0.481814730912447, "reward_std": 0.920928630977869, "rewards/cosine_scaled_reward": 0.12708715675398707, "rewards/format_reward": 0.41666666977107525, "step": 118 }, { "advantage_max": 1.8390760868787766, "advantage_mean": -6.208820124697922e-10, "advantage_min": -0.949719063937664, "advantage_std": 0.9998083710670471, "completion_length": 2325.479202270508, "epoch": 0.136, "grad_norm": 0.2877277135848999, "kl": 0.0034942626953125, "lambda_div_used": 0.5, "learning_rate": 9.487916106540465e-07, "loss": 0.0001, "reward": -0.03717762790620327, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03717762790620327, "reward_after_std": 0.7488083764910698, "reward_before_mean": 0.5156388609902933, "reward_before_std": 0.7618674747645855, "reward_change_max": 0.0008146613836288452, "reward_change_mean": -0.5528164934366941, "reward_change_min": -1.0835012346506119, "reward_change_std": 0.4235940780490637, "reward_std": 0.7488083802163601, "rewards/cosine_scaled_reward": -0.013013919815421104, "rewards/format_reward": 0.541666679084301, "step": 119 }, { "advantage_max": 1.9311043322086334, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.7216290608048439, "advantage_std": 0.9998380541801453, "completion_length": 2253.5417137145996, "epoch": 0.13714285714285715, "grad_norm": 0.22529254853725433, "kl": 0.0031099319458007812, "lambda_div_used": 0.5, "learning_rate": 9.473264167865171e-07, "loss": 0.0001, "reward": -0.043658461421728134, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.043658461421728134, "reward_after_std": 0.8314470946788788, "reward_before_mean": 0.4670662308926694, "reward_before_std": 0.7415252029895782, "reward_change_max": 0.0007584840059280396, "reward_change_mean": -0.5107246488332748, "reward_change_min": -0.9488695897161961, "reward_change_std": 0.36425756290555, "reward_std": 0.8314471282064915, "rewards/cosine_scaled_reward": -0.058133574202656746, "rewards/format_reward": 0.5833333414047956, "step": 120 }, { "advantage_max": 1.9516853392124176, "advantage_mean": -1.4280279514444771e-08, "advantage_min": -0.6656368263065815, "advantage_std": 0.9998772144317627, "completion_length": 1376.1250305175781, "epoch": 0.1382857142857143, "grad_norm": 0.2774488627910614, "kl": 0.002851724624633789, "lambda_div_used": 0.5, "learning_rate": 9.458418577899774e-07, "loss": 0.0001, "reward": 0.26350086531601846, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26350086531601846, "reward_after_std": 0.9549853205680847, "reward_before_mean": 0.9884882047772408, "reward_before_std": 0.7882588542997837, "reward_change_max": 0.0, "reward_change_mean": -0.7249873355031013, "reward_change_min": -1.2277339547872543, "reward_change_std": 0.44961426220834255, "reward_std": 0.9549853503704071, "rewards/cosine_scaled_reward": 0.05674408434424549, "rewards/format_reward": 0.8750000111758709, "step": 121 }, { "advantage_max": 1.8947256356477737, "advantage_mean": -9.93410742555767e-09, "advantage_min": -0.781624436378479, "advantage_std": 0.99982900172472, "completion_length": 2792.458335876465, "epoch": 0.13942857142857143, "grad_norm": 0.25618037581443787, "kl": 0.001964092254638672, "lambda_div_used": 0.5, "learning_rate": 9.443380060197385e-07, "loss": 0.0001, "reward": -0.08971790038049221, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.08971790038049221, "reward_after_std": 0.8048688769340515, "reward_before_mean": 0.3975519463419914, "reward_before_std": 0.7465364970266819, "reward_change_max": 0.0019213780760765076, "reward_change_mean": -0.4872698709368706, "reward_change_min": -0.9815921522676945, "reward_change_std": 0.37444521114230156, "reward_std": 0.8048688843846321, "rewards/cosine_scaled_reward": -0.019974021706730127, "rewards/format_reward": 0.4375, "step": 122 }, { "advantage_max": 1.8762762248516083, "advantage_mean": -6.208817127095756e-09, "advantage_min": -0.8266568705439568, "advantage_std": 0.9997798949480057, "completion_length": 2543.937515258789, "epoch": 0.14057142857142857, "grad_norm": 0.1804003268480301, "kl": 0.0017232894897460938, "lambda_div_used": 0.5, "learning_rate": 9.428149347714143e-07, "loss": 0.0001, "reward": -0.2562871566042304, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2562871566042304, "reward_after_std": 0.6744464114308357, "reward_before_mean": 0.13696255098329857, "reward_before_std": 0.6849313788115978, "reward_change_max": 0.0006846264004707336, "reward_change_mean": -0.39324971940368414, "reward_change_min": -0.8639900349080563, "reward_change_std": 0.3413631683215499, "reward_std": 0.6744464132934809, "rewards/cosine_scaled_reward": -0.17110206559300423, "rewards/format_reward": 0.47916667349636555, "step": 123 }, { "advantage_max": 1.9491935968399048, "advantage_mean": 1.8626452713554897e-08, "advantage_min": -0.7547185383737087, "advantage_std": 0.9998470917344093, "completion_length": 2096.875030517578, "epoch": 0.1417142857142857, "grad_norm": 0.24292317032814026, "kl": 0.009019851684570312, "lambda_div_used": 0.5, "learning_rate": 9.412727182773486e-07, "loss": 0.0004, "reward": 0.0013284431770443916, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0013284431770443916, "reward_after_std": 0.912301491945982, "reward_before_mean": 0.5245756283402443, "reward_before_std": 0.841710695065558, "reward_change_max": 0.0007164850831031799, "reward_change_mean": -0.5232471814379096, "reward_change_min": -0.9632034972310066, "reward_change_std": 0.37779437424615026, "reward_std": 0.9123015441000462, "rewards/cosine_scaled_reward": -0.08146218955516815, "rewards/format_reward": 0.687500013038516, "step": 124 }, { "advantage_max": 1.9068433791399002, "advantage_mean": 5.285255755271834e-08, "advantage_min": -0.7247280701994896, "advantage_std": 0.999810703098774, "completion_length": 2890.5208587646484, "epoch": 0.14285714285714285, "grad_norm": 0.17092150449752808, "kl": 0.001659393310546875, "lambda_div_used": 0.5, "learning_rate": 9.397114317029974e-07, "loss": 0.0001, "reward": -0.09609029325656593, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09609029325656593, "reward_after_std": 0.7903048172593117, "reward_before_mean": 0.38883402571082115, "reward_before_std": 0.7232473976910114, "reward_change_max": 0.0020884498953819275, "reward_change_mean": -0.48492429591715336, "reward_change_min": -0.9192905463278294, "reward_change_std": 0.3599092774093151, "reward_std": 0.790304858237505, "rewards/cosine_scaled_reward": 0.02775033819489181, "rewards/format_reward": 0.3333333358168602, "step": 125 }, { "advantage_max": 1.9119371473789215, "advantage_mean": -1.8626451603331873e-08, "advantage_min": -0.8382847681641579, "advantage_std": 0.9998529180884361, "completion_length": 2823.1458740234375, "epoch": 0.144, "grad_norm": 0.18794958293437958, "kl": 0.001481771469116211, "lambda_div_used": 0.5, "learning_rate": 9.381311511432658e-07, "loss": 0.0001, "reward": -0.1438035280443728, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1438035280443728, "reward_after_std": 0.842867836356163, "reward_before_mean": 0.2903837040066719, "reward_before_std": 0.84921033680439, "reward_change_max": 0.0, "reward_change_mean": -0.4341872353106737, "reward_change_min": -0.8519404716789722, "reward_change_std": 0.36564615555107594, "reward_std": 0.8428678773343563, "rewards/cosine_scaled_reward": -0.0839748103171587, "rewards/format_reward": 0.4583333432674408, "step": 126 }, { "advantage_max": 1.8188635557889938, "advantage_mean": 3.973643153409867e-08, "advantage_min": -0.9549058154225349, "advantage_std": 0.9997655674815178, "completion_length": 2931.812530517578, "epoch": 0.14514285714285713, "grad_norm": 0.17654724419116974, "kl": 0.0027322769165039062, "lambda_div_used": 0.5, "learning_rate": 9.36531953618799e-07, "loss": 0.0001, "reward": -0.4510874133557081, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.4510874133557081, "reward_after_std": 0.4549293704330921, "reward_before_mean": -0.14654914196580648, "reward_before_std": 0.4743167757987976, "reward_change_max": 0.002840563654899597, "reward_change_mean": -0.30453827790915966, "reward_change_min": -0.6247300133109093, "reward_change_std": 0.25821792520582676, "reward_std": 0.4549293927848339, "rewards/cosine_scaled_reward": -0.26077457517385483, "rewards/format_reward": 0.37500000931322575, "step": 127 }, { "advantage_max": 1.844459444284439, "advantage_mean": 4.097819439330408e-08, "advantage_min": -1.0102382525801659, "advantage_std": 0.9997452944517136, "completion_length": 2836.083366394043, "epoch": 0.1462857142857143, "grad_norm": 0.17293789982795715, "kl": 0.003336310386657715, "lambda_div_used": 0.5, "learning_rate": 9.34913917072228e-07, "loss": 0.0001, "reward": 0.02156132459640503, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.02156132459640503, "reward_after_std": 0.7180147506296635, "reward_before_mean": 0.6425447445362806, "reward_before_std": 0.6847621034830809, "reward_change_max": 0.0016322359442710876, "reward_change_mean": -0.6209834087640047, "reward_change_min": -1.057206965982914, "reward_change_std": 0.44490036740899086, "reward_std": 0.7180147618055344, "rewards/cosine_scaled_reward": 0.10252236761152744, "rewards/format_reward": 0.43750000558793545, "step": 128 }, { "advantage_max": 1.8426750153303146, "advantage_mean": 2.7318795670083773e-08, "advantage_min": -0.8593291789293289, "advantage_std": 0.99974674731493, "completion_length": 3238.3125, "epoch": 0.14742857142857144, "grad_norm": 0.17253991961479187, "kl": 0.002398967742919922, "lambda_div_used": 0.5, "learning_rate": 9.332771203643714e-07, "loss": 0.0001, "reward": -0.35933883488178253, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.35933883488178253, "reward_after_std": 0.5568939186632633, "reward_before_mean": -0.006960507482290268, "reward_before_std": 0.5875996574759483, "reward_change_max": 0.000954747200012207, "reward_change_mean": -0.3523783441632986, "reward_change_min": -0.7693795971572399, "reward_change_std": 0.30717823654413223, "reward_std": 0.5568939335644245, "rewards/cosine_scaled_reward": -0.11806358769536018, "rewards/format_reward": 0.22916666977107525, "step": 129 }, { "advantage_max": 1.8513092994689941, "advantage_mean": 6.270905628102952e-08, "advantage_min": -0.8922067731618881, "advantage_std": 0.9997543543577194, "completion_length": 2752.5625534057617, "epoch": 0.14857142857142858, "grad_norm": 0.17327053844928741, "kl": 0.0019626617431640625, "lambda_div_used": 0.5, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": -0.37150960601866245, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.37150960601866245, "reward_after_std": 0.5050280168652534, "reward_before_mean": -0.019790776073932648, "reward_before_std": 0.49720101431012154, "reward_change_max": 0.0008353963494300842, "reward_change_mean": -0.35171885043382645, "reward_change_min": -0.6718562357127666, "reward_change_std": 0.2648839596658945, "reward_std": 0.5050280280411243, "rewards/cosine_scaled_reward": -0.18697872385382652, "rewards/format_reward": 0.35416667722165585, "step": 130 }, { "advantage_max": 1.838008999824524, "advantage_mean": 2.1265198602016255e-08, "advantage_min": -0.9664673283696175, "advantage_std": 0.9998275339603424, "completion_length": 2747.7083740234375, "epoch": 0.14971428571428572, "grad_norm": 0.20217643678188324, "kl": 0.0033931732177734375, "lambda_div_used": 0.5, "learning_rate": 9.299475664759068e-07, "loss": 0.0001, "reward": 0.06368017196655273, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06368017196655273, "reward_after_std": 0.761361688375473, "reward_before_mean": 0.7030438333749771, "reward_before_std": 0.7445781577844173, "reward_change_max": 0.00028298795223236084, "reward_change_mean": -0.639363644644618, "reward_change_min": -1.077939011156559, "reward_change_std": 0.46623699367046356, "reward_std": 0.7613617070019245, "rewards/cosine_scaled_reward": 0.10152191109955311, "rewards/format_reward": 0.5000000167638063, "step": 131 }, { "advantage_max": 1.901751920580864, "advantage_mean": 1.5832484268063496e-08, "advantage_min": -0.7725710570812225, "advantage_std": 0.999824121594429, "completion_length": 2550.3333435058594, "epoch": 0.15085714285714286, "grad_norm": 0.18242870271205902, "kl": 0.0016298294067382812, "lambda_div_used": 0.5, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, "reward": -0.11272692680358887, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11272692680358887, "reward_after_std": 0.8790729530155659, "reward_before_mean": 0.3309657648205757, "reward_before_std": 0.8492833152413368, "reward_change_max": 0.0006166920065879822, "reward_change_mean": -0.44369266368448734, "reward_change_min": -0.9826720170676708, "reward_change_std": 0.36282812524586916, "reward_std": 0.8790729679167271, "rewards/cosine_scaled_reward": -0.05326713342219591, "rewards/format_reward": 0.43750000558793545, "step": 132 }, { "advantage_max": 1.9136138558387756, "advantage_mean": 2.4835269396561444e-08, "advantage_min": -0.8313917182385921, "advantage_std": 0.9997948184609413, "completion_length": 2903.9583740234375, "epoch": 0.152, "grad_norm": 0.23344391584396362, "kl": 0.0031654834747314453, "lambda_div_used": 0.5, "learning_rate": 9.265439410565328e-07, "loss": 0.0001, "reward": -0.3133470695465803, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.3133470695465803, "reward_after_std": 0.6490304917097092, "reward_before_mean": 0.032596178352832794, "reward_before_std": 0.5889531373977661, "reward_change_max": 0.0016025900840759277, "reward_change_mean": -0.3459432474337518, "reward_change_min": -0.5833385325968266, "reward_change_std": 0.23549414426088333, "reward_std": 0.6490305289626122, "rewards/cosine_scaled_reward": -0.16078524757176638, "rewards/format_reward": 0.35416666977107525, "step": 133 }, { "advantage_max": 1.9435906410217285, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.6701120026409626, "advantage_std": 0.9998712614178658, "completion_length": 2339.979202270508, "epoch": 0.15314285714285714, "grad_norm": 0.22259725630283356, "kl": 0.004046916961669922, "lambda_div_used": 0.5, "learning_rate": 9.248145583195447e-07, "loss": 0.0002, "reward": 0.10023209895007312, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10023209895007312, "reward_after_std": 1.0333988890051842, "reward_before_mean": 0.6736355237662792, "reward_before_std": 0.9573747776448727, "reward_change_max": 4.84660267829895e-05, "reward_change_mean": -0.5734034404158592, "reward_change_min": -1.0696075037121773, "reward_change_std": 0.4389838185161352, "reward_std": 1.033398911356926, "rewards/cosine_scaled_reward": 0.024317767238244414, "rewards/format_reward": 0.6250000037252903, "step": 134 }, { "advantage_max": 1.9079030454158783, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.7340277917683125, "advantage_std": 0.9998625218868256, "completion_length": 1714.1250381469727, "epoch": 0.15428571428571428, "grad_norm": 0.2778604030609131, "kl": 0.003810882568359375, "lambda_div_used": 0.5, "learning_rate": 9.230669076497687e-07, "loss": 0.0002, "reward": 0.16280546970665455, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16280546970665455, "reward_after_std": 0.857132826000452, "reward_before_mean": 0.8465496301651001, "reward_before_std": 0.7335850484669209, "reward_change_max": 0.0013803690671920776, "reward_change_mean": -0.6837441800162196, "reward_change_min": -1.207035694271326, "reward_change_std": 0.4760168734937906, "reward_std": 0.8571328409016132, "rewards/cosine_scaled_reward": 0.08994148019701242, "rewards/format_reward": 0.6666666734963655, "step": 135 }, { "advantage_max": 1.9024064391851425, "advantage_mean": 2.483526828633842e-09, "advantage_min": -0.8795400336384773, "advantage_std": 0.9998568445444107, "completion_length": 2289.0208740234375, "epoch": 0.15542857142857142, "grad_norm": 0.21803708374500275, "kl": 0.003295421600341797, "lambda_div_used": 0.5, "learning_rate": 9.213010742252327e-07, "loss": 0.0001, "reward": -0.003703461028635502, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.003703461028635502, "reward_after_std": 0.86224514991045, "reward_before_mean": 0.5360340485349298, "reward_before_std": 0.8398735448718071, "reward_change_max": 0.0023953020572662354, "reward_change_mean": -0.5397375160828233, "reward_change_min": -1.0290844030678272, "reward_change_std": 0.4134146720170975, "reward_std": 0.8622451946139336, "rewards/cosine_scaled_reward": -0.013232994824647903, "rewards/format_reward": 0.5625000093132257, "step": 136 }, { "advantage_max": 1.8921066671609879, "advantage_mean": 8.568168108347152e-08, "advantage_min": -0.9159591495990753, "advantage_std": 0.9996925368905067, "completion_length": 2726.250030517578, "epoch": 0.15657142857142858, "grad_norm": 0.15787829458713531, "kl": 0.0028100013732910156, "lambda_div_used": 0.5, "learning_rate": 9.195171441101668e-07, "loss": 0.0001, "reward": -0.45353410951793194, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.45353410951793194, "reward_after_std": 0.40450212359428406, "reward_before_mean": -0.14005180727690458, "reward_before_std": 0.3673751577734947, "reward_change_max": 0.0007824301719665527, "reward_change_mean": -0.31348231015726924, "reward_change_min": -0.5197167657315731, "reward_change_std": 0.21080828132107854, "reward_std": 0.40450213477015495, "rewards/cosine_scaled_reward": -0.24710923619568348, "rewards/format_reward": 0.3541666679084301, "step": 137 }, { "advantage_max": 1.9471510499715805, "advantage_mean": 2.4835267176115394e-09, "advantage_min": -0.7620296776294708, "advantage_std": 0.999866396188736, "completion_length": 2125.145866394043, "epoch": 0.15771428571428572, "grad_norm": 0.20413421094417572, "kl": 0.0021791458129882812, "lambda_div_used": 0.5, "learning_rate": 9.177152042508077e-07, "loss": 0.0001, "reward": 0.08020066749304533, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08020066749304533, "reward_after_std": 0.9415929988026619, "reward_before_mean": 0.6609942256473005, "reward_before_std": 0.8445362187922001, "reward_change_max": 0.0, "reward_change_mean": -0.5807935297489166, "reward_change_min": -1.122881568968296, "reward_change_std": 0.4123641811311245, "reward_std": 0.9415930137038231, "rewards/cosine_scaled_reward": -0.02366957487538457, "rewards/format_reward": 0.7083333432674408, "step": 138 }, { "advantage_max": 1.8990636467933655, "advantage_mean": 3.4148494587604716e-08, "advantage_min": -0.7599098831415176, "advantage_std": 0.9998147487640381, "completion_length": 2946.229232788086, "epoch": 0.15885714285714286, "grad_norm": 0.17725327610969543, "kl": 0.0037374496459960938, "lambda_div_used": 0.5, "learning_rate": 9.158953424711624e-07, "loss": 0.0001, "reward": -0.23691413225606084, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23691413225606084, "reward_after_std": 0.8540695495903492, "reward_before_mean": 0.11466125026345253, "reward_before_std": 0.8710706867277622, "reward_change_max": 0.0020438656210899353, "reward_change_mean": -0.351575399748981, "reward_change_min": -0.9406367465853691, "reward_change_std": 0.35900538228452206, "reward_std": 0.8540695644915104, "rewards/cosine_scaled_reward": -0.16141936974599957, "rewards/format_reward": 0.4375000037252903, "step": 139 }, { "advantage_max": 1.9330047219991684, "advantage_mean": 1.0554989604560916e-08, "advantage_min": -0.7634199261665344, "advantage_std": 0.9998235404491425, "completion_length": 2628.354202270508, "epoch": 0.16, "grad_norm": 0.2571854889392853, "kl": 0.005505561828613281, "lambda_div_used": 0.5, "learning_rate": 9.140576474687263e-07, "loss": 0.0002, "reward": -0.185523915104568, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.185523915104568, "reward_after_std": 0.6811343766748905, "reward_before_mean": 0.2555219102650881, "reward_before_std": 0.5780453644692898, "reward_change_max": 0.0009342730045318604, "reward_change_mean": -0.4410457955673337, "reward_change_min": -0.8106644526124, "reward_change_std": 0.29249521903693676, "reward_std": 0.6811344102025032, "rewards/cosine_scaled_reward": -0.10140572674572468, "rewards/format_reward": 0.45833334513008595, "step": 140 }, { "advantage_max": 1.9670456051826477, "advantage_mean": 2.3593505704688766e-08, "advantage_min": -0.6404533721506596, "advantage_std": 0.9998448416590691, "completion_length": 2478.6041870117188, "epoch": 0.16114285714285714, "grad_norm": 0.20180006325244904, "kl": 0.0043582916259765625, "lambda_div_used": 0.5, "learning_rate": 9.122022088101613e-07, "loss": 0.0002, "reward": -0.25295988253492396, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.25295988253492396, "reward_after_std": 0.8347440958023071, "reward_before_mean": 0.07825877517461777, "reward_before_std": 0.7710718885064125, "reward_change_max": 0.0029845386743545532, "reward_change_mean": -0.33121864404529333, "reward_change_min": -0.6887594200670719, "reward_change_std": 0.26015863846987486, "reward_std": 0.8347441107034683, "rewards/cosine_scaled_reward": -0.2212872877717018, "rewards/format_reward": 0.5208333432674408, "step": 141 }, { "advantage_max": 1.9127947390079498, "advantage_mean": 2.4214387661647407e-08, "advantage_min": -0.8311641737818718, "advantage_std": 0.9998164772987366, "completion_length": 2687.9375610351562, "epoch": 0.16228571428571428, "grad_norm": 0.17241254448890686, "kl": 0.0032384395599365234, "lambda_div_used": 0.5, "learning_rate": 9.103291169269299e-07, "loss": 0.0001, "reward": -0.052776604890823364, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.052776604890823364, "reward_after_std": 0.8440326936542988, "reward_before_mean": 0.4517287611961365, "reward_before_std": 0.7939337026327848, "reward_change_max": 0.0012746453285217285, "reward_change_mean": -0.5045053567737341, "reward_change_min": -0.921326007694006, "reward_change_std": 0.3809546297416091, "reward_std": 0.8440327122807503, "rewards/cosine_scaled_reward": -0.05538563430309296, "rewards/format_reward": 0.5625000055879354, "step": 142 }, { "advantage_max": 1.8830101788043976, "advantage_mean": 1.4435500572673732e-08, "advantage_min": -0.869749516248703, "advantage_std": 0.9998139664530754, "completion_length": 2302.7083740234375, "epoch": 0.16342857142857142, "grad_norm": 0.19639165699481964, "kl": 0.0040073394775390625, "lambda_div_used": 0.5, "learning_rate": 9.084384631108882e-07, "loss": 0.0002, "reward": -0.1497055273503065, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1497055273503065, "reward_after_std": 0.6513480730354786, "reward_before_mean": 0.34076992981135845, "reward_before_std": 0.6215660385787487, "reward_change_max": 0.0, "reward_change_mean": -0.4904754748567939, "reward_change_min": -0.8741833716630936, "reward_change_std": 0.34840161446481943, "reward_std": 0.6513480953872204, "rewards/cosine_scaled_reward": -0.1421150453388691, "rewards/format_reward": 0.6250000223517418, "step": 143 }, { "advantage_max": 1.9165276736021042, "advantage_mean": -1.0865431776529988e-08, "advantage_min": -0.8160995990037918, "advantage_std": 0.999792642891407, "completion_length": 2817.812530517578, "epoch": 0.16457142857142856, "grad_norm": 0.18998044729232788, "kl": 0.0037641525268554688, "lambda_div_used": 0.5, "learning_rate": 9.065303395098358e-07, "loss": 0.0002, "reward": -0.19920555595308542, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.19920555595308542, "reward_after_std": 0.7285573910921812, "reward_before_mean": 0.22228339943103492, "reward_before_std": 0.6816221624612808, "reward_change_max": 0.0012655630707740784, "reward_change_mean": -0.4214889407157898, "reward_change_min": -0.9015527628362179, "reward_change_std": 0.34488642401993275, "reward_std": 0.7285573966801167, "rewards/cosine_scaled_reward": -0.06594165321439505, "rewards/format_reward": 0.3541666716337204, "step": 144 }, { "advantage_max": 1.9259110987186432, "advantage_mean": -9.313230187046884e-10, "advantage_min": -0.7588883340358734, "advantage_std": 0.9998414367437363, "completion_length": 2125.31254196167, "epoch": 0.1657142857142857, "grad_norm": 0.2980290949344635, "kl": 0.0061130523681640625, "lambda_div_used": 0.5, "learning_rate": 9.046048391230247e-07, "loss": 0.0002, "reward": 0.04788858536630869, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04788858536630869, "reward_after_std": 0.7997053451836109, "reward_before_mean": 0.649374682456255, "reward_before_std": 0.6824689619243145, "reward_change_max": 0.0015939399600028992, "reward_change_mean": -0.601486106403172, "reward_change_min": -1.022288154810667, "reward_change_std": 0.3961034547537565, "reward_std": 0.7997053638100624, "rewards/cosine_scaled_reward": 0.0017706537619233131, "rewards/format_reward": 0.6458333395421505, "step": 145 }, { "advantage_max": 1.8907168805599213, "advantage_mean": 1.459072107579118e-08, "advantage_min": -0.8794617429375648, "advantage_std": 0.9998200982809067, "completion_length": 1808.7708740234375, "epoch": 0.16685714285714287, "grad_norm": 0.2139863818883896, "kl": 0.0027594566345214844, "lambda_div_used": 0.5, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": -0.08791144005954266, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08791144005954266, "reward_after_std": 0.6702312454581261, "reward_before_mean": 0.4437525011599064, "reward_before_std": 0.6040500663220882, "reward_change_max": 1.3709068298339844e-05, "reward_change_mean": -0.5316639486700296, "reward_change_min": -0.9739903621375561, "reward_change_std": 0.35773606039583683, "reward_std": 0.670231256633997, "rewards/cosine_scaled_reward": -0.16354042233433574, "rewards/format_reward": 0.7708333395421505, "step": 146 }, { "advantage_max": 1.939362108707428, "advantage_mean": 4.811833542728294e-09, "advantage_min": -0.8192641139030457, "advantage_std": 0.9998601749539375, "completion_length": 1971.7500534057617, "epoch": 0.168, "grad_norm": 0.2149391919374466, "kl": 0.004222869873046875, "lambda_div_used": 0.5, "learning_rate": 9.007020842191634e-07, "loss": 0.0002, "reward": 0.08826066565234214, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08826066565234214, "reward_after_std": 0.9593193866312504, "reward_before_mean": 0.6639640498906374, "reward_before_std": 0.8700052294880152, "reward_change_max": 0.0, "reward_change_mean": -0.5757033489644527, "reward_change_min": -0.9369436614215374, "reward_change_std": 0.37583223544061184, "reward_std": 0.9593194648623466, "rewards/cosine_scaled_reward": -0.0013513155281543732, "rewards/format_reward": 0.6666666772216558, "step": 147 }, { "advantage_max": 1.8952298015356064, "advantage_mean": -8.692344399818808e-09, "advantage_min": -0.8259298205375671, "advantage_std": 0.9998547807335854, "completion_length": 1961.854232788086, "epoch": 0.16914285714285715, "grad_norm": 0.2422906756401062, "kl": 0.0032510757446289062, "lambda_div_used": 0.5, "learning_rate": 8.987250199168808e-07, "loss": 0.0001, "reward": 0.02922473382204771, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.02922473382204771, "reward_after_std": 0.7920557893812656, "reward_before_mean": 0.6183740254491568, "reward_before_std": 0.7007161788642406, "reward_change_max": 0.0, "reward_change_mean": -0.589149309322238, "reward_change_min": -1.0229771696031094, "reward_change_std": 0.3872289936989546, "reward_std": 0.7920558117330074, "rewards/cosine_scaled_reward": -0.04497966240160167, "rewards/format_reward": 0.7083333414047956, "step": 148 }, { "advantage_max": 1.9160983711481094, "advantage_mean": 3.973643092347601e-08, "advantage_min": -0.8291295692324638, "advantage_std": 0.999817244708538, "completion_length": 2609.937545776367, "epoch": 0.1702857142857143, "grad_norm": 0.19904126226902008, "kl": 0.003222942352294922, "lambda_div_used": 0.5, "learning_rate": 8.967309592491052e-07, "loss": 0.0001, "reward": -0.07225888641551137, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07225888641551137, "reward_after_std": 0.8874395098537207, "reward_before_mean": 0.40166381001472473, "reward_before_std": 0.848232576623559, "reward_change_max": 7.143616676330566e-05, "reward_change_mean": -0.47392270155251026, "reward_change_min": -0.8683690465986729, "reward_change_std": 0.36992147751152515, "reward_std": 0.8874395303428173, "rewards/cosine_scaled_reward": -0.0804181108251214, "rewards/format_reward": 0.5625000074505806, "step": 149 }, { "advantage_max": 1.851120799779892, "advantage_mean": 2.6387473095468295e-08, "advantage_min": -0.9206305295228958, "advantage_std": 0.9998620450496674, "completion_length": 2259.666702270508, "epoch": 0.17142857142857143, "grad_norm": 0.235165536403656, "kl": 0.0041065216064453125, "lambda_div_used": 0.5, "learning_rate": 8.9471999940354e-07, "loss": 0.0002, "reward": 0.12295411620289087, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12295411620289087, "reward_after_std": 0.9792751409113407, "reward_before_mean": 0.7382536884397268, "reward_before_std": 0.9881106838583946, "reward_change_max": 0.0017604902386665344, "reward_change_mean": -0.6152995843440294, "reward_change_min": -1.2774429582059383, "reward_change_std": 0.5003240220248699, "reward_std": 0.9792751893401146, "rewards/cosine_scaled_reward": 0.03579352074302733, "rewards/format_reward": 0.6666666828095913, "step": 150 }, { "advantage_max": 1.860343113541603, "advantage_mean": 3.725290076417309e-09, "advantage_min": -0.9063407108187675, "advantage_std": 0.9998343735933304, "completion_length": 2302.7500534057617, "epoch": 0.17257142857142857, "grad_norm": 0.24778403341770172, "kl": 0.004438161849975586, "lambda_div_used": 0.5, "learning_rate": 8.926922383915315e-07, "loss": 0.0002, "reward": 0.07336704945191741, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07336704945191741, "reward_after_std": 0.8510355353355408, "reward_before_mean": 0.6922492235898972, "reward_before_std": 0.83472590893507, "reward_change_max": 0.0, "reward_change_mean": -0.6188821662217379, "reward_change_min": -1.1263868436217308, "reward_change_std": 0.45005420222878456, "reward_std": 0.851035550236702, "rewards/cosine_scaled_reward": 0.033624591305851936, "rewards/format_reward": 0.6250000074505806, "step": 151 }, { "advantage_max": 1.8980120420455933, "advantage_mean": 1.3659397612997282e-08, "advantage_min": -0.8315541744232178, "advantage_std": 0.9997904226183891, "completion_length": 2617.5833740234375, "epoch": 0.1737142857142857, "grad_norm": 0.25885194540023804, "kl": 0.0038526058197021484, "lambda_div_used": 0.5, "learning_rate": 8.906477750432903e-07, "loss": 0.0002, "reward": -0.24192199483513832, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.24192199483513832, "reward_after_std": 0.6966664083302021, "reward_before_mean": 0.1597299799323082, "reward_before_std": 0.7053615525364876, "reward_change_max": 0.0009815618395805359, "reward_change_mean": -0.40165199153125286, "reward_change_min": -0.8559901602566242, "reward_change_std": 0.3405442573130131, "reward_std": 0.6966664344072342, "rewards/cosine_scaled_reward": -0.1180516816675663, "rewards/format_reward": 0.39583334140479565, "step": 152 }, { "advantage_max": 1.862968534231186, "advantage_mean": 4.6566130562641916e-08, "advantage_min": -0.9057076796889305, "advantage_std": 0.9997600317001343, "completion_length": 2691.729217529297, "epoch": 0.17485714285714285, "grad_norm": 0.26509541273117065, "kl": 0.008235931396484375, "lambda_div_used": 0.5, "learning_rate": 8.88586709003076e-07, "loss": 0.0003, "reward": -0.41280866833403707, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.41280866833403707, "reward_after_std": 0.4682539440691471, "reward_before_mean": -0.08330497704446316, "reward_before_std": 0.4649371337145567, "reward_change_max": 0.0011959150433540344, "reward_change_mean": -0.3295036805793643, "reward_change_min": -0.6323241330683231, "reward_change_std": 0.25634999945759773, "reward_std": 0.4682539738714695, "rewards/cosine_scaled_reward": -0.22915249690413475, "rewards/format_reward": 0.37500000931322575, "step": 153 }, { "advantage_max": 1.8755891919136047, "advantage_mean": 1.8626452213954536e-08, "advantage_min": -0.9043065384030342, "advantage_std": 0.9998662620782852, "completion_length": 2791.416717529297, "epoch": 0.176, "grad_norm": 0.16538317501544952, "kl": 0.0028333663940429688, "lambda_div_used": 0.5, "learning_rate": 8.865091407243394e-07, "loss": 0.0001, "reward": 0.12848534993827343, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12848534993827343, "reward_after_std": 0.9189904667437077, "reward_before_mean": 0.7679711561650038, "reward_before_std": 0.8985546752810478, "reward_change_max": 0.0, "reward_change_mean": -0.6394858248531818, "reward_change_min": -1.227436114102602, "reward_change_std": 0.48260986618697643, "reward_std": 0.9189904779195786, "rewards/cosine_scaled_reward": 0.09231891017407179, "rewards/format_reward": 0.5833333488553762, "step": 154 }, { "advantage_max": 1.8813521564006805, "advantage_mean": 1.3038517820973539e-08, "advantage_min": -0.9157970994710922, "advantage_std": 0.9998358935117722, "completion_length": 2411.0625381469727, "epoch": 0.17714285714285713, "grad_norm": 0.22895239293575287, "kl": 0.0044879913330078125, "lambda_div_used": 0.5, "learning_rate": 8.844151714648274e-07, "loss": 0.0002, "reward": 0.12336456589400768, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12336456589400768, "reward_after_std": 0.8058106414973736, "reward_before_mean": 0.7900370554998517, "reward_before_std": 0.7316606715321541, "reward_change_max": 0.0, "reward_change_mean": -0.6666725035756826, "reward_change_min": -1.1328043192625046, "reward_change_std": 0.44681636057794094, "reward_std": 0.8058106563985348, "rewards/cosine_scaled_reward": 0.10335184819996357, "rewards/format_reward": 0.5833333395421505, "step": 155 }, { "advantage_max": 1.8762772977352142, "advantage_mean": 3.228584977144067e-08, "advantage_min": -0.8056568801403046, "advantage_std": 0.9997596368193626, "completion_length": 2534.2500381469727, "epoch": 0.1782857142857143, "grad_norm": 0.18885566294193268, "kl": 0.0033082962036132812, "lambda_div_used": 0.5, "learning_rate": 8.823049032816478e-07, "loss": 0.0001, "reward": -0.15194648504257202, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15194648504257202, "reward_after_std": 0.7984406873583794, "reward_before_mean": 0.2926546409726143, "reward_before_std": 0.8270257189869881, "reward_change_max": 0.00426897406578064, "reward_change_mean": -0.44460115022957325, "reward_change_min": -0.9271150156855583, "reward_change_std": 0.39101812755689025, "reward_std": 0.7984407059848309, "rewards/cosine_scaled_reward": -0.07242266833782196, "rewards/format_reward": 0.4375000037252903, "step": 156 }, { "advantage_max": 1.9505676925182343, "advantage_mean": 1.614292466367573e-08, "advantage_min": -0.7314069494605064, "advantage_std": 0.9998128190636635, "completion_length": 2602.187530517578, "epoch": 0.17942857142857144, "grad_norm": 0.24203114211559296, "kl": 0.005136966705322266, "lambda_div_used": 0.5, "learning_rate": 8.801784390262943e-07, "loss": 0.0002, "reward": -0.09863819554448128, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09863819554448128, "reward_after_std": 0.7120418101549149, "reward_before_mean": 0.40562048298306763, "reward_before_std": 0.5621872544288635, "reward_change_max": 0.00021963566541671753, "reward_change_mean": -0.5042586587369442, "reward_change_min": -0.8021534122526646, "reward_change_std": 0.307680306956172, "reward_std": 0.7120418287813663, "rewards/cosine_scaled_reward": -0.06802311120554805, "rewards/format_reward": 0.5416666716337204, "step": 157 }, { "advantage_max": 1.909734457731247, "advantage_mean": 1.2417634698280722e-08, "advantage_min": -0.8408237770199776, "advantage_std": 0.9998659491539001, "completion_length": 2733.8126220703125, "epoch": 0.18057142857142858, "grad_norm": 0.19597436487674713, "kl": 0.0046710968017578125, "lambda_div_used": 0.5, "learning_rate": 8.780358823396352e-07, "loss": 0.0002, "reward": 0.04880722239613533, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04880722239613533, "reward_after_std": 0.8823314607143402, "reward_before_mean": 0.6228008596226573, "reward_before_std": 0.8027093037962914, "reward_change_max": 0.0017058923840522766, "reward_change_mean": -0.573993619531393, "reward_change_min": -1.032454714179039, "reward_change_std": 0.4067836385220289, "reward_std": 0.8823314979672432, "rewards/cosine_scaled_reward": 0.05098374653607607, "rewards/format_reward": 0.5208333525806665, "step": 158 }, { "advantage_max": 1.8789568841457367, "advantage_mean": 4.346172643998614e-09, "advantage_min": -0.8822707831859589, "advantage_std": 0.9997832998633385, "completion_length": 2412.5208435058594, "epoch": 0.18171428571428572, "grad_norm": 0.22717958688735962, "kl": 0.0044708251953125, "lambda_div_used": 0.5, "learning_rate": 8.758773376468604e-07, "loss": 0.0002, "reward": -0.21282217151019722, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.21282217151019722, "reward_after_std": 0.5436347834765911, "reward_before_mean": 0.25775578059256077, "reward_before_std": 0.5021079070866108, "reward_change_max": 0.0008254200220108032, "reward_change_mean": -0.4705779440701008, "reward_change_min": -0.8097075633704662, "reward_change_std": 0.31645464431494474, "reward_std": 0.5436348021030426, "rewards/cosine_scaled_reward": -0.13153879530727863, "rewards/format_reward": 0.5208333432674408, "step": 159 }, { "advantage_max": 1.9354328662157059, "advantage_mean": 1.4901161637936866e-08, "advantage_min": -0.7828179746866226, "advantage_std": 0.9998691380023956, "completion_length": 2073.2500381469727, "epoch": 0.18285714285714286, "grad_norm": 0.20922227203845978, "kl": 0.0052356719970703125, "lambda_div_used": 0.5, "learning_rate": 8.737029101523929e-07, "loss": 0.0002, "reward": 0.041329525411129, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.041329525411129, "reward_after_std": 0.8853391669690609, "reward_before_mean": 0.6063848384656012, "reward_before_std": 0.8054082207381725, "reward_change_max": 0.0010421797633171082, "reward_change_mean": -0.5650553349405527, "reward_change_min": -0.9835452996194363, "reward_change_std": 0.3828981779515743, "reward_std": 0.8853391967713833, "rewards/cosine_scaled_reward": -0.03014090470969677, "rewards/format_reward": 0.6666666734963655, "step": 160 }, { "advantage_max": 1.9191998690366745, "advantage_mean": 3.414849514271623e-08, "advantage_min": -0.741759903728962, "advantage_std": 0.9998334273695946, "completion_length": 2272.854232788086, "epoch": 0.184, "grad_norm": 0.2741566002368927, "kl": 0.00377655029296875, "lambda_div_used": 0.5, "learning_rate": 8.715127058347614e-07, "loss": 0.0002, "reward": -0.12755455309525132, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.12755455309525132, "reward_after_std": 0.7911244854331017, "reward_before_mean": 0.3247405719012022, "reward_before_std": 0.7095393165946007, "reward_change_max": 0.0008605495095252991, "reward_change_mean": -0.45229510217905045, "reward_change_min": -0.8612713702023029, "reward_change_std": 0.313374862074852, "reward_std": 0.7911244966089725, "rewards/cosine_scaled_reward": -0.12929638382047415, "rewards/format_reward": 0.5833333358168602, "step": 161 }, { "advantage_max": 1.868800163269043, "advantage_mean": 3.7252905427109795e-08, "advantage_min": -0.8891083151102066, "advantage_std": 0.9998169168829918, "completion_length": 2741.6458892822266, "epoch": 0.18514285714285714, "grad_norm": 0.2599841356277466, "kl": 0.007129669189453125, "lambda_div_used": 0.5, "learning_rate": 8.693068314414344e-07, "loss": 0.0003, "reward": -0.12564774230122566, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.12564774230122566, "reward_after_std": 0.7360602058470249, "reward_before_mean": 0.3576655611395836, "reward_before_std": 0.7290375046432018, "reward_change_max": 0.00034596025943756104, "reward_change_mean": -0.4833132941275835, "reward_change_min": -0.9254280216991901, "reward_change_std": 0.37433141842484474, "reward_std": 0.7360602542757988, "rewards/cosine_scaled_reward": -0.02950056130066514, "rewards/format_reward": 0.4166666716337204, "step": 162 }, { "advantage_max": 1.8910458385944366, "advantage_mean": -1.1486311457531428e-08, "advantage_min": -0.7969113737344742, "advantage_std": 0.9998064860701561, "completion_length": 2245.729202270508, "epoch": 0.18628571428571428, "grad_norm": 0.2186397910118103, "kl": 0.005253791809082031, "lambda_div_used": 0.5, "learning_rate": 8.670853944836176e-07, "loss": 0.0002, "reward": 0.010679369792342186, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.010679369792342186, "reward_after_std": 0.683979082852602, "reward_before_mean": 0.6219610050320625, "reward_before_std": 0.5548157496377826, "reward_change_max": 0.0006725862622261047, "reward_change_mean": -0.6112816166132689, "reward_change_min": -0.9902759939432144, "reward_change_std": 0.39613597467541695, "reward_std": 0.6839791089296341, "rewards/cosine_scaled_reward": 0.019313829019665718, "rewards/format_reward": 0.5833333432674408, "step": 163 }, { "advantage_max": 1.8810230642557144, "advantage_mean": 2.1265200045306187e-08, "advantage_min": -0.8811631724238396, "advantage_std": 0.9998394995927811, "completion_length": 1964.4583587646484, "epoch": 0.18742857142857142, "grad_norm": 0.2334863245487213, "kl": 0.0047149658203125, "lambda_div_used": 0.5, "learning_rate": 8.648485032310144e-07, "loss": 0.0002, "reward": 0.07969965832307935, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07969965832307935, "reward_after_std": 0.7769861966371536, "reward_before_mean": 0.7223222488537431, "reward_before_std": 0.7593985050916672, "reward_change_max": 0.0, "reward_change_mean": -0.642622577957809, "reward_change_min": -1.1558948084712029, "reward_change_std": 0.46158906538039446, "reward_std": 0.7769861966371536, "rewards/cosine_scaled_reward": -0.0034222062677145004, "rewards/format_reward": 0.7291666828095913, "step": 164 }, { "advantage_max": 1.9029300063848495, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.8189666792750359, "advantage_std": 0.9998203292489052, "completion_length": 2241.9166946411133, "epoch": 0.18857142857142858, "grad_norm": 0.22668945789337158, "kl": 0.005153656005859375, "lambda_div_used": 0.5, "learning_rate": 8.625962667065487e-07, "loss": 0.0002, "reward": -0.16207364294677973, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.16207364294677973, "reward_after_std": 0.7425038516521454, "reward_before_mean": 0.2870515671093017, "reward_before_std": 0.7249557636678219, "reward_change_max": 0.0007998421788215637, "reward_change_mean": -0.44912520330399275, "reward_change_min": -0.9289770163595676, "reward_change_std": 0.3635756126604974, "reward_std": 0.7425038702785969, "rewards/cosine_scaled_reward": -0.12730755750089884, "rewards/format_reward": 0.541666679084301, "step": 165 }, { "advantage_max": 1.906169667840004, "advantage_mean": 1.552204276222824e-08, "advantage_min": -0.8235228583216667, "advantage_std": 0.9998261034488678, "completion_length": 2244.4583587646484, "epoch": 0.18971428571428572, "grad_norm": 0.16937896609306335, "kl": 0.0038509368896484375, "lambda_div_used": 0.5, "learning_rate": 8.603287946810513e-07, "loss": 0.0002, "reward": 0.02075265534222126, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.02075265534222126, "reward_after_std": 0.7293945774435997, "reward_before_mean": 0.6221222206950188, "reward_before_std": 0.6278940867632627, "reward_change_max": 0.0005093738436698914, "reward_change_mean": -0.6013695877045393, "reward_change_min": -1.004751831293106, "reward_change_std": 0.38687464594841003, "reward_std": 0.7293945997953415, "rewards/cosine_scaled_reward": 0.008977774530649185, "rewards/format_reward": 0.6041666697710752, "step": 166 }, { "advantage_max": 1.8805283606052399, "advantage_mean": 6.208817349140361e-09, "advantage_min": -0.9127615913748741, "advantage_std": 0.9998705834150314, "completion_length": 2170.187545776367, "epoch": 0.19085714285714286, "grad_norm": 0.19412115216255188, "kl": 0.0035266876220703125, "lambda_div_used": 0.5, "learning_rate": 8.580461976679099e-07, "loss": 0.0001, "reward": 0.07650089706294239, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07650089706294239, "reward_after_std": 0.9583298601210117, "reward_before_mean": 0.6533183455467224, "reward_before_std": 0.9510982520878315, "reward_change_max": 0.006050758063793182, "reward_change_mean": -0.5768174193799496, "reward_change_min": -1.1144284754991531, "reward_change_std": 0.4647065959870815, "reward_std": 0.9583299160003662, "rewards/cosine_scaled_reward": -0.06917417328804731, "rewards/format_reward": 0.7916666865348816, "step": 167 }, { "advantage_max": 1.8871784955263138, "advantage_mean": -8.071462831438225e-09, "advantage_min": -0.8822121098637581, "advantage_std": 0.9998352527618408, "completion_length": 2507.8334197998047, "epoch": 0.192, "grad_norm": 0.19946350157260895, "kl": 0.0037517547607421875, "lambda_div_used": 0.5, "learning_rate": 8.557485869176825e-07, "loss": 0.0002, "reward": -0.021524932235479355, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.021524932235479355, "reward_after_std": 0.8910724967718124, "reward_before_mean": 0.4931417219340801, "reward_before_std": 0.8521817494183779, "reward_change_max": 0.001464754343032837, "reward_change_mean": -0.5146666690707207, "reward_change_min": -1.010201033204794, "reward_change_std": 0.3852034341543913, "reward_std": 0.8910725526511669, "rewards/cosine_scaled_reward": -0.05551247042603791, "rewards/format_reward": 0.6041666772216558, "step": 168 }, { "advantage_max": 1.9182345271110535, "advantage_mean": -3.4769377266208323e-08, "advantage_min": -0.9020901657640934, "advantage_std": 0.999874897301197, "completion_length": 1449.6041946411133, "epoch": 0.19314285714285714, "grad_norm": 0.20374953746795654, "kl": 0.0037746429443359375, "lambda_div_used": 0.5, "learning_rate": 8.534360744126753e-07, "loss": 0.0002, "reward": 0.4316548388451338, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4316548388451338, "reward_after_std": 0.8835392966866493, "reward_before_mean": 1.3296481654979289, "reward_before_std": 0.7325374772772193, "reward_change_max": 0.0, "reward_change_mean": -0.8979933187365532, "reward_change_min": -1.3769907057285309, "reward_change_std": 0.554233618080616, "reward_std": 0.8835393264889717, "rewards/cosine_scaled_reward": 0.21690738759934902, "rewards/format_reward": 0.8958333432674408, "step": 169 }, { "advantage_max": 1.9222121238708496, "advantage_mean": 4.718701318573437e-08, "advantage_min": -0.8292897716164589, "advantage_std": 0.9997989609837532, "completion_length": 2184.1875534057617, "epoch": 0.19428571428571428, "grad_norm": 0.21339966356754303, "kl": 0.004436492919921875, "lambda_div_used": 0.5, "learning_rate": 8.511087728614862e-07, "loss": 0.0002, "reward": -0.003595355898141861, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.003595355898141861, "reward_after_std": 0.7801594603806734, "reward_before_mean": 0.56236975453794, "reward_before_std": 0.7188321929425001, "reward_change_max": 0.0028308257460594177, "reward_change_mean": -0.5659651174210012, "reward_change_min": -1.0077791698276997, "reward_change_std": 0.40986311715096235, "reward_std": 0.7801594976335764, "rewards/cosine_scaled_reward": -6.513111293315887e-05, "rewards/format_reward": 0.5625, "step": 170 }, { "advantage_max": 1.8645301908254623, "advantage_mean": 1.870406252102441e-08, "advantage_min": -0.8515758588910103, "advantage_std": 0.9998170509934425, "completion_length": 2218.6041717529297, "epoch": 0.19542857142857142, "grad_norm": 0.1987854391336441, "kl": 0.0031156539916992188, "lambda_div_used": 0.5, "learning_rate": 8.487667956935087e-07, "loss": 0.0001, "reward": 0.025004766881465912, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.025004766881465912, "reward_after_std": 0.7621033377945423, "reward_before_mean": 0.6254060752689838, "reward_before_std": 0.7110554575920105, "reward_change_max": 0.0008187741041183472, "reward_change_mean": -0.600401321426034, "reward_change_min": -1.0654651895165443, "reward_change_std": 0.4122706390917301, "reward_std": 0.7621033787727356, "rewards/cosine_scaled_reward": 0.04186970740556717, "rewards/format_reward": 0.5416666679084301, "step": 171 }, { "advantage_max": 1.8990549445152283, "advantage_mean": 2.483527605789959e-09, "advantage_min": -0.8405355215072632, "advantage_std": 0.9998717159032822, "completion_length": 2534.520866394043, "epoch": 0.19657142857142856, "grad_norm": 0.26087817549705505, "kl": 0.0061130523681640625, "lambda_div_used": 0.5, "learning_rate": 8.464102570534061e-07, "loss": 0.0002, "reward": 0.19893252104520798, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19893252104520798, "reward_after_std": 0.9219256825745106, "reward_before_mean": 0.8899438828229904, "reward_before_std": 0.8286309987306595, "reward_change_max": 0.0008333176374435425, "reward_change_mean": -0.6910113664343953, "reward_change_min": -1.1952459029853344, "reward_change_std": 0.4888560585677624, "reward_std": 0.9219257161021233, "rewards/cosine_scaled_reward": 0.1741385916247964, "rewards/format_reward": 0.5416666809469461, "step": 172 }, { "advantage_max": 1.9473352134227753, "advantage_mean": 7.450581041013038e-09, "advantage_min": -0.7159592658281326, "advantage_std": 0.9998235329985619, "completion_length": 1390.0833473205566, "epoch": 0.1977142857142857, "grad_norm": 0.2918720543384552, "kl": 0.004482269287109375, "lambda_div_used": 0.5, "learning_rate": 8.440392717955475e-07, "loss": 0.0002, "reward": -0.030134814442135394, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.030134814442135394, "reward_after_std": 0.7218679785728455, "reward_before_mean": 0.5274257343262434, "reward_before_std": 0.5976778594776988, "reward_change_max": 0.00031603872776031494, "reward_change_mean": -0.5575605668127537, "reward_change_min": -0.9151461683213711, "reward_change_std": 0.35466235876083374, "reward_std": 0.7218680009245872, "rewards/cosine_scaled_reward": -0.1321204612031579, "rewards/format_reward": 0.7916666679084301, "step": 173 }, { "advantage_max": 1.9343837201595306, "advantage_mean": 9.934107203513065e-09, "advantage_min": -0.6875580325722694, "advantage_std": 0.9998670294880867, "completion_length": 1463.7500305175781, "epoch": 0.19885714285714284, "grad_norm": 0.21818959712982178, "kl": 0.005323886871337891, "lambda_div_used": 0.5, "learning_rate": 8.416539554784089e-07, "loss": 0.0002, "reward": 0.22527608275413513, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22527608275413513, "reward_after_std": 0.9232716746628284, "reward_before_mean": 0.9317025779746473, "reward_before_std": 0.7586650950834155, "reward_change_max": 0.0010546371340751648, "reward_change_mean": -0.7064264751970768, "reward_change_min": -1.2322439178824425, "reward_change_std": 0.45952145755290985, "reward_std": 0.9232717081904411, "rewards/cosine_scaled_reward": 0.028351284796372056, "rewards/format_reward": 0.875, "step": 174 }, { "advantage_max": 1.8499791771173477, "advantage_mean": -6.2088170160734535e-09, "advantage_min": -1.020237274467945, "advantage_std": 0.999834693968296, "completion_length": 2484.833366394043, "epoch": 0.2, "grad_norm": 0.22397536039352417, "kl": 0.0039272308349609375, "lambda_div_used": 0.5, "learning_rate": 8.392544243589427e-07, "loss": 0.0002, "reward": 0.09528359724208713, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.09528359724208713, "reward_after_std": 0.7255423031747341, "reward_before_mean": 0.7738265693187714, "reward_before_std": 0.7208334431052208, "reward_change_max": 0.00027216970920562744, "reward_change_mean": -0.6785429371520877, "reward_change_min": -1.1358166001737118, "reward_change_std": 0.4691190980374813, "reward_std": 0.725542314350605, "rewards/cosine_scaled_reward": 0.07441327720880508, "rewards/format_reward": 0.6250000167638063, "step": 175 }, { "advantage_max": 1.905025526881218, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.8770438581705093, "advantage_std": 0.9998798817396164, "completion_length": 2178.458396911621, "epoch": 0.20114285714285715, "grad_norm": 0.2587198317050934, "kl": 0.00556182861328125, "lambda_div_used": 0.5, "learning_rate": 8.368407953869103e-07, "loss": 0.0002, "reward": 0.1341405614912219, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1341405614912219, "reward_after_std": 1.0044050849974155, "reward_before_mean": 0.7469082167372108, "reward_before_std": 0.9681647643446922, "reward_change_max": 0.0, "reward_change_mean": -0.6127676479518414, "reward_change_min": -1.131796881556511, "reward_change_std": 0.4701329004019499, "reward_std": 1.0044051185250282, "rewards/cosine_scaled_reward": 0.02970409602858126, "rewards/format_reward": 0.6875000074505806, "step": 176 }, { "advantage_max": 1.8849133551120758, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.8384641855955124, "advantage_std": 0.9998805969953537, "completion_length": 2451.854232788086, "epoch": 0.2022857142857143, "grad_norm": 0.24274566769599915, "kl": 0.0052585601806640625, "lambda_div_used": 0.5, "learning_rate": 8.344131861991828e-07, "loss": 0.0002, "reward": 0.02091561071574688, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.02091561071574688, "reward_after_std": 0.9315523132681847, "reward_before_mean": 0.5612329412251711, "reward_before_std": 0.9423600323498249, "reward_change_max": 0.0008768588304519653, "reward_change_mean": -0.5403173211961985, "reward_change_min": -1.0866288468241692, "reward_change_std": 0.4319887850433588, "reward_std": 0.9315523356199265, "rewards/cosine_scaled_reward": -0.031883541494607925, "rewards/format_reward": 0.625000013038516, "step": 177 }, { "advantage_max": 1.858227476477623, "advantage_mean": -3.1044086745701804e-09, "advantage_min": -0.8183949738740921, "advantage_std": 0.9998875185847282, "completion_length": 1927.0833740234375, "epoch": 0.20342857142857143, "grad_norm": 0.26487603783607483, "kl": 0.00604248046875, "lambda_div_used": 0.5, "learning_rate": 8.319717151140072e-07, "loss": 0.0002, "reward": 0.2407649210654199, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2407649210654199, "reward_after_std": 1.03031674772501, "reward_before_mean": 0.9408783931285143, "reward_before_std": 1.0674152709543705, "reward_change_max": 0.00026485323905944824, "reward_change_mean": -0.7001134771853685, "reward_change_min": -1.4129510670900345, "reward_change_std": 0.5647882856428623, "reward_std": 1.0303168073296547, "rewards/cosine_scaled_reward": 0.13710585562512279, "rewards/format_reward": 0.6666666809469461, "step": 178 }, { "advantage_max": 1.928291454911232, "advantage_mean": 1.8005570590062803e-08, "advantage_min": -0.7337657734751701, "advantage_std": 0.9998143911361694, "completion_length": 2320.0417098999023, "epoch": 0.20457142857142857, "grad_norm": 0.2769847512245178, "kl": 0.0048007965087890625, "lambda_div_used": 0.5, "learning_rate": 8.295165011252396e-07, "loss": 0.0002, "reward": -0.2737759065348655, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2737759065348655, "reward_after_std": 0.7116647139191628, "reward_before_mean": 0.0847318172454834, "reward_before_std": 0.6560190003365278, "reward_change_max": 0.0031480640172958374, "reward_change_mean": -0.3585077226161957, "reward_change_min": -0.7125654704868793, "reward_change_std": 0.2866736575961113, "reward_std": 0.7116647399961948, "rewards/cosine_scaled_reward": -0.22846743231639266, "rewards/format_reward": 0.5416666679084301, "step": 179 }, { "advantage_max": 1.9269533902406693, "advantage_mean": -2.1109979209121832e-08, "advantage_min": -0.6805262081325054, "advantage_std": 0.9998768717050552, "completion_length": 1817.520881652832, "epoch": 0.2057142857142857, "grad_norm": 0.3480105400085449, "kl": 0.0066127777099609375, "lambda_div_used": 0.5, "learning_rate": 8.270476638965461e-07, "loss": 0.0003, "reward": 0.24234669422730803, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24234669422730803, "reward_after_std": 1.059589195996523, "reward_before_mean": 0.924484197050333, "reward_before_std": 0.9491253048181534, "reward_change_max": 0.0, "reward_change_mean": -0.6821374967694283, "reward_change_min": -1.345071405172348, "reward_change_std": 0.5074834516271949, "reward_std": 1.0595892071723938, "rewards/cosine_scaled_reward": 0.10807542316615582, "rewards/format_reward": 0.7083333358168602, "step": 180 }, { "advantage_max": 1.9125166982412338, "advantage_mean": 9.934107536579972e-09, "advantage_min": -0.8180154636502266, "advantage_std": 0.9997949376702309, "completion_length": 2856.791717529297, "epoch": 0.20685714285714285, "grad_norm": 0.23662905395030975, "kl": 0.0064849853515625, "lambda_div_used": 0.5, "learning_rate": 8.245653237555705e-07, "loss": 0.0003, "reward": -0.23404993303120136, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.23404993303120136, "reward_after_std": 0.6118505820631981, "reward_before_mean": 0.1888514976017177, "reward_before_std": 0.515215652063489, "reward_change_max": 0.0014413893222808838, "reward_change_mean": -0.4229014324955642, "reward_change_min": -0.6865375861525536, "reward_change_std": 0.2758261002600193, "reward_std": 0.6118505895137787, "rewards/cosine_scaled_reward": -0.0826575867831707, "rewards/format_reward": 0.3541666753590107, "step": 181 }, { "advantage_max": 1.8652609288692474, "advantage_mean": -1.055498977109437e-08, "advantage_min": -0.9345456510782242, "advantage_std": 0.9998304173350334, "completion_length": 1919.791732788086, "epoch": 0.208, "grad_norm": 0.16495570540428162, "kl": 0.002300739288330078, "lambda_div_used": 0.5, "learning_rate": 8.220696016880687e-07, "loss": 0.0001, "reward": 0.10570523329079151, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10570523329079151, "reward_after_std": 0.7955246269702911, "reward_before_mean": 0.7623180113732815, "reward_before_std": 0.7287193089723587, "reward_change_max": 0.002306625247001648, "reward_change_mean": -0.6566128022968769, "reward_change_min": -1.0960058122873306, "reward_change_std": 0.43216282688081264, "reward_std": 0.7955246269702911, "rewards/cosine_scaled_reward": 0.016575670335441828, "rewards/format_reward": 0.7291666753590107, "step": 182 }, { "advantage_max": 1.9063877165317535, "advantage_mean": -3.7834980481932234e-08, "advantage_min": -0.8252428323030472, "advantage_std": 0.9998625591397285, "completion_length": 1387.1667022705078, "epoch": 0.20914285714285713, "grad_norm": 0.23810604214668274, "kl": 0.00701904296875, "lambda_div_used": 0.5, "learning_rate": 8.195606193320136e-07, "loss": 0.0003, "reward": 0.35889948764815927, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.35889948764815927, "reward_after_std": 0.8050801753997803, "reward_before_mean": 1.217661987990141, "reward_before_std": 0.6298352889716625, "reward_change_max": 0.0, "reward_change_mean": -0.8587625250220299, "reward_change_min": -1.343634694814682, "reward_change_std": 0.5051140710711479, "reward_std": 0.8050802126526833, "rewards/cosine_scaled_reward": 0.14008096978068352, "rewards/format_reward": 0.9375000074505806, "step": 183 }, { "advantage_max": 1.9439354538917542, "advantage_mean": -1.7384688355548406e-08, "advantage_min": -0.7739471718668938, "advantage_std": 0.9997957646846771, "completion_length": 2074.5417289733887, "epoch": 0.2102857142857143, "grad_norm": 0.2490999549627304, "kl": 0.0060520172119140625, "lambda_div_used": 0.5, "learning_rate": 8.170384989716657e-07, "loss": 0.0002, "reward": -0.20943114906549454, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.20943114906549454, "reward_after_std": 0.6082219518721104, "reward_before_mean": 0.23564279254060239, "reward_before_std": 0.49706319300457835, "reward_change_max": 0.0028117522597312927, "reward_change_mean": -0.44507398270070553, "reward_change_min": -0.7033988237380981, "reward_change_std": 0.28294625133275986, "reward_std": 0.608221959322691, "rewards/cosine_scaled_reward": -0.20509526692330837, "rewards/format_reward": 0.6458333358168602, "step": 184 }, { "advantage_max": 1.9413893222808838, "advantage_mean": 4.718701052119911e-08, "advantage_min": -0.8057427629828453, "advantage_std": 0.9997518807649612, "completion_length": 1958.3750228881836, "epoch": 0.21142857142857144, "grad_norm": 0.2674250900745392, "kl": 0.004154205322265625, "lambda_div_used": 0.5, "learning_rate": 8.145033635316128e-07, "loss": 0.0002, "reward": -0.3591794992535142, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3591794992535142, "reward_after_std": 0.5219908468425274, "reward_before_mean": -0.010549581609666348, "reward_before_std": 0.439737007021904, "reward_change_max": 0.0004829689860343933, "reward_change_mean": -0.3486299216747284, "reward_change_min": -0.5860956497490406, "reward_change_std": 0.23113016970455647, "reward_std": 0.5219908636063337, "rewards/cosine_scaled_reward": -0.3073581252247095, "rewards/format_reward": 0.6041666753590107, "step": 185 }, { "advantage_max": 1.873088613152504, "advantage_mean": 1.490116141589226e-08, "advantage_min": -0.9214615821838379, "advantage_std": 0.9998159259557724, "completion_length": 2246.875015258789, "epoch": 0.21257142857142858, "grad_norm": 0.2050282061100006, "kl": 0.00543212890625, "lambda_div_used": 0.5, "learning_rate": 8.119553365707802e-07, "loss": 0.0002, "reward": -0.07845413440372795, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07845413440372795, "reward_after_std": 0.6444421708583832, "reward_before_mean": 0.47786211501806974, "reward_before_std": 0.6074944473803043, "reward_change_max": 0.002060152590274811, "reward_change_mean": -0.556316233240068, "reward_change_min": -0.9550213851034641, "reward_change_std": 0.38717483170330524, "reward_std": 0.6444421894848347, "rewards/cosine_scaled_reward": -0.04231897369027138, "rewards/format_reward": 0.5625000055879354, "step": 186 }, { "advantage_max": 1.9475088268518448, "advantage_mean": 1.3038517154839724e-08, "advantage_min": -0.7693781480193138, "advantage_std": 0.9998098015785217, "completion_length": 1656.9375305175781, "epoch": 0.21371428571428572, "grad_norm": 0.23222263157367706, "kl": 0.005061149597167969, "lambda_div_used": 0.5, "learning_rate": 8.093945422764069e-07, "loss": 0.0002, "reward": -0.08286092977505177, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08286092977505177, "reward_after_std": 0.5727328844368458, "reward_before_mean": 0.4811726361513138, "reward_before_std": 0.43661591596901417, "reward_change_max": 0.0015158876776695251, "reward_change_mean": -0.564033567905426, "reward_change_min": -0.8926524370908737, "reward_change_std": 0.33777882531285286, "reward_std": 0.5727329030632973, "rewards/cosine_scaled_reward": -0.15524702798575163, "rewards/format_reward": 0.7916666716337204, "step": 187 }, { "advantage_max": 1.9176759123802185, "advantage_mean": -4.967054045845742e-09, "advantage_min": -0.7783575281500816, "advantage_std": 0.9998136684298515, "completion_length": 2514.562515258789, "epoch": 0.21485714285714286, "grad_norm": 0.1731446236371994, "kl": 0.0059967041015625, "lambda_div_used": 0.5, "learning_rate": 8.068211054579943e-07, "loss": 0.0002, "reward": -0.20618502353318036, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.20618502353318036, "reward_after_std": 0.7404297292232513, "reward_before_mean": 0.19920311123132706, "reward_before_std": 0.6630725599825382, "reward_change_max": 0.00040778517723083496, "reward_change_mean": -0.40538814663887024, "reward_change_min": -0.7672394849359989, "reward_change_std": 0.28316234797239304, "reward_std": 0.7404297553002834, "rewards/cosine_scaled_reward": -0.15039845742285252, "rewards/format_reward": 0.5000000074505806, "step": 188 }, { "advantage_max": 1.9747483879327774, "advantage_mean": 7.450580596923828e-09, "advantage_min": -0.6897084377706051, "advantage_std": 0.999849870800972, "completion_length": 1798.0417098999023, "epoch": 0.216, "grad_norm": 0.24236349761486053, "kl": 0.0056133270263671875, "lambda_div_used": 0.5, "learning_rate": 8.04235151541222e-07, "loss": 0.0002, "reward": -0.020142017863690853, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.020142017863690853, "reward_after_std": 0.8742521069943905, "reward_before_mean": 0.4924778901040554, "reward_before_std": 0.7583403587341309, "reward_change_max": 0.0004052966833114624, "reward_change_mean": -0.5126199284568429, "reward_change_min": -0.873991385102272, "reward_change_std": 0.3210005727596581, "reward_std": 0.8742521218955517, "rewards/cosine_scaled_reward": -0.0975110623985529, "rewards/format_reward": 0.687500013038516, "step": 189 }, { "advantage_max": 1.9481079131364822, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.6872792914509773, "advantage_std": 0.9998851120471954, "completion_length": 1400.145881652832, "epoch": 0.21714285714285714, "grad_norm": 0.22386141121387482, "kl": 0.0051937103271484375, "lambda_div_used": 0.5, "learning_rate": 8.01636806561836e-07, "loss": 0.0002, "reward": 0.2527433391660452, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2527433391660452, "reward_after_std": 0.9781098589301109, "reward_before_mean": 0.9614727329462767, "reward_before_std": 0.8016847297549248, "reward_change_max": 0.0, "reward_change_mean": -0.7087293863296509, "reward_change_min": -1.2133204266428947, "reward_change_std": 0.45133110880851746, "reward_std": 0.9781098812818527, "rewards/cosine_scaled_reward": 0.0432363604195416, "rewards/format_reward": 0.8750000149011612, "step": 190 }, { "advantage_max": 1.9580977708101273, "advantage_mean": 3.725290298461914e-09, "advantage_min": -0.6801789999008179, "advantage_std": 0.9998925402760506, "completion_length": 1348.5625534057617, "epoch": 0.21828571428571428, "grad_norm": 0.26126664876937866, "kl": 0.004961967468261719, "lambda_div_used": 0.5, "learning_rate": 7.990261971595048e-07, "loss": 0.0002, "reward": 0.320645788917318, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.320645788917318, "reward_after_std": 1.0616176091134548, "reward_before_mean": 1.060829535126686, "reward_before_std": 0.8928857175633311, "reward_change_max": 0.002303190529346466, "reward_change_mean": -0.7401837222278118, "reward_change_min": -1.3448436558246613, "reward_change_std": 0.4931131489574909, "reward_std": 1.0616176202893257, "rewards/cosine_scaled_reward": 0.07208140660077333, "rewards/format_reward": 0.9166666716337204, "step": 191 }, { "advantage_max": 1.9228132516145706, "advantage_mean": 1.179675312990014e-08, "advantage_min": -0.7693631574511528, "advantage_std": 0.9998264163732529, "completion_length": 1864.6250305175781, "epoch": 0.21942857142857142, "grad_norm": 0.21476708352565765, "kl": 0.005016326904296875, "lambda_div_used": 0.5, "learning_rate": 7.964034505716476e-07, "loss": 0.0002, "reward": -0.013189246295951307, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.013189246295951307, "reward_after_std": 0.6955150477588177, "reward_before_mean": 0.5694208387285471, "reward_before_std": 0.5938720889389515, "reward_change_max": 0.0, "reward_change_mean": -0.5826100930571556, "reward_change_min": -1.0174332857131958, "reward_change_std": 0.368743147701025, "reward_std": 0.6955150812864304, "rewards/cosine_scaled_reward": -0.12153958529233932, "rewards/format_reward": 0.8125, "step": 192 }, { "advantage_max": 1.9259341210126877, "advantage_mean": 1.1175871006408045e-08, "advantage_min": -0.7410866692662239, "advantage_std": 0.9997951835393906, "completion_length": 2657.7708587646484, "epoch": 0.22057142857142858, "grad_norm": 0.5041037201881409, "kl": 0.016092300415039062, "lambda_div_used": 0.5, "learning_rate": 7.93768694627233e-07, "loss": 0.0006, "reward": -0.32176475087180734, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32176475087180734, "reward_after_std": 0.5517542436718941, "reward_before_mean": 0.04636443965137005, "reward_before_std": 0.48256898298859596, "reward_change_max": 0.0009066537022590637, "reward_change_mean": -0.3681291975080967, "reward_change_min": -0.6095230802893639, "reward_change_std": 0.2412662087008357, "reward_std": 0.5517542473971844, "rewards/cosine_scaled_reward": -0.19556778552941978, "rewards/format_reward": 0.43750000186264515, "step": 193 }, { "advantage_max": 1.8814585208892822, "advantage_mean": -2.8560559917067962e-08, "advantage_min": -0.7467570975422859, "advantage_std": 0.999844953417778, "completion_length": 2386.416732788086, "epoch": 0.22171428571428572, "grad_norm": 0.19957709312438965, "kl": 0.006275177001953125, "lambda_div_used": 0.5, "learning_rate": 7.911220577405484e-07, "loss": 0.0003, "reward": 0.24586467817425728, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.24586467817425728, "reward_after_std": 0.8887559995055199, "reward_before_mean": 0.9962658639997244, "reward_before_std": 0.8165496792644262, "reward_change_max": 0.0, "reward_change_mean": -0.7504012230783701, "reward_change_min": -1.3230471685528755, "reward_change_std": 0.5285136736929417, "reward_std": 0.8887560218572617, "rewards/cosine_scaled_reward": 0.12313293479382992, "rewards/format_reward": 0.7500000074505806, "step": 194 }, { "advantage_max": 1.944841906428337, "advantage_mean": 1.3969838702498905e-08, "advantage_min": -0.7255363836884499, "advantage_std": 0.9998145550489426, "completion_length": 1739.8542022705078, "epoch": 0.22285714285714286, "grad_norm": 0.22262509167194366, "kl": 0.00640106201171875, "lambda_div_used": 0.5, "learning_rate": 7.884636689049422e-07, "loss": 0.0003, "reward": -0.14309865795075893, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.14309865795075893, "reward_after_std": 0.700147919356823, "reward_before_mean": 0.3267921321094036, "reward_before_std": 0.6149430330842733, "reward_change_max": 0.00010732561349868774, "reward_change_mean": -0.4698908142745495, "reward_change_min": -0.9223340824246407, "reward_change_std": 0.3288527149707079, "reward_std": 0.7001479230821133, "rewards/cosine_scaled_reward": -0.20118727069348097, "rewards/format_reward": 0.7291666753590107, "step": 195 }, { "advantage_max": 1.8364529013633728, "advantage_mean": 1.11758712839638e-08, "advantage_min": -0.9304062947630882, "advantage_std": 0.9998329728841782, "completion_length": 2679.729263305664, "epoch": 0.224, "grad_norm": 0.22063890099525452, "kl": 0.007236480712890625, "lambda_div_used": 0.5, "learning_rate": 7.857936576865356e-07, "loss": 0.0003, "reward": -0.1205627042800188, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1205627042800188, "reward_after_std": 0.7646887302398682, "reward_before_mean": 0.36539409251417965, "reward_before_std": 0.8065420165657997, "reward_change_max": 7.483363151550293e-05, "reward_change_mean": -0.48595677874982357, "reward_change_min": -0.9786637127399445, "reward_change_std": 0.40618606097996235, "reward_std": 0.7646887451410294, "rewards/cosine_scaled_reward": -0.05688631488010287, "rewards/format_reward": 0.4791666716337204, "step": 196 }, { "advantage_max": 1.9239116162061691, "advantage_mean": -9.54605661185326e-09, "advantage_min": -0.7319557182490826, "advantage_std": 0.9998846724629402, "completion_length": 1059.9167022705078, "epoch": 0.22514285714285714, "grad_norm": 0.2716602385044098, "kl": 0.005367279052734375, "lambda_div_used": 0.5, "learning_rate": 7.831121542179086e-07, "loss": 0.0002, "reward": 0.40557946916669607, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.40557946916669607, "reward_after_std": 1.0015158616006374, "reward_before_mean": 1.2390838451683521, "reward_before_std": 0.8481772616505623, "reward_change_max": 0.0, "reward_change_mean": -0.8335044011473656, "reward_change_min": -1.465621568262577, "reward_change_std": 0.5422770120203495, "reward_std": 1.0015158914029598, "rewards/cosine_scaled_reward": 0.1507919318974018, "rewards/format_reward": 0.9375, "step": 197 }, { "advantage_max": 1.911670058965683, "advantage_mean": 2.110997909809953e-08, "advantage_min": -0.8416686952114105, "advantage_std": 0.9998601600527763, "completion_length": 1591.2083587646484, "epoch": 0.22628571428571428, "grad_norm": 0.2324543595314026, "kl": 0.006961822509765625, "lambda_div_used": 0.5, "learning_rate": 7.804192891917571e-07, "loss": 0.0003, "reward": 0.2818256893660873, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2818256893660873, "reward_after_std": 0.9285234659910202, "reward_before_mean": 1.0414979457855225, "reward_before_std": 0.8359009139239788, "reward_change_max": 0.0, "reward_change_mean": -0.7596722422167659, "reward_change_min": -1.2616542540490627, "reward_change_std": 0.504207344725728, "reward_std": 0.9285234957933426, "rewards/cosine_scaled_reward": 0.10408229497261345, "rewards/format_reward": 0.8333333358168602, "step": 198 }, { "advantage_max": 1.9385438710451126, "advantage_mean": -2.1730859334212482e-09, "advantage_min": -0.7928804978728294, "advantage_std": 0.9998569265007973, "completion_length": 1582.166732788086, "epoch": 0.22742857142857142, "grad_norm": 0.22295568883419037, "kl": 0.0064563751220703125, "lambda_div_used": 0.5, "learning_rate": 7.777151938545235e-07, "loss": 0.0003, "reward": 0.047651538625359535, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.047651538625359535, "reward_after_std": 0.8108061738312244, "reward_before_mean": 0.6391786062158644, "reward_before_std": 0.6576478350907564, "reward_change_max": 0.0, "reward_change_mean": -0.591527096927166, "reward_change_min": -0.955488495528698, "reward_change_std": 0.36143919453024864, "reward_std": 0.8108061775565147, "rewards/cosine_scaled_reward": -0.1491607059724629, "rewards/format_reward": 0.9375000074505806, "step": 199 }, { "advantage_max": 1.9287515133619308, "advantage_mean": -6.208816794028849e-10, "advantage_min": -0.8186419308185577, "advantage_std": 0.999849408864975, "completion_length": 1619.4583740234375, "epoch": 0.22857142857142856, "grad_norm": 0.2281968593597412, "kl": 0.005645751953125, "lambda_div_used": 0.5, "learning_rate": 7.75e-07, "loss": 0.0002, "reward": 0.13060855865478516, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.13060855865478516, "reward_after_std": 0.787628311663866, "reward_before_mean": 0.8036444410681725, "reward_before_std": 0.6729063596576452, "reward_change_max": 0.0010903030633926392, "reward_change_mean": -0.673035865649581, "reward_change_min": -1.091851219534874, "reward_change_std": 0.4272896870970726, "reward_std": 0.7876283414661884, "rewards/cosine_scaled_reward": 0.005988870281726122, "rewards/format_reward": 0.7916666753590107, "step": 200 }, { "advantage_max": 1.9076077789068222, "advantage_mean": 1.6763806343078613e-08, "advantage_min": -0.7760433480143547, "advantage_std": 0.9998678788542747, "completion_length": 1999.1667022705078, "epoch": 0.2297142857142857, "grad_norm": 0.23005475103855133, "kl": 0.0055484771728515625, "lambda_div_used": 0.5, "learning_rate": 7.72273839962904e-07, "loss": 0.0002, "reward": 0.4647516645491123, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4647516645491123, "reward_after_std": 0.9089144691824913, "reward_before_mean": 1.3805565685033798, "reward_before_std": 0.7163506662473083, "reward_change_max": 0.0, "reward_change_mean": -0.9158048909157515, "reward_change_min": -1.4860983304679394, "reward_change_std": 0.5765415318310261, "reward_std": 0.9089145064353943, "rewards/cosine_scaled_reward": 0.2840282618999481, "rewards/format_reward": 0.8125000074505806, "step": 201 }, { "advantage_max": 1.9722920954227448, "advantage_mean": -2.0566708336389183e-08, "advantage_min": -0.6951834484934807, "advantage_std": 0.9998218566179276, "completion_length": 1457.9166946411133, "epoch": 0.23085714285714284, "grad_norm": 0.20306335389614105, "kl": 0.004856109619140625, "lambda_div_used": 0.5, "learning_rate": 7.695368466124296e-07, "loss": 0.0002, "reward": 0.2708722506649792, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2708722506649792, "reward_after_std": 0.6537715718150139, "reward_before_mean": 1.1032776683568954, "reward_before_std": 0.3559281248599291, "reward_change_max": 0.0013939812779426575, "reward_change_mean": -0.8324054088443518, "reward_change_min": -1.1846450828015804, "reward_change_std": 0.455369858071208, "reward_std": 0.6537716016173363, "rewards/cosine_scaled_reward": 0.14538880321197212, "rewards/format_reward": 0.8125, "step": 202 }, { "advantage_max": 1.9697763472795486, "advantage_mean": 1.2417631367611648e-09, "advantage_min": -0.7334681376814842, "advantage_std": 0.9998695030808449, "completion_length": 1679.4375610351562, "epoch": 0.232, "grad_norm": 0.23335202038288116, "kl": 0.0074920654296875, "lambda_div_used": 0.5, "learning_rate": 7.667891533457718e-07, "loss": 0.0003, "reward": 0.15621353359892964, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15621353359892964, "reward_after_std": 0.8448192551732063, "reward_before_mean": 0.8237780556082726, "reward_before_std": 0.6491473764181137, "reward_change_max": 0.0, "reward_change_mean": -0.6675645336508751, "reward_change_min": -1.0493612885475159, "reward_change_std": 0.399025097489357, "reward_std": 0.8448192626237869, "rewards/cosine_scaled_reward": -0.015194314531981945, "rewards/format_reward": 0.854166679084301, "step": 203 }, { "advantage_max": 1.9231015890836716, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.7674050070345402, "advantage_std": 0.9998611137270927, "completion_length": 1489.666732788086, "epoch": 0.23314285714285715, "grad_norm": 0.2765462100505829, "kl": 0.006114959716796875, "lambda_div_used": 0.5, "learning_rate": 7.640308940816239e-07, "loss": 0.0002, "reward": 0.19802786083891988, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19802786083891988, "reward_after_std": 0.8996425718069077, "reward_before_mean": 0.8953988589346409, "reward_before_std": 0.8019166607409716, "reward_change_max": 0.0, "reward_change_mean": -0.6973709799349308, "reward_change_min": -1.293586179614067, "reward_change_std": 0.4740810338407755, "reward_std": 0.8996425941586494, "rewards/cosine_scaled_reward": -0.00021725334227085114, "rewards/format_reward": 0.8958333432674408, "step": 204 }, { "advantage_max": 1.9331641048192978, "advantage_mean": -4.097819450432638e-08, "advantage_min": -0.7692599110305309, "advantage_std": 0.9998975172638893, "completion_length": 1505.895881652832, "epoch": 0.2342857142857143, "grad_norm": 0.24515070021152496, "kl": 0.0053730010986328125, "lambda_div_used": 0.5, "learning_rate": 7.612622032536507e-07, "loss": 0.0002, "reward": 0.45918071921914816, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.45918071921914816, "reward_after_std": 1.0181467235088348, "reward_before_mean": 1.3325093338498846, "reward_before_std": 0.8213769532740116, "reward_change_max": 0.0004552304744720459, "reward_change_mean": -0.8733286261558533, "reward_change_min": -1.388043962419033, "reward_change_std": 0.5551399476826191, "reward_std": 1.018146738409996, "rewards/cosine_scaled_reward": 0.2287546508014202, "rewards/format_reward": 0.8750000074505806, "step": 205 }, { "advantage_max": 1.8465500622987747, "advantage_mean": 1.4280280458134342e-08, "advantage_min": -0.9646818488836288, "advantage_std": 0.9998023062944412, "completion_length": 2294.312545776367, "epoch": 0.23542857142857143, "grad_norm": 0.2301677167415619, "kl": 0.0046977996826171875, "lambda_div_used": 0.5, "learning_rate": 7.584832158039378e-07, "loss": 0.0002, "reward": -0.29218528768979013, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.29218528768979013, "reward_after_std": 0.5265999808907509, "reward_before_mean": 0.11779676657170057, "reward_before_std": 0.5209280513226986, "reward_change_max": 0.0007386580109596252, "reward_change_mean": -0.40998202189803123, "reward_change_min": -0.7738494612276554, "reward_change_std": 0.3092892915010452, "reward_std": 0.5265999883413315, "rewards/cosine_scaled_reward": -0.2536016311496496, "rewards/format_reward": 0.625000013038516, "step": 206 }, { "advantage_max": 1.9277321547269821, "advantage_mean": -5.587935947293232e-09, "advantage_min": -0.7697168327867985, "advantage_std": 0.9998654946684837, "completion_length": 1900.0416870117188, "epoch": 0.23657142857142857, "grad_norm": 0.3748758137226105, "kl": 0.007366180419921875, "lambda_div_used": 0.5, "learning_rate": 7.556940671764124e-07, "loss": 0.0003, "reward": -0.02678732480853796, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.02678732480853796, "reward_after_std": 0.8746708072721958, "reward_before_mean": 0.4839032180607319, "reward_before_std": 0.8162506558001041, "reward_change_max": 0.0, "reward_change_mean": -0.5106905549764633, "reward_change_min": -1.037591204047203, "reward_change_std": 0.3922302946448326, "reward_std": 0.8746708072721958, "rewards/cosine_scaled_reward": -0.15388172399252653, "rewards/format_reward": 0.7916666753590107, "step": 207 }, { "advantage_max": 1.9064188599586487, "advantage_mean": 2.1109978876054925e-08, "advantage_min": -0.9026965498924255, "advantage_std": 0.9998533874750137, "completion_length": 1185.145866394043, "epoch": 0.2377142857142857, "grad_norm": 0.22525237500667572, "kl": 0.00638580322265625, "lambda_div_used": 0.5, "learning_rate": 7.528948933102438e-07, "loss": 0.0003, "reward": 0.2584262453019619, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2584262453019619, "reward_after_std": 0.7765984460711479, "reward_before_mean": 1.0444153249263763, "reward_before_std": 0.6186449788510799, "reward_change_max": 0.0, "reward_change_mean": -0.7859890535473824, "reward_change_min": -1.2380337715148926, "reward_change_std": 0.4751918613910675, "reward_std": 0.7765984572470188, "rewards/cosine_scaled_reward": 0.0534576578065753, "rewards/format_reward": 0.9375000149011612, "step": 208 }, { "advantage_max": 1.9135385006666183, "advantage_mean": -6.5192582443529545e-09, "advantage_min": -0.9110362008213997, "advantage_std": 0.9998896718025208, "completion_length": 1635.7917098999023, "epoch": 0.23885714285714285, "grad_norm": 0.2753134071826935, "kl": 0.009868621826171875, "lambda_div_used": 0.5, "learning_rate": 7.500858306332172e-07, "loss": 0.0004, "reward": 0.2827942790463567, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2827942790463567, "reward_after_std": 0.9956619068980217, "reward_before_mean": 1.0158954737707973, "reward_before_std": 0.890892380848527, "reward_change_max": 0.001428067684173584, "reward_change_mean": -0.733101200312376, "reward_change_min": -1.203175701200962, "reward_change_std": 0.5036097802221775, "reward_std": 0.9956619516015053, "rewards/cosine_scaled_reward": 0.12253105826675892, "rewards/format_reward": 0.7708333544433117, "step": 209 }, { "advantage_max": 1.9230344742536545, "advantage_mean": -1.862645149230957e-09, "advantage_min": -0.8080510422587395, "advantage_std": 0.9998234212398529, "completion_length": 1600.104232788086, "epoch": 0.24, "grad_norm": 0.21205396950244904, "kl": 0.005344390869140625, "lambda_div_used": 0.5, "learning_rate": 7.472670160550848e-07, "loss": 0.0002, "reward": 0.10949054697994143, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10949054697994143, "reward_after_std": 0.6526135057210922, "reward_before_mean": 0.8106731325387955, "reward_before_std": 0.505129705183208, "reward_change_max": 0.0018544942140579224, "reward_change_mean": -0.7011825982481241, "reward_change_min": -1.0848392620682716, "reward_change_std": 0.42527284659445286, "reward_std": 0.6526135131716728, "rewards/cosine_scaled_reward": -0.03216344257816672, "rewards/format_reward": 0.8750000055879354, "step": 210 }, { "advantage_max": 1.9566478729248047, "advantage_mean": -3.725290520506519e-09, "advantage_min": -0.6811323426663876, "advantage_std": 0.9998495057225227, "completion_length": 1779.6250610351562, "epoch": 0.24114285714285713, "grad_norm": 0.26686230301856995, "kl": 0.008592605590820312, "lambda_div_used": 0.5, "learning_rate": 7.444385869608921e-07, "loss": 0.0003, "reward": 0.23951259814202785, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23951259814202785, "reward_after_std": 0.825482502579689, "reward_before_mean": 0.990179181098938, "reward_before_std": 0.6018722131848335, "reward_change_max": 0.00048663467168807983, "reward_change_mean": -0.7506665643304586, "reward_change_min": -1.2245620265603065, "reward_change_std": 0.4667343068867922, "reward_std": 0.8254825361073017, "rewards/cosine_scaled_reward": 0.15133956633508205, "rewards/format_reward": 0.6875000037252903, "step": 211 }, { "advantage_max": 1.903679609298706, "advantage_mean": -3.104407619858307e-09, "advantage_min": -0.8234128206968307, "advantage_std": 0.9998500868678093, "completion_length": 1217.854190826416, "epoch": 0.2422857142857143, "grad_norm": 0.2410409301519394, "kl": 0.00655364990234375, "lambda_div_used": 0.5, "learning_rate": 7.416006812042827e-07, "loss": 0.0003, "reward": 0.33111227909103036, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.33111227909103036, "reward_after_std": 0.8645887561142445, "reward_before_mean": 1.150598868727684, "reward_before_std": 0.7242950107902288, "reward_change_max": 0.0, "reward_change_mean": -0.8194866478443146, "reward_change_min": -1.380204539746046, "reward_change_std": 0.5173613056540489, "reward_std": 0.8645887933671474, "rewards/cosine_scaled_reward": 0.1482161059975624, "rewards/format_reward": 0.8541666716337204, "step": 212 }, { "advantage_max": 1.8833979219198227, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.8867584019899368, "advantage_std": 0.9998794943094254, "completion_length": 1776.3750648498535, "epoch": 0.24342857142857144, "grad_norm": 0.29762017726898193, "kl": 0.008733749389648438, "lambda_div_used": 0.5, "learning_rate": 7.387534371007797e-07, "loss": 0.0003, "reward": 0.14349895459599793, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.14349895459599793, "reward_after_std": 0.9624890685081482, "reward_before_mean": 0.7755306456238031, "reward_before_std": 0.9363698288798332, "reward_change_max": 0.0, "reward_change_mean": -0.6320316605269909, "reward_change_min": -1.2054100409150124, "reward_change_std": 0.4742110073566437, "reward_std": 0.9624891020357609, "rewards/cosine_scaled_reward": -0.008068038150668144, "rewards/format_reward": 0.7916666716337204, "step": 213 }, { "advantage_max": 1.896130695939064, "advantage_mean": 3.104409507237449e-10, "advantage_min": -0.8746867999434471, "advantage_std": 0.9998463988304138, "completion_length": 2014.4792175292969, "epoch": 0.24457142857142858, "grad_norm": 0.20922031998634338, "kl": 0.007541656494140625, "lambda_div_used": 0.5, "learning_rate": 7.358969934210438e-07, "loss": 0.0003, "reward": 0.0535255391150713, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0535255391150713, "reward_after_std": 0.8194720596075058, "reward_before_mean": 0.6575473733246326, "reward_before_std": 0.756176769733429, "reward_change_max": 0.0028605982661247253, "reward_change_mean": -0.6040218118578196, "reward_change_min": -1.0416854172945023, "reward_change_std": 0.41382226534187794, "reward_std": 0.8194721043109894, "rewards/cosine_scaled_reward": -0.03580965753644705, "rewards/format_reward": 0.7291666865348816, "step": 214 }, { "advantage_max": 1.915054827928543, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.8241091445088387, "advantage_std": 0.9998098164796829, "completion_length": 1411.7500228881836, "epoch": 0.24571428571428572, "grad_norm": 0.25436699390411377, "kl": 0.00496673583984375, "lambda_div_used": 0.5, "learning_rate": 7.330314893841101e-07, "loss": 0.0002, "reward": -0.07637928635813296, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07637928635813296, "reward_after_std": 0.5911289788782597, "reward_before_mean": 0.4801142290234566, "reward_before_std": 0.47789497673511505, "reward_change_max": 0.0, "reward_change_mean": -0.5564935095608234, "reward_change_min": -0.9470963031053543, "reward_change_std": 0.33488226495683193, "reward_std": 0.5911289900541306, "rewards/cosine_scaled_reward": -0.17660956643521786, "rewards/format_reward": 0.8333333432674408, "step": 215 }, { "advantage_max": 1.8878718316555023, "advantage_mean": -1.117587122845265e-08, "advantage_min": -0.9693049490451813, "advantage_std": 0.9998598992824554, "completion_length": 1302.7292098999023, "epoch": 0.24685714285714286, "grad_norm": 0.290322870016098, "kl": 0.00699615478515625, "lambda_div_used": 0.5, "learning_rate": 7.301570646506027e-07, "loss": 0.0003, "reward": 0.3085772795602679, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3085772795602679, "reward_after_std": 0.7910725995898247, "reward_before_mean": 1.1356257870793343, "reward_before_std": 0.6947167627513409, "reward_change_max": 0.0, "reward_change_mean": -0.8270485177636147, "reward_change_min": -1.3118191435933113, "reward_change_std": 0.5104956552386284, "reward_std": 0.7910726070404053, "rewards/cosine_scaled_reward": 0.11989622749388218, "rewards/format_reward": 0.8958333395421505, "step": 216 }, { "advantage_max": 1.9119762033224106, "advantage_mean": 1.986821618338297e-08, "advantage_min": -0.7242433242499828, "advantage_std": 0.9998660087585449, "completion_length": 1618.4791946411133, "epoch": 0.248, "grad_norm": 0.21120773255825043, "kl": 0.00627899169921875, "lambda_div_used": 0.5, "learning_rate": 7.27273859315928e-07, "loss": 0.0003, "reward": 0.20981781790032983, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.20981781790032983, "reward_after_std": 0.9317884184420109, "reward_before_mean": 0.905009500682354, "reward_before_std": 0.818368062376976, "reward_change_max": 0.0008935406804084778, "reward_change_mean": -0.6951915994286537, "reward_change_min": -1.2974179834127426, "reward_change_std": 0.490953104570508, "reward_std": 0.9317884258925915, "rewards/cosine_scaled_reward": 0.056671383790671825, "rewards/format_reward": 0.7916666679084301, "step": 217 }, { "advantage_max": 1.918167695403099, "advantage_mean": 2.7939677238464355e-09, "advantage_min": -0.767826035618782, "advantage_std": 0.9998479783535004, "completion_length": 1574.7292175292969, "epoch": 0.24914285714285714, "grad_norm": 0.2458459585905075, "kl": 0.006359100341796875, "lambda_div_used": 0.5, "learning_rate": 7.243820139034464e-07, "loss": 0.0003, "reward": -0.017704853788018227, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.017704853788018227, "reward_after_std": 0.7833386063575745, "reward_before_mean": 0.5354463215917349, "reward_before_std": 0.7116248942911625, "reward_change_max": 0.0005803108215332031, "reward_change_mean": -0.5531511753797531, "reward_change_min": -1.0884820893406868, "reward_change_std": 0.38788705691695213, "reward_std": 0.7833386063575745, "rewards/cosine_scaled_reward": -0.15936017641797662, "rewards/format_reward": 0.854166679084301, "step": 218 }, { "advantage_max": 1.9283503293991089, "advantage_mean": 1.862645371275562e-09, "advantage_min": -0.8017013743519783, "advantage_std": 0.9998396635055542, "completion_length": 1339.4791946411133, "epoch": 0.2502857142857143, "grad_norm": 0.348197340965271, "kl": 0.0066318511962890625, "lambda_div_used": 0.5, "learning_rate": 7.214816693576234e-07, "loss": 0.0003, "reward": 0.24929398368112743, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24929398368112743, "reward_after_std": 0.7916696332395077, "reward_before_mean": 1.0231999158859253, "reward_before_std": 0.6326823104172945, "reward_change_max": 0.0, "reward_change_mean": -0.7739059310406446, "reward_change_min": -1.2800477743148804, "reward_change_std": 0.4866549037396908, "reward_std": 0.7916696481406689, "rewards/cosine_scaled_reward": 0.09493327140808105, "rewards/format_reward": 0.8333333432674408, "step": 219 }, { "advantage_max": 1.897721529006958, "advantage_mean": 1.6142925329809543e-08, "advantage_min": -0.8538015857338905, "advantage_std": 0.9997849836945534, "completion_length": 1803.0416831970215, "epoch": 0.25142857142857145, "grad_norm": 0.2825516164302826, "kl": 0.0066280364990234375, "lambda_div_used": 0.5, "learning_rate": 7.185729670371604e-07, "loss": 0.0003, "reward": -0.23762081807944924, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23762081807944924, "reward_after_std": 0.4904538653790951, "reward_before_mean": 0.2176198773086071, "reward_before_std": 0.4150450900197029, "reward_change_max": 0.0025578737258911133, "reward_change_mean": -0.4552406966686249, "reward_change_min": -0.7237915322184563, "reward_change_std": 0.29777046479284763, "reward_std": 0.49045388400554657, "rewards/cosine_scaled_reward": -0.2661900743842125, "rewards/format_reward": 0.7500000074505806, "step": 220 }, { "advantage_max": 1.9148018211126328, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.8129096515476704, "advantage_std": 0.9998365119099617, "completion_length": 1439.7291831970215, "epoch": 0.25257142857142856, "grad_norm": 0.20042772591114044, "kl": 0.0054264068603515625, "lambda_div_used": 0.5, "learning_rate": 7.156560487081051e-07, "loss": 0.0002, "reward": 0.16458496823906898, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16458496823906898, "reward_after_std": 0.7355330400168896, "reward_before_mean": 0.8835766538977623, "reward_before_std": 0.5434940056875348, "reward_change_max": 0.0, "reward_change_mean": -0.7189916595816612, "reward_change_min": -1.1088164262473583, "reward_change_std": 0.4312316067516804, "reward_std": 0.7355330511927605, "rewards/cosine_scaled_reward": 0.0251216241158545, "rewards/format_reward": 0.8333333432674408, "step": 221 }, { "advantage_max": 1.8958768248558044, "advantage_mean": -7.916242328320777e-09, "advantage_min": -0.9265587478876114, "advantage_std": 0.9998396784067154, "completion_length": 1538.4167022705078, "epoch": 0.2537142857142857, "grad_norm": 0.2274702936410904, "kl": 0.006367683410644531, "lambda_div_used": 0.5, "learning_rate": 7.127310565369415e-07, "loss": 0.0003, "reward": 0.12984950304962695, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12984950304962695, "reward_after_std": 0.7608728632330894, "reward_before_mean": 0.8141781985759735, "reward_before_std": 0.6723398230969906, "reward_change_max": 0.0017403960227966309, "reward_change_mean": -0.6843287535011768, "reward_change_min": -1.0775920823216438, "reward_change_std": 0.4317518901079893, "reward_std": 0.7608729153871536, "rewards/cosine_scaled_reward": 0.011255767196416855, "rewards/format_reward": 0.791666679084301, "step": 222 }, { "advantage_max": 1.967112883925438, "advantage_mean": 1.893689272058907e-08, "advantage_min": -0.8013791739940643, "advantage_std": 0.9998081922531128, "completion_length": 1757.1667022705078, "epoch": 0.25485714285714284, "grad_norm": 0.24869827926158905, "kl": 0.0062732696533203125, "lambda_div_used": 0.5, "learning_rate": 7.097981330836616e-07, "loss": 0.0003, "reward": 0.12367848050780594, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12367848050780594, "reward_after_std": 0.6536546088755131, "reward_before_mean": 0.8322453033179045, "reward_before_std": 0.45517710596323013, "reward_change_max": 0.0, "reward_change_mean": -0.7085668370127678, "reward_change_min": -1.0539904236793518, "reward_change_std": 0.39596980810165405, "reward_std": 0.6536546275019646, "rewards/cosine_scaled_reward": 0.06195598840713501, "rewards/format_reward": 0.7083333395421505, "step": 223 }, { "advantage_max": 1.9077493101358414, "advantage_mean": 1.2417635808503746e-09, "advantage_min": -0.7769346758723259, "advantage_std": 0.9998728260397911, "completion_length": 1987.6875457763672, "epoch": 0.256, "grad_norm": 0.20385780930519104, "kl": 0.0055694580078125, "lambda_div_used": 0.5, "learning_rate": 7.068574212948169e-07, "loss": 0.0002, "reward": 0.21556761115789413, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21556761115789413, "reward_after_std": 0.9941208474338055, "reward_before_mean": 0.8981098346412182, "reward_before_std": 0.9162913672626019, "reward_change_max": 0.0, "reward_change_mean": -0.682542210444808, "reward_change_min": -1.266989454627037, "reward_change_std": 0.4777718782424927, "reward_std": 0.9941208772361279, "rewards/cosine_scaled_reward": 0.032388224732130766, "rewards/format_reward": 0.8333333432674408, "step": 224 }, { "advantage_max": 1.8822802901268005, "advantage_mean": -8.692344177774203e-09, "advantage_min": -0.8069443702697754, "advantage_std": 0.9998818635940552, "completion_length": 2217.208396911621, "epoch": 0.2571428571428571, "grad_norm": 0.2534785270690918, "kl": 0.010219573974609375, "lambda_div_used": 0.5, "learning_rate": 7.039090644965509e-07, "loss": 0.0004, "reward": 0.05662869522348046, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05662869522348046, "reward_after_std": 0.9689956679940224, "reward_before_mean": 0.6146425190381706, "reward_before_std": 0.9694811850786209, "reward_change_max": 0.0, "reward_change_mean": -0.558013841509819, "reward_change_min": -1.179277814924717, "reward_change_std": 0.4618273414671421, "reward_std": 0.9689956903457642, "rewards/cosine_scaled_reward": -0.015595396980643272, "rewards/format_reward": 0.6458333432674408, "step": 225 }, { "advantage_max": 1.8899008184671402, "advantage_mean": 8.537124229768267e-09, "advantage_min": -0.8222019150853157, "advantage_std": 0.999876007437706, "completion_length": 1630.270881652832, "epoch": 0.2582857142857143, "grad_norm": 0.20453056693077087, "kl": 0.006317138671875, "lambda_div_used": 0.5, "learning_rate": 7.009532063876148e-07, "loss": 0.0003, "reward": 0.27888658829033375, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.27888658829033375, "reward_after_std": 0.9403040483593941, "reward_before_mean": 1.036810528486967, "reward_before_std": 0.8681198097765446, "reward_change_max": 0.0, "reward_change_mean": -0.7579239271581173, "reward_change_min": -1.3337569385766983, "reward_change_std": 0.5241223052144051, "reward_std": 0.9403041005134583, "rewards/cosine_scaled_reward": 0.09132191189564764, "rewards/format_reward": 0.8541666716337204, "step": 226 }, { "advantage_max": 1.971393644809723, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.6958120688796043, "advantage_std": 0.9998596981167793, "completion_length": 1262.6667098999023, "epoch": 0.25942857142857145, "grad_norm": 0.3305748999118805, "kl": 0.009073257446289062, "lambda_div_used": 0.5, "learning_rate": 6.979899910323624e-07, "loss": 0.0004, "reward": 0.11022061249241233, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11022061249241233, "reward_after_std": 0.8254881426692009, "reward_before_mean": 0.7461133226752281, "reward_before_std": 0.6367629840970039, "reward_change_max": 0.0, "reward_change_mean": -0.6358926966786385, "reward_change_min": -1.0167369693517685, "reward_change_std": 0.3645213767886162, "reward_std": 0.8254881650209427, "rewards/cosine_scaled_reward": -0.11652669706381857, "rewards/format_reward": 0.9791666716337204, "step": 227 }, { "advantage_max": 1.9378290474414825, "advantage_mean": -2.2041301228625798e-08, "advantage_min": -0.7519373595714569, "advantage_std": 0.9998505935072899, "completion_length": 1381.7292022705078, "epoch": 0.26057142857142856, "grad_norm": 0.234617680311203, "kl": 0.00640869140625, "lambda_div_used": 0.5, "learning_rate": 6.950195628537299e-07, "loss": 0.0003, "reward": 0.2944382159039378, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2944382159039378, "reward_after_std": 0.8001749962568283, "reward_before_mean": 1.0973990336060524, "reward_before_std": 0.5767837278544903, "reward_change_max": 0.0, "reward_change_mean": -0.8029608391225338, "reward_change_min": -1.2629943564534187, "reward_change_std": 0.46727374754846096, "reward_std": 0.8001750260591507, "rewards/cosine_scaled_reward": 0.142449501901865, "rewards/format_reward": 0.8125000149011612, "step": 228 }, { "advantage_max": 1.9648645520210266, "advantage_mean": -1.4901161971003773e-08, "advantage_min": -0.7115680947899818, "advantage_std": 0.9998573809862137, "completion_length": 1574.3125762939453, "epoch": 0.26171428571428573, "grad_norm": 0.2357558012008667, "kl": 0.007843017578125, "lambda_div_used": 0.5, "learning_rate": 6.920420666261961e-07, "loss": 0.0003, "reward": 0.1340332217514515, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1340332217514515, "reward_after_std": 0.8145388253033161, "reward_before_mean": 0.7931003291159868, "reward_before_std": 0.6152823623269796, "reward_change_max": 0.0, "reward_change_mean": -0.6590671092271805, "reward_change_min": -1.0245969370007515, "reward_change_std": 0.38715394400060177, "reward_std": 0.8145388327538967, "rewards/cosine_scaled_reward": -0.009699843125417829, "rewards/format_reward": 0.812500013038516, "step": 229 }, { "advantage_max": 1.8860953599214554, "advantage_mean": 1.8626452658043746e-08, "advantage_min": -0.9193001091480255, "advantage_std": 0.9998148381710052, "completion_length": 1827.3542175292969, "epoch": 0.26285714285714284, "grad_norm": 0.22835993766784668, "kl": 0.008272171020507812, "lambda_div_used": 0.5, "learning_rate": 6.890576474687263e-07, "loss": 0.0003, "reward": -0.1569704683497548, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1569704683497548, "reward_after_std": 0.6100475452840328, "reward_before_mean": 0.33352593518793583, "reward_before_std": 0.544262558221817, "reward_change_max": 0.0, "reward_change_mean": -0.49049638770520687, "reward_change_min": -0.8691147491335869, "reward_change_std": 0.3244625609368086, "reward_std": 0.6100475862622261, "rewards/cosine_scaled_reward": -0.21865370776504278, "rewards/format_reward": 0.7708333507180214, "step": 230 }, { "advantage_max": 1.9178503304719925, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.8343557715415955, "advantage_std": 0.9998718351125717, "completion_length": 1749.9166946411133, "epoch": 0.264, "grad_norm": 0.22359751164913177, "kl": 0.008632659912109375, "lambda_div_used": 0.5, "learning_rate": 6.860664508377001e-07, "loss": 0.0003, "reward": 0.22381134470924735, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22381134470924735, "reward_after_std": 0.86066984385252, "reward_before_mean": 0.9546388499438763, "reward_before_std": 0.7596081979572773, "reward_change_max": 0.0020909160375595093, "reward_change_mean": -0.7308275178074837, "reward_change_min": -1.3097406700253487, "reward_change_std": 0.4961383566260338, "reward_std": 0.8606698885560036, "rewards/cosine_scaled_reward": 0.07106942869722843, "rewards/format_reward": 0.8125000111758709, "step": 231 }, { "advantage_max": 1.9307349771261215, "advantage_mean": 7.574757443506996e-08, "advantage_min": -0.7901148796081543, "advantage_std": 0.9997728392481804, "completion_length": 1812.833381652832, "epoch": 0.2651428571428571, "grad_norm": 0.2341795712709427, "kl": 0.008274078369140625, "lambda_div_used": 0.5, "learning_rate": 6.83068622519821e-07, "loss": 0.0003, "reward": -0.07405400322750211, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07405400322750211, "reward_after_std": 0.7224782053381205, "reward_before_mean": 0.448210122063756, "reward_before_std": 0.6184751987457275, "reward_change_max": 0.00040543824434280396, "reward_change_mean": -0.5222641108557582, "reward_change_min": -0.9738937392830849, "reward_change_std": 0.34536191495135427, "reward_std": 0.7224782090634108, "rewards/cosine_scaled_reward": -0.16131161339581013, "rewards/format_reward": 0.7708333432674408, "step": 232 }, { "advantage_max": 1.9683336466550827, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.7432735189795494, "advantage_std": 0.999840646982193, "completion_length": 1129.708366394043, "epoch": 0.2662857142857143, "grad_norm": 0.3380148708820343, "kl": 0.0073413848876953125, "lambda_div_used": 0.5, "learning_rate": 6.800643086250121e-07, "loss": 0.0003, "reward": -0.01638099644333124, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.01638099644333124, "reward_after_std": 0.7068706303834915, "reward_before_mean": 0.5511052198708057, "reward_before_std": 0.5402777791023254, "reward_change_max": 0.0, "reward_change_mean": -0.5674862191081047, "reward_change_min": -0.8964772894978523, "reward_change_std": 0.3298127166926861, "reward_std": 0.7068706452846527, "rewards/cosine_scaled_reward": -0.19319739658385515, "rewards/format_reward": 0.9375000074505806, "step": 233 }, { "advantage_max": 1.9574606865644455, "advantage_mean": 3.725291075618031e-09, "advantage_min": -0.7137792631983757, "advantage_std": 0.9998077154159546, "completion_length": 1706.3542175292969, "epoch": 0.2674285714285714, "grad_norm": 0.23307549953460693, "kl": 0.0071849822998046875, "lambda_div_used": 0.5, "learning_rate": 6.770536555792944e-07, "loss": 0.0003, "reward": 0.01580604538321495, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.01580604538321495, "reward_after_std": 0.702125009149313, "reward_before_mean": 0.62279331125319, "reward_before_std": 0.5630986513569951, "reward_change_max": 0.0, "reward_change_mean": -0.6069872789084911, "reward_change_min": -0.9930330999195576, "reward_change_std": 0.39121099561452866, "reward_std": 0.7021250482648611, "rewards/cosine_scaled_reward": -0.05318668344989419, "rewards/format_reward": 0.7291666772216558, "step": 234 }, { "advantage_max": 1.9572331607341766, "advantage_mean": -8.692344399818808e-09, "advantage_min": -0.7316535785794258, "advantage_std": 0.9998847916722298, "completion_length": 1356.958381652832, "epoch": 0.26857142857142857, "grad_norm": 0.2740318775177002, "kl": 0.007434844970703125, "lambda_div_used": 0.5, "learning_rate": 6.740368101176495e-07, "loss": 0.0003, "reward": 0.38920610025525093, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.38920610025525093, "reward_after_std": 0.9113083258271217, "reward_before_mean": 1.2318087760359049, "reward_before_std": 0.6570627186447382, "reward_change_max": 0.0014033988118171692, "reward_change_mean": -0.8426027111709118, "reward_change_min": -1.267478797584772, "reward_change_std": 0.48250637575984, "reward_std": 0.9113083481788635, "rewards/cosine_scaled_reward": 0.1784043600782752, "rewards/format_reward": 0.8750000055879354, "step": 235 }, { "advantage_max": 1.9679247587919235, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.6973557993769646, "advantage_std": 0.9998724982142448, "completion_length": 1780.5417251586914, "epoch": 0.26971428571428574, "grad_norm": 0.22148916125297546, "kl": 0.0067691802978515625, "lambda_div_used": 0.5, "learning_rate": 6.710139192768694e-07, "loss": 0.0003, "reward": 0.02068489557132125, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.02068489557132125, "reward_after_std": 0.9226878210902214, "reward_before_mean": 0.5527768121100962, "reward_before_std": 0.8031711392104626, "reward_change_max": 0.0, "reward_change_mean": -0.5320919454097748, "reward_change_min": -0.9312163218855858, "reward_change_std": 0.3605116531252861, "reward_std": 0.9226878210902214, "rewards/cosine_scaled_reward": -0.10902826674282551, "rewards/format_reward": 0.770833333954215, "step": 236 }, { "advantage_max": 1.9593615680932999, "advantage_mean": -2.4835269063494536e-08, "advantage_min": -0.7497310638427734, "advantage_std": 0.9998056143522263, "completion_length": 1496.6666870117188, "epoch": 0.27085714285714285, "grad_norm": 0.21749630570411682, "kl": 0.006267547607421875, "lambda_div_used": 0.5, "learning_rate": 6.679851303883891e-07, "loss": 0.0003, "reward": 0.08538356237113476, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08538356237113476, "reward_after_std": 0.6719049979001284, "reward_before_mean": 0.7545446641743183, "reward_before_std": 0.4781134817749262, "reward_change_max": 0.000691726803779602, "reward_change_mean": -0.6691611055284739, "reward_change_min": -1.0085529759526253, "reward_change_std": 0.3762901732698083, "reward_std": 0.6719050072133541, "rewards/cosine_scaled_reward": -0.03939435165375471, "rewards/format_reward": 0.8333333432674408, "step": 237 }, { "advantage_max": 1.9590356647968292, "advantage_mean": -1.1175871450497255e-08, "advantage_min": -0.745018545538187, "advantage_std": 0.999871663749218, "completion_length": 1041.8750381469727, "epoch": 0.272, "grad_norm": 0.23885765671730042, "kl": 0.007602691650390625, "lambda_div_used": 0.5, "learning_rate": 6.649505910711058e-07, "loss": 0.0003, "reward": 0.26911926828324795, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26911926828324795, "reward_after_std": 0.8938291482627392, "reward_before_mean": 1.0215275082737207, "reward_before_std": 0.6879322770982981, "reward_change_max": 0.00012250244617462158, "reward_change_mean": -0.7524082288146019, "reward_change_min": -1.1642607524991035, "reward_change_std": 0.4326386693865061, "reward_std": 0.8938291892409325, "rewards/cosine_scaled_reward": 0.03159707225859165, "rewards/format_reward": 0.9583333432674408, "step": 238 }, { "advantage_max": 1.9281752556562424, "advantage_mean": -3.290673267208888e-08, "advantage_min": -0.7456583306193352, "advantage_std": 0.999865360558033, "completion_length": 1458.6667098999023, "epoch": 0.27314285714285713, "grad_norm": 0.21850885450839996, "kl": 0.0056705474853515625, "lambda_div_used": 0.5, "learning_rate": 6.619104492241847e-07, "loss": 0.0002, "reward": 0.46745580551214516, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.46745580551214516, "reward_after_std": 0.8389881551265717, "reward_before_mean": 1.4097450375556946, "reward_before_std": 0.6188498791307211, "reward_change_max": 0.0, "reward_change_mean": -0.942289263010025, "reward_change_min": -1.4645648337900639, "reward_change_std": 0.570222893729806, "reward_std": 0.8389881774783134, "rewards/cosine_scaled_reward": 0.2986225029453635, "rewards/format_reward": 0.8125000018626451, "step": 239 }, { "advantage_max": 1.943466305732727, "advantage_mean": 1.552204287325054e-08, "advantage_min": -0.7473693750798702, "advantage_std": 0.9998158067464828, "completion_length": 1844.145866394043, "epoch": 0.2742857142857143, "grad_norm": 0.37496596574783325, "kl": 0.011318206787109375, "lambda_div_used": 0.5, "learning_rate": 6.588648530198504e-07, "loss": 0.0005, "reward": -0.23318948596715927, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.23318948596715927, "reward_after_std": 0.6114565394818783, "reward_before_mean": 0.1918274350464344, "reward_before_std": 0.5235918611288071, "reward_change_max": 0.0006126239895820618, "reward_change_mean": -0.42501691612415016, "reward_change_min": -0.7172445729374886, "reward_change_std": 0.2740626563318074, "reward_std": 0.6114565506577492, "rewards/cosine_scaled_reward": -0.2686696262098849, "rewards/format_reward": 0.7291666846722364, "step": 240 }, { "advantage_max": 1.9376345574855804, "advantage_mean": 1.5522043428362053e-08, "advantage_min": -0.7810809761285782, "advantage_std": 0.999825157225132, "completion_length": 1782.8333587646484, "epoch": 0.2754285714285714, "grad_norm": 0.26208123564720154, "kl": 0.0093231201171875, "lambda_div_used": 0.5, "learning_rate": 6.558139508961654e-07, "loss": 0.0004, "reward": -0.16795344126876444, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.16795344126876444, "reward_after_std": 0.6143370009958744, "reward_before_mean": 0.309583880007267, "reward_before_std": 0.496045283973217, "reward_change_max": 0.0004753321409225464, "reward_change_mean": -0.47753732465207577, "reward_change_min": -0.7713849879801273, "reward_change_std": 0.2903636433184147, "reward_std": 0.6143370270729065, "rewards/cosine_scaled_reward": -0.22020806092768908, "rewards/format_reward": 0.7500000093132257, "step": 241 }, { "advantage_max": 1.9451895356178284, "advantage_mean": 1.6653345369377348e-16, "advantage_min": -0.7966984063386917, "advantage_std": 0.9998231902718544, "completion_length": 1370.9791946411133, "epoch": 0.2765714285714286, "grad_norm": 0.32628756761550903, "kl": 0.009744644165039062, "lambda_div_used": 0.5, "learning_rate": 6.527578915497951e-07, "loss": 0.0004, "reward": -0.0009593330323696136, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0009593330323696136, "reward_after_std": 0.7521246317774057, "reward_before_mean": 0.5692737740464509, "reward_before_std": 0.6212347452528775, "reward_change_max": 0.0, "reward_change_mean": -0.5702331103384495, "reward_change_min": -0.8771290257573128, "reward_change_std": 0.34795637615025043, "reward_std": 0.7521246485412121, "rewards/cosine_scaled_reward": -0.16327978996559978, "rewards/format_reward": 0.8958333432674408, "step": 242 }, { "advantage_max": 1.9557791501283646, "advantage_mean": -4.967053768289986e-09, "advantage_min": -0.7319461777806282, "advantage_std": 0.9998825341463089, "completion_length": 1414.0416946411133, "epoch": 0.2777142857142857, "grad_norm": 0.2184247523546219, "kl": 0.0065326690673828125, "lambda_div_used": 0.5, "learning_rate": 6.496968239287603e-07, "loss": 0.0003, "reward": 0.2227376624941826, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2227376624941826, "reward_after_std": 0.9276175498962402, "reward_before_mean": 0.9240134842693806, "reward_before_std": 0.7810794413089752, "reward_change_max": 0.0009024292230606079, "reward_change_mean": -0.7012757956981659, "reward_change_min": -1.248128518462181, "reward_change_std": 0.4496439266949892, "reward_std": 0.9276175945997238, "rewards/cosine_scaled_reward": 0.014090052805840969, "rewards/format_reward": 0.8958333395421505, "step": 243 }, { "advantage_max": 1.9719029814004898, "advantage_mean": -5.89837656495007e-09, "advantage_min": -0.6661188155412674, "advantage_std": 0.9999043568968773, "completion_length": 1704.1458854675293, "epoch": 0.27885714285714286, "grad_norm": 0.23143883049488068, "kl": 0.00717926025390625, "lambda_div_used": 0.5, "learning_rate": 6.466308972251785e-07, "loss": 0.0003, "reward": 0.4157985597848892, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4157985597848892, "reward_after_std": 1.1085382103919983, "reward_before_mean": 1.2161034047603607, "reward_before_std": 0.8881407734006643, "reward_change_max": 0.0004424676299095154, "reward_change_mean": -0.8003048803657293, "reward_change_min": -1.3503287062048912, "reward_change_std": 0.49696409702301025, "reward_std": 1.108538269996643, "rewards/cosine_scaled_reward": 0.20180170447565615, "rewards/format_reward": 0.8125000055879354, "step": 244 }, { "advantage_max": 1.932635858654976, "advantage_mean": -1.76951293617833e-08, "advantage_min": -0.7667308263480663, "advantage_std": 0.999868243932724, "completion_length": 1864.5208740234375, "epoch": 0.28, "grad_norm": 0.2242145985364914, "kl": 0.0067577362060546875, "lambda_div_used": 0.5, "learning_rate": 6.435602608679916e-07, "loss": 0.0003, "reward": 0.09724759729579091, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09724759729579091, "reward_after_std": 0.9333448335528374, "reward_before_mean": 0.6936495788395405, "reward_before_std": 0.8179382756352425, "reward_change_max": 0.0, "reward_change_mean": -0.5964019894599915, "reward_change_min": -1.0710543617606163, "reward_change_std": 0.4002857990562916, "reward_std": 0.9333448670804501, "rewards/cosine_scaled_reward": -0.02817522920668125, "rewards/format_reward": 0.7500000093132257, "step": 245 }, { "advantage_max": 1.9313433915376663, "advantage_mean": 1.3659398501175701e-08, "advantage_min": -0.7306452617049217, "advantage_std": 0.999847486615181, "completion_length": 1426.0000610351562, "epoch": 0.28114285714285714, "grad_norm": 0.21626295149326324, "kl": 0.007900238037109375, "lambda_div_used": 0.5, "learning_rate": 6.404850645156841e-07, "loss": 0.0003, "reward": 0.04067504871636629, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.04067504871636629, "reward_after_std": 0.7928140051662922, "reward_before_mean": 0.6401501521468163, "reward_before_std": 0.6862226165831089, "reward_change_max": 0.000470772385597229, "reward_change_mean": -0.5994750969111919, "reward_change_min": -1.0783002860844135, "reward_change_std": 0.39229152724146843, "reward_std": 0.7928140312433243, "rewards/cosine_scaled_reward": -0.12784160394221544, "rewards/format_reward": 0.8958333395421505, "step": 246 }, { "advantage_max": 1.909987896680832, "advantage_mean": 1.2417634698280722e-08, "advantage_min": -0.8228132203221321, "advantage_std": 0.9998472183942795, "completion_length": 1998.7500457763672, "epoch": 0.2822857142857143, "grad_norm": 0.23221814632415771, "kl": 0.0082244873046875, "lambda_div_used": 0.5, "learning_rate": 6.374054580489873e-07, "loss": 0.0003, "reward": -0.10598975839093328, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.10598975839093328, "reward_after_std": 0.7731035724282265, "reward_before_mean": 0.37022377736866474, "reward_before_std": 0.7434089183807373, "reward_change_max": 0.0009429380297660828, "reward_change_mean": -0.4762135464698076, "reward_change_min": -0.9990293830633163, "reward_change_std": 0.36950034089386463, "reward_std": 0.7731035761535168, "rewards/cosine_scaled_reward": -0.15863812156021595, "rewards/format_reward": 0.687500013038516, "step": 247 }, { "advantage_max": 1.9852658063173294, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -0.6675059422850609, "advantage_std": 0.9998695775866508, "completion_length": 1363.3125305175781, "epoch": 0.2834285714285714, "grad_norm": 0.3437131643295288, "kl": 0.008054733276367188, "lambda_div_used": 0.5, "learning_rate": 6.343215915635761e-07, "loss": 0.0003, "reward": 0.3367419361602515, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3367419361602515, "reward_after_std": 0.8527059704065323, "reward_before_mean": 1.1518472461029887, "reward_before_std": 0.582564564421773, "reward_change_max": 0.0, "reward_change_mean": -0.8151053376495838, "reward_change_min": -1.2186406515538692, "reward_change_std": 0.45573683083057404, "reward_std": 0.8527059927582741, "rewards/cosine_scaled_reward": 0.15925694815814495, "rewards/format_reward": 0.833333333954215, "step": 248 }, { "advantage_max": 1.9299704134464264, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.759776271879673, "advantage_std": 0.9998761713504791, "completion_length": 1248.5000457763672, "epoch": 0.2845714285714286, "grad_norm": 0.30233049392700195, "kl": 0.012414932250976562, "lambda_div_used": 0.5, "learning_rate": 6.31233615362752e-07, "loss": 0.0005, "reward": 0.4085944064427167, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4085944064427167, "reward_after_std": 0.8884996175765991, "reward_before_mean": 1.2769385250285268, "reward_before_std": 0.6949853375554085, "reward_change_max": 0.0018889382481575012, "reward_change_mean": -0.8683441504836082, "reward_change_min": -1.3353769183158875, "reward_change_std": 0.5192967671900988, "reward_std": 0.8884996548295021, "rewards/cosine_scaled_reward": 0.2218025820911862, "rewards/format_reward": 0.8333333395421505, "step": 249 }, { "advantage_max": 1.897703930735588, "advantage_mean": 2.514571070810767e-08, "advantage_min": -0.8732591867446899, "advantage_std": 0.9998427405953407, "completion_length": 1361.4583702087402, "epoch": 0.2857142857142857, "grad_norm": 0.3435612916946411, "kl": 0.008312225341796875, "lambda_div_used": 0.5, "learning_rate": 6.281416799501187e-07, "loss": 0.0003, "reward": 0.05800286494195461, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05800286494195461, "reward_after_std": 0.7511843629181385, "reward_before_mean": 0.6857307204045355, "reward_before_std": 0.6802803799510002, "reward_change_max": 0.0, "reward_change_mean": -0.6277278400957584, "reward_change_min": -1.1124539822340012, "reward_change_std": 0.41588929295539856, "reward_std": 0.7511844001710415, "rewards/cosine_scaled_reward": -0.11546798469498754, "rewards/format_reward": 0.916666679084301, "step": 250 }, { "advantage_max": 1.9394332319498062, "advantage_mean": -1.2417634476236117e-08, "advantage_min": -0.8116177469491959, "advantage_std": 0.9998854398727417, "completion_length": 1301.6875381469727, "epoch": 0.28685714285714287, "grad_norm": 0.26373091340065, "kl": 0.009695053100585938, "lambda_div_used": 0.5, "learning_rate": 6.25045936022246e-07, "loss": 0.0004, "reward": 0.24317791312932968, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.24317791312932968, "reward_after_std": 0.9907942861318588, "reward_before_mean": 0.9422217644751072, "reward_before_std": 0.8384118843823671, "reward_change_max": 0.0, "reward_change_mean": -0.6990438550710678, "reward_change_min": -1.1878609210252762, "reward_change_std": 0.4547945335507393, "reward_std": 0.9907942861318588, "rewards/cosine_scaled_reward": 0.03361086605582386, "rewards/format_reward": 0.8750000074505806, "step": 251 }, { "advantage_max": 1.9412772208452225, "advantage_mean": 1.490116141589226e-08, "advantage_min": -0.8077448084950447, "advantage_std": 0.9998365715146065, "completion_length": 1653.0208587646484, "epoch": 0.288, "grad_norm": 0.22626619040966034, "kl": 0.0092926025390625, "lambda_div_used": 0.5, "learning_rate": 6.219465344613258e-07, "loss": 0.0004, "reward": -0.016541813500225544, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.016541813500225544, "reward_after_std": 0.6735903918743134, "reward_before_mean": 0.5694787334650755, "reward_before_std": 0.5151693597435951, "reward_change_max": 0.0, "reward_change_mean": -0.5860205516219139, "reward_change_min": -0.8795042932033539, "reward_change_std": 0.3404890410602093, "reward_std": 0.6735904067754745, "rewards/cosine_scaled_reward": -0.1110939746722579, "rewards/format_reward": 0.7916666846722364, "step": 252 }, { "advantage_max": 1.8977369666099548, "advantage_mean": 4.9670543234014986e-09, "advantage_min": -0.7935145944356918, "advantage_std": 0.9998346045613289, "completion_length": 1753.9167175292969, "epoch": 0.28914285714285715, "grad_norm": 0.2937052547931671, "kl": 0.013032913208007812, "lambda_div_used": 0.5, "learning_rate": 6.188436263278172e-07, "loss": 0.0005, "reward": 0.07536422368139029, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07536422368139029, "reward_after_std": 0.8575031999498606, "reward_before_mean": 0.6837189728394151, "reward_before_std": 0.8161356411874294, "reward_change_max": 0.0, "reward_change_mean": -0.6083547510206699, "reward_change_min": -1.1546659246087074, "reward_change_std": 0.4495114888995886, "reward_std": 0.8575032278895378, "rewards/cosine_scaled_reward": -0.05397386848926544, "rewards/format_reward": 0.7916666828095913, "step": 253 }, { "advantage_max": 1.874565601348877, "advantage_mean": 5.587935669737476e-09, "advantage_min": -0.8698348104953766, "advantage_std": 0.9998382553458214, "completion_length": 1759.9791946411133, "epoch": 0.29028571428571426, "grad_norm": 0.3259371221065521, "kl": 0.012237548828125, "lambda_div_used": 0.5, "learning_rate": 6.157373628530852e-07, "loss": 0.0005, "reward": -0.029336320236325264, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.029336320236325264, "reward_after_std": 0.7416080571711063, "reward_before_mean": 0.5269271868746728, "reward_before_std": 0.6862723678350449, "reward_change_max": 0.00011439621448516846, "reward_change_mean": -0.5562635250389576, "reward_change_min": -0.9738981239497662, "reward_change_std": 0.38014569878578186, "reward_std": 0.7416080869734287, "rewards/cosine_scaled_reward": -0.142786405980587, "rewards/format_reward": 0.812500013038516, "step": 254 }, { "advantage_max": 1.9653310924768448, "advantage_mean": 3.7873786884468075e-08, "advantage_min": -0.6673059165477753, "advantage_std": 0.9997921586036682, "completion_length": 1813.2917098999023, "epoch": 0.2914285714285714, "grad_norm": 0.22582511603832245, "kl": 0.009113311767578125, "lambda_div_used": 0.5, "learning_rate": 6.126278954320294e-07, "loss": 0.0004, "reward": -0.19223103299736977, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19223103299736977, "reward_after_std": 0.6329327449202538, "reward_before_mean": 0.2562922164797783, "reward_before_std": 0.4785165935754776, "reward_change_max": 0.00114508718252182, "reward_change_mean": -0.4485232476145029, "reward_change_min": -0.7125186286866665, "reward_change_std": 0.26121316477656364, "reward_std": 0.632932759821415, "rewards/cosine_scaled_reward": -0.23643723208806477, "rewards/format_reward": 0.7291666716337204, "step": 255 }, { "advantage_max": 1.922520250082016, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.8391077145934105, "advantage_std": 0.9998262971639633, "completion_length": 1616.6458587646484, "epoch": 0.2925714285714286, "grad_norm": 0.289453387260437, "kl": 0.010000228881835938, "lambda_div_used": 0.5, "learning_rate": 6.095153756157051e-07, "loss": 0.0004, "reward": 0.08426090609282255, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.08426090609282255, "reward_after_std": 0.7107764109969139, "reward_before_mean": 0.7408394683152437, "reward_before_std": 0.5993353240191936, "reward_change_max": 0.0, "reward_change_mean": -0.6565785631537437, "reward_change_min": -1.0525353848934174, "reward_change_std": 0.40571545250713825, "reward_std": 0.7107764147222042, "rewards/cosine_scaled_reward": -0.04624694274389185, "rewards/format_reward": 0.8333333469927311, "step": 256 }, { "advantage_max": 1.7898119240999222, "advantage_mean": -6.519258355375257e-09, "advantage_min": -1.0971575528383255, "advantage_std": 0.9998827576637268, "completion_length": 1864.1250610351562, "epoch": 0.2937142857142857, "grad_norm": 0.25969284772872925, "kl": 0.00867462158203125, "lambda_div_used": 0.5, "learning_rate": 6.06399955103937e-07, "loss": 0.0003, "reward": 0.293114073574543, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.293114073574543, "reward_after_std": 0.9689504988491535, "reward_before_mean": 1.0681517720222473, "reward_before_std": 1.044908344745636, "reward_change_max": 0.0009504184126853943, "reward_change_mean": -0.7750377468764782, "reward_change_min": -1.4173622876405716, "reward_change_std": 0.6037551872432232, "reward_std": 0.9689505062997341, "rewards/cosine_scaled_reward": 0.15907588601112366, "rewards/format_reward": 0.7500000260770321, "step": 257 }, { "advantage_max": 1.913239747285843, "advantage_mean": -2.483527050678447e-09, "advantage_min": -0.7339973300695419, "advantage_std": 0.9998701959848404, "completion_length": 1717.6250534057617, "epoch": 0.2948571428571429, "grad_norm": 0.2277102768421173, "kl": 0.007541656494140625, "lambda_div_used": 0.5, "learning_rate": 6.032817857379256e-07, "loss": 0.0003, "reward": 0.14000254310667515, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14000254310667515, "reward_after_std": 0.9483177699148655, "reward_before_mean": 0.7783935330808163, "reward_before_std": 0.8779289349913597, "reward_change_max": 0.0, "reward_change_mean": -0.6383909955620766, "reward_change_min": -1.2183105871081352, "reward_change_std": 0.4686681590974331, "reward_std": 0.9483177699148655, "rewards/cosine_scaled_reward": -0.03788657521363348, "rewards/format_reward": 0.8541666716337204, "step": 258 }, { "advantage_max": 1.960391491651535, "advantage_mean": -1.1175871339474952e-08, "advantage_min": -0.6969268918037415, "advantage_std": 0.9998453557491302, "completion_length": 1366.3958740234375, "epoch": 0.296, "grad_norm": 0.27390286326408386, "kl": 0.00946807861328125, "lambda_div_used": 0.5, "learning_rate": 6.001610194928464e-07, "loss": 0.0004, "reward": 0.17380837351083755, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.17380837351083755, "reward_after_std": 0.709467351436615, "reward_before_mean": 0.9051290564239025, "reward_before_std": 0.48298365622758865, "reward_change_max": 0.0, "reward_change_mean": -0.7313206605613232, "reward_change_min": -1.1136563420295715, "reward_change_std": 0.40611193887889385, "reward_std": 0.7094673663377762, "rewards/cosine_scaled_reward": 0.015064499340951443, "rewards/format_reward": 0.8750000055879354, "step": 259 }, { "advantage_max": 1.902090847492218, "advantage_mean": -5.551115123125783e-16, "advantage_min": -0.8360799662768841, "advantage_std": 0.9998557269573212, "completion_length": 1141.3958644866943, "epoch": 0.29714285714285715, "grad_norm": 0.27379411458969116, "kl": 0.00658416748046875, "lambda_div_used": 0.5, "learning_rate": 5.97037808470444e-07, "loss": 0.0003, "reward": 0.33953036181628704, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.33953036181628704, "reward_after_std": 0.8164723627269268, "reward_before_mean": 1.1856674198061228, "reward_before_std": 0.6960519719868898, "reward_change_max": 0.00030853599309921265, "reward_change_mean": -0.8461369834840298, "reward_change_min": -1.3422378227114677, "reward_change_std": 0.533370828256011, "reward_std": 0.816472377628088, "rewards/cosine_scaled_reward": 0.10325032752007246, "rewards/format_reward": 0.9791666716337204, "step": 260 }, { "advantage_max": 1.9397707134485245, "advantage_mean": 5.215406695402436e-08, "advantage_min": -0.7806360647082329, "advantage_std": 0.9997915849089622, "completion_length": 2092.395866394043, "epoch": 0.29828571428571427, "grad_norm": 0.2074173092842102, "kl": 0.008243560791015625, "lambda_div_used": 0.5, "learning_rate": 5.939123048916173e-07, "loss": 0.0003, "reward": -0.11741140764206648, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11741140764206648, "reward_after_std": 0.7313024029135704, "reward_before_mean": 0.3657475281506777, "reward_before_std": 0.625818207859993, "reward_change_max": 0.0005079880356788635, "reward_change_mean": -0.4831589162349701, "reward_change_min": -0.8495705388486385, "reward_change_std": 0.30140868201851845, "reward_std": 0.7313024085015059, "rewards/cosine_scaled_reward": -0.11920957826077938, "rewards/format_reward": 0.6041666716337204, "step": 261 }, { "advantage_max": 1.9241239577531815, "advantage_mean": 1.9247333560290514e-08, "advantage_min": -0.7412631884217262, "advantage_std": 0.999830037355423, "completion_length": 1531.395896911621, "epoch": 0.29942857142857143, "grad_norm": 0.2947309613227844, "kl": 0.010721206665039062, "lambda_div_used": 0.5, "learning_rate": 5.907846610890011e-07, "loss": 0.0004, "reward": -0.11896365694701672, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11896365694701672, "reward_after_std": 0.6233756765723228, "reward_before_mean": 0.3966408409178257, "reward_before_std": 0.5298713929951191, "reward_change_max": 0.001282908022403717, "reward_change_mean": -0.5156044885516167, "reward_change_min": -0.9353062510490417, "reward_change_std": 0.3425426837056875, "reward_std": 0.6233757026493549, "rewards/cosine_scaled_reward": -0.20792959071695805, "rewards/format_reward": 0.8125000111758709, "step": 262 }, { "advantage_max": 1.9853202849626541, "advantage_mean": 1.8626452047421083e-08, "advantage_min": -0.6768524274230003, "advantage_std": 0.9998548626899719, "completion_length": 1272.0625228881836, "epoch": 0.30057142857142854, "grad_norm": 0.23958361148834229, "kl": 0.007053375244140625, "lambda_div_used": 0.5, "learning_rate": 5.87655029499542e-07, "loss": 0.0003, "reward": -0.035472466610372066, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.035472466610372066, "reward_after_std": 0.7921707406640053, "reward_before_mean": 0.487296462059021, "reward_before_std": 0.6246878579258919, "reward_change_max": 0.0, "reward_change_mean": -0.5227689333260059, "reward_change_min": -0.8227484747767448, "reward_change_std": 0.2987172771245241, "reward_std": 0.7921707481145859, "rewards/cosine_scaled_reward": -0.23551844991743565, "rewards/format_reward": 0.9583333432674408, "step": 263 }, { "advantage_max": 1.94135420024395, "advantage_mean": -1.738468852208186e-08, "advantage_min": -0.7536604218184948, "advantage_std": 0.99985770881176, "completion_length": 1475.1250305175781, "epoch": 0.3017142857142857, "grad_norm": 0.22166408598423004, "kl": 0.007488250732421875, "lambda_div_used": 0.5, "learning_rate": 5.845235626570683e-07, "loss": 0.0003, "reward": 0.054106075898744166, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.054106075898744166, "reward_after_std": 0.7976480908691883, "reward_before_mean": 0.6590630635619164, "reward_before_std": 0.6566238440573215, "reward_change_max": 0.0, "reward_change_mean": -0.6049569994211197, "reward_change_min": -0.9770181886851788, "reward_change_std": 0.37318576872348785, "reward_std": 0.797648124396801, "rewards/cosine_scaled_reward": -0.11838514357805252, "rewards/format_reward": 0.8958333395421505, "step": 264 }, { "advantage_max": 1.9491015672683716, "advantage_mean": -1.1796752907855534e-08, "advantage_min": -0.708407960832119, "advantage_std": 0.9998629614710808, "completion_length": 1303.1042251586914, "epoch": 0.3028571428571429, "grad_norm": 0.25498053431510925, "kl": 0.008037567138671875, "lambda_div_used": 0.5, "learning_rate": 5.813904131848564e-07, "loss": 0.0003, "reward": 0.2292765413003508, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2292765413003508, "reward_after_std": 0.8382696844637394, "reward_before_mean": 0.9684329330921173, "reward_before_std": 0.6394072566181421, "reward_change_max": 0.0, "reward_change_mean": -0.739156398922205, "reward_change_min": -1.1916795074939728, "reward_change_std": 0.44865376502275467, "reward_std": 0.8382696956396103, "rewards/cosine_scaled_reward": 0.005049763713032007, "rewards/format_reward": 0.9583333358168602, "step": 265 }, { "advantage_max": 1.9380334466695786, "advantage_mean": 8.6923440667519e-09, "advantage_min": -0.7872503623366356, "advantage_std": 0.9998449459671974, "completion_length": 1485.3750305175781, "epoch": 0.304, "grad_norm": 0.24378226697444916, "kl": 0.008502960205078125, "lambda_div_used": 0.5, "learning_rate": 5.78255733788191e-07, "loss": 0.0003, "reward": 0.026207237504422665, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.026207237504422665, "reward_after_std": 0.762421753257513, "reward_before_mean": 0.6195590551942587, "reward_before_std": 0.6429805513471365, "reward_change_max": 1.6391277313232422e-06, "reward_change_mean": -0.5933518260717392, "reward_change_min": -1.0302874110639095, "reward_change_std": 0.3743545040488243, "reward_std": 0.762421753257513, "rewards/cosine_scaled_reward": -0.11730381986126304, "rewards/format_reward": 0.8541666679084301, "step": 266 }, { "advantage_max": 1.9066883474588394, "advantage_mean": 2.23517424569053e-08, "advantage_min": -0.8579533696174622, "advantage_std": 0.9998121857643127, "completion_length": 2088.250015258789, "epoch": 0.30514285714285716, "grad_norm": 0.2329891175031662, "kl": 0.012308120727539062, "lambda_div_used": 0.5, "learning_rate": 5.751196772469237e-07, "loss": 0.0005, "reward": -0.20661406670114957, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.20661406670114957, "reward_after_std": 0.6844783090054989, "reward_before_mean": 0.21980984695255756, "reward_before_std": 0.6176054794341326, "reward_change_max": 0.003687061369419098, "reward_change_mean": -0.42642390355467796, "reward_change_min": -0.7731238566339016, "reward_change_std": 0.3042891379445791, "reward_std": 0.6844783164560795, "rewards/cosine_scaled_reward": -0.192178413271904, "rewards/format_reward": 0.6041666753590107, "step": 267 }, { "advantage_max": 1.9255520403385162, "advantage_mean": -4.967053990334591e-09, "advantage_min": -0.8325854539871216, "advantage_std": 0.9998375400900841, "completion_length": 1344.5625305175781, "epoch": 0.3062857142857143, "grad_norm": 0.31328123807907104, "kl": 0.010145187377929688, "lambda_div_used": 0.5, "learning_rate": 5.71982396408026e-07, "loss": 0.0004, "reward": 0.04860646743327379, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04860646743327379, "reward_after_std": 0.7247105650603771, "reward_before_mean": 0.6737147830426693, "reward_before_std": 0.6382793746888638, "reward_change_max": 0.0, "reward_change_mean": -0.6251083053648472, "reward_change_min": -1.0648128241300583, "reward_change_std": 0.41542051173746586, "reward_std": 0.7247105725109577, "rewards/cosine_scaled_reward": -0.09022596850991249, "rewards/format_reward": 0.8541666753590107, "step": 268 }, { "advantage_max": 1.9525828659534454, "advantage_mean": 2.8560558584800333e-08, "advantage_min": -0.7449537627398968, "advantage_std": 0.9998025745153427, "completion_length": 1480.8333587646484, "epoch": 0.30742857142857144, "grad_norm": 0.2323724776506424, "kl": 0.00814056396484375, "lambda_div_used": 0.5, "learning_rate": 5.688440441781398e-07, "loss": 0.0003, "reward": 0.09112085448578, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09112085448578, "reward_after_std": 0.6920596230775118, "reward_before_mean": 0.763656685128808, "reward_before_std": 0.5157211106270552, "reward_change_max": 0.0005363896489143372, "reward_change_mean": -0.6725358641706407, "reward_change_min": -1.0665729567408562, "reward_change_std": 0.4021889283321798, "reward_std": 0.6920596417039633, "rewards/cosine_scaled_reward": -0.03483833000063896, "rewards/format_reward": 0.8333333358168602, "step": 269 }, { "advantage_max": 1.9452837705612183, "advantage_mean": 1.2417634254191512e-08, "advantage_min": -0.6652894914150238, "advantage_std": 0.9998657330870628, "completion_length": 1606.645866394043, "epoch": 0.30857142857142855, "grad_norm": 0.18927238881587982, "kl": 0.009557723999023438, "lambda_div_used": 0.5, "learning_rate": 5.657047735161255e-07, "loss": 0.0004, "reward": 0.21252715727314353, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21252715727314353, "reward_after_std": 0.8603941760957241, "reward_before_mean": 0.9310585322091356, "reward_before_std": 0.6936173308640718, "reward_change_max": 0.0003393515944480896, "reward_change_mean": -0.7185313515365124, "reward_change_min": -1.1931066066026688, "reward_change_std": 0.4510572552680969, "reward_std": 0.8603942133486271, "rewards/cosine_scaled_reward": 0.03844592347741127, "rewards/format_reward": 0.8541666679084301, "step": 270 }, { "advantage_max": 1.9340698719024658, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.8037978634238243, "advantage_std": 0.9998864904046059, "completion_length": 1319.333381652832, "epoch": 0.3097142857142857, "grad_norm": 0.29956671595573425, "kl": 0.00847625732421875, "lambda_div_used": 0.5, "learning_rate": 5.625647374256061e-07, "loss": 0.0003, "reward": 0.37543570157140493, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.37543570157140493, "reward_after_std": 0.9555936455726624, "reward_before_mean": 1.198203792795539, "reward_before_std": 0.7853490561246872, "reward_change_max": 0.0, "reward_change_mean": -0.8227681331336498, "reward_change_min": -1.3215494900941849, "reward_change_std": 0.5134271755814552, "reward_std": 0.9555936753749847, "rewards/cosine_scaled_reward": 0.15118524804711342, "rewards/format_reward": 0.8958333432674408, "step": 271 }, { "advantage_max": 1.9177703112363815, "advantage_mean": 3.414849514271623e-09, "advantage_min": -0.8598247207701206, "advantage_std": 0.9998156502842903, "completion_length": 1830.6250305175781, "epoch": 0.31085714285714283, "grad_norm": 0.29469630122184753, "kl": 0.011463165283203125, "lambda_div_used": 0.5, "learning_rate": 5.594240889475106e-07, "loss": 0.0005, "reward": -0.05768436938524246, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05768436938524246, "reward_after_std": 0.6672587916254997, "reward_before_mean": 0.4954916397109628, "reward_before_std": 0.5718173142522573, "reward_change_max": 0.0, "reward_change_mean": -0.5531760081648827, "reward_change_min": -0.9284788183867931, "reward_change_std": 0.3455266337841749, "reward_std": 0.6672588251531124, "rewards/cosine_scaled_reward": -0.13767085410654545, "rewards/format_reward": 0.7708333358168602, "step": 272 }, { "advantage_max": 1.9324713498353958, "advantage_mean": 2.1730858223989458e-09, "advantage_min": -0.802878201007843, "advantage_std": 0.9998410195112228, "completion_length": 1315.0000381469727, "epoch": 0.312, "grad_norm": 0.24351400136947632, "kl": 0.00801849365234375, "lambda_div_used": 0.5, "learning_rate": 5.562829811526154e-07, "loss": 0.0003, "reward": 0.13825794821605086, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.13825794821605086, "reward_after_std": 0.7033094502985477, "reward_before_mean": 0.8465951606631279, "reward_before_std": 0.5551106706261635, "reward_change_max": 0.0007760822772979736, "reward_change_mean": -0.7083372462075204, "reward_change_min": -1.0916896015405655, "reward_change_std": 0.4183959634974599, "reward_std": 0.703309491276741, "rewards/cosine_scaled_reward": 0.006630909629166126, "rewards/format_reward": 0.8333333488553762, "step": 273 }, { "advantage_max": 1.9511011093854904, "advantage_mean": 2.6077033199456423e-08, "advantage_min": -0.7403512001037598, "advantage_std": 0.999863512814045, "completion_length": 1045.1458549499512, "epoch": 0.31314285714285717, "grad_norm": 0.28778988122940063, "kl": 0.0078582763671875, "lambda_div_used": 0.5, "learning_rate": 5.531415671340826e-07, "loss": 0.0003, "reward": 0.29660653905011714, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.29660653905011714, "reward_after_std": 0.857206005603075, "reward_before_mean": 1.0843129493296146, "reward_before_std": 0.6621245350688696, "reward_change_max": 0.0, "reward_change_mean": -0.7877063974738121, "reward_change_min": -1.2632982358336449, "reward_change_std": 0.4729926325380802, "reward_std": 0.8572060279548168, "rewards/cosine_scaled_reward": 0.052573127672076225, "rewards/format_reward": 0.9791666716337204, "step": 274 }, { "advantage_max": 1.9466314613819122, "advantage_mean": -1.8626451714354175e-08, "advantage_min": -0.7949444241821766, "advantage_std": 0.999883271753788, "completion_length": 1440.5208740234375, "epoch": 0.3142857142857143, "grad_norm": 0.24470460414886475, "kl": 0.008533477783203125, "lambda_div_used": 0.5, "learning_rate": 5.5e-07, "loss": 0.0003, "reward": 0.39895466226153076, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.39895466226153076, "reward_after_std": 0.9324078634381294, "reward_before_mean": 1.2469081226736307, "reward_before_std": 0.7196089113131166, "reward_change_max": 0.00023962557315826416, "reward_change_mean": -0.8479535095393658, "reward_change_min": -1.2866101413965225, "reward_change_std": 0.509355966001749, "reward_std": 0.9324079155921936, "rewards/cosine_scaled_reward": 0.19637071434408426, "rewards/format_reward": 0.8541666772216558, "step": 275 }, { "advantage_max": 1.9146538376808167, "advantage_mean": 3.104408841103634e-09, "advantage_min": -0.7443368807435036, "advantage_std": 0.9998732730746269, "completion_length": 1471.916732788086, "epoch": 0.31542857142857145, "grad_norm": 0.3321644365787506, "kl": 0.0133209228515625, "lambda_div_used": 0.5, "learning_rate": 5.468584328659172e-07, "loss": 0.0005, "reward": 0.14495250955224037, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14495250955224037, "reward_after_std": 0.9756482355296612, "reward_before_mean": 0.76621616166085, "reward_before_std": 0.8980020098388195, "reward_change_max": 0.0011306777596473694, "reward_change_mean": -0.6212636511772871, "reward_change_min": -1.210908930748701, "reward_change_std": 0.44276667572557926, "reward_std": 0.9756482467055321, "rewards/cosine_scaled_reward": -0.023141922429203987, "rewards/format_reward": 0.8125000055879354, "step": 276 }, { "advantage_max": 1.9444350749254227, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.7333899475634098, "advantage_std": 0.9998810589313507, "completion_length": 1408.020881652832, "epoch": 0.31657142857142856, "grad_norm": 0.34184730052948, "kl": 0.013032913208007812, "lambda_div_used": 0.5, "learning_rate": 5.437170188473847e-07, "loss": 0.0005, "reward": 0.2797253541648388, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2797253541648388, "reward_after_std": 0.9698216766119003, "reward_before_mean": 1.0142830004915595, "reward_before_std": 0.7941651232540607, "reward_change_max": 0.0, "reward_change_mean": -0.734557643532753, "reward_change_min": -1.2394614443182945, "reward_change_std": 0.4602707177400589, "reward_std": 0.9698216803371906, "rewards/cosine_scaled_reward": 0.06964147090911865, "rewards/format_reward": 0.8750000074505806, "step": 277 }, { "advantage_max": 1.9824930280447006, "advantage_mean": -1.2417631367611648e-09, "advantage_min": -0.7233392670750618, "advantage_std": 0.9998651966452599, "completion_length": 1325.6458930969238, "epoch": 0.3177142857142857, "grad_norm": 0.2691362202167511, "kl": 0.0094451904296875, "lambda_div_used": 0.5, "learning_rate": 5.405759110524894e-07, "loss": 0.0004, "reward": 0.3478627223521471, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3478627223521471, "reward_after_std": 0.8299554251134396, "reward_before_mean": 1.1734108105301857, "reward_before_std": 0.5487337484955788, "reward_change_max": 0.0006553307175636292, "reward_change_mean": -0.8255480825901031, "reward_change_min": -1.1466122642159462, "reward_change_std": 0.4497763179242611, "reward_std": 0.8299554325640202, "rewards/cosine_scaled_reward": 0.12837206269614398, "rewards/format_reward": 0.9166666716337204, "step": 278 }, { "advantage_max": 1.9485204070806503, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -0.8334387242794037, "advantage_std": 0.9998544678092003, "completion_length": 1508.8542022705078, "epoch": 0.31885714285714284, "grad_norm": 0.18029850721359253, "kl": 0.0090484619140625, "lambda_div_used": 0.5, "learning_rate": 5.37435262574394e-07, "loss": 0.0004, "reward": 0.13744146656244993, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.13744146656244993, "reward_after_std": 0.8153170794248581, "reward_before_mean": 0.8065908160060644, "reward_before_std": 0.6509456839412451, "reward_change_max": 0.0, "reward_change_mean": -0.6691493093967438, "reward_change_min": -1.0829177498817444, "reward_change_std": 0.40314605459570885, "reward_std": 0.8153170831501484, "rewards/cosine_scaled_reward": -0.06545462599024177, "rewards/format_reward": 0.9375000149011612, "step": 279 }, { "advantage_max": 1.8773256838321686, "advantage_mean": 3.104408619059029e-09, "advantage_min": -0.8730496391654015, "advantage_std": 0.9998912662267685, "completion_length": 1677.208381652832, "epoch": 0.32, "grad_norm": 0.274914026260376, "kl": 0.011020660400390625, "lambda_div_used": 0.5, "learning_rate": 5.342952264838747e-07, "loss": 0.0004, "reward": 0.4246948091313243, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4246948091313243, "reward_after_std": 1.0214713141322136, "reward_before_mean": 1.2751869540661573, "reward_before_std": 0.9706107303500175, "reward_change_max": 0.0, "reward_change_mean": -0.8504920788109303, "reward_change_min": -1.5552897602319717, "reward_change_std": 0.5969200953841209, "reward_std": 1.0214713215827942, "rewards/cosine_scaled_reward": 0.20009343978017569, "rewards/format_reward": 0.875, "step": 280 }, { "advantage_max": 1.9586692303419113, "advantage_mean": 3.104408841103634e-09, "advantage_min": -0.7022324539721012, "advantage_std": 0.9998129606246948, "completion_length": 2271.416702270508, "epoch": 0.3211428571428571, "grad_norm": 0.25421464443206787, "kl": 0.01201629638671875, "lambda_div_used": 0.5, "learning_rate": 5.311559558218603e-07, "loss": 0.0005, "reward": -0.1680087298154831, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1680087298154831, "reward_after_std": 0.7381243333220482, "reward_before_mean": 0.27042799443006516, "reward_before_std": 0.659267095848918, "reward_change_max": 0.0005218088626861572, "reward_change_mean": -0.4384367326274514, "reward_change_min": -0.7785316742956638, "reward_change_std": 0.3023688681423664, "reward_std": 0.73812435567379, "rewards/cosine_scaled_reward": -0.16686933673918247, "rewards/format_reward": 0.6041666679084301, "step": 281 }, { "advantage_max": 1.958927944302559, "advantage_mean": 3.104408619059029e-09, "advantage_min": -0.719094455242157, "advantage_std": 0.9998442083597183, "completion_length": 1348.5208740234375, "epoch": 0.3222857142857143, "grad_norm": 0.23964226245880127, "kl": 0.008609771728515625, "lambda_div_used": 0.5, "learning_rate": 5.28017603591974e-07, "loss": 0.0003, "reward": 0.13726119976490736, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.13726119976490736, "reward_after_std": 0.7930195443332195, "reward_before_mean": 0.81022091768682, "reward_before_std": 0.5720008015632629, "reward_change_max": 0.0, "reward_change_mean": -0.6729597263038158, "reward_change_min": -0.9675496965646744, "reward_change_std": 0.37686675041913986, "reward_std": 0.793019562959671, "rewards/cosine_scaled_reward": -0.05322289373725653, "rewards/format_reward": 0.916666679084301, "step": 282 }, { "advantage_max": 1.9157568961381912, "advantage_mean": -1.2728075482471013e-08, "advantage_min": -0.8272853344678879, "advantage_std": 0.9998943582177162, "completion_length": 1776.1042175292969, "epoch": 0.32342857142857145, "grad_norm": 0.21559320390224457, "kl": 0.008855819702148438, "lambda_div_used": 0.5, "learning_rate": 5.248803227530763e-07, "loss": 0.0004, "reward": 0.44334246404469013, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.44334246404469013, "reward_after_std": 1.0401191785931587, "reward_before_mean": 1.3003057707101107, "reward_before_std": 0.907433059066534, "reward_change_max": 0.00027686357498168945, "reward_change_mean": -0.8569633327424526, "reward_change_min": -1.4866943806409836, "reward_change_std": 0.5639832280576229, "reward_std": 1.0401191860437393, "rewards/cosine_scaled_reward": 0.21265287976711988, "rewards/format_reward": 0.8750000074505806, "step": 283 }, { "advantage_max": 1.926544651389122, "advantage_mean": 8.847564458847046e-09, "advantage_min": -0.7751799561083317, "advantage_std": 0.999853253364563, "completion_length": 1054.770866394043, "epoch": 0.32457142857142857, "grad_norm": 0.34369370341300964, "kl": 0.006954193115234375, "lambda_div_used": 0.5, "learning_rate": 5.21744266211809e-07, "loss": 0.0003, "reward": 0.12165421736426651, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12165421736426651, "reward_after_std": 0.8663975708186626, "reward_before_mean": 0.7515358105301857, "reward_before_std": 0.7445918060839176, "reward_change_max": 0.0019152984023094177, "reward_change_mean": -0.6298815757036209, "reward_change_min": -1.1273687332868576, "reward_change_std": 0.40246682800352573, "reward_std": 0.8663975708186626, "rewards/cosine_scaled_reward": -0.1138154431246221, "rewards/format_reward": 0.9791666716337204, "step": 284 }, { "advantage_max": 1.9379163980484009, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -0.7704954259097576, "advantage_std": 0.9998484328389168, "completion_length": 992.1875305175781, "epoch": 0.32571428571428573, "grad_norm": 0.26117557287216187, "kl": 0.008707046508789062, "lambda_div_used": 0.5, "learning_rate": 5.186095868151436e-07, "loss": 0.0003, "reward": 0.15157003700733185, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.15157003700733185, "reward_after_std": 0.843847218900919, "reward_before_mean": 0.8257862031459808, "reward_before_std": 0.7248164545744658, "reward_change_max": 0.0, "reward_change_mean": -0.6742161959409714, "reward_change_min": -1.11558248847723, "reward_change_std": 0.42356468737125397, "reward_std": 0.8438472338020802, "rewards/cosine_scaled_reward": -0.07669024355709553, "rewards/format_reward": 0.9791666716337204, "step": 285 }, { "advantage_max": 1.927560493350029, "advantage_mean": -1.2728075149404106e-08, "advantage_min": -0.8058949783444405, "advantage_std": 0.9998480826616287, "completion_length": 1363.7083740234375, "epoch": 0.32685714285714285, "grad_norm": 0.27410030364990234, "kl": 0.006923675537109375, "lambda_div_used": 0.5, "learning_rate": 5.154764373429315e-07, "loss": 0.0003, "reward": 0.14017292112112045, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14017292112112045, "reward_after_std": 0.7733436599373817, "reward_before_mean": 0.8225498106330633, "reward_before_std": 0.6446866802871227, "reward_change_max": 0.0, "reward_change_mean": -0.6823768839240074, "reward_change_min": -1.1488277539610863, "reward_change_std": 0.4256325364112854, "reward_std": 0.7733436599373817, "rewards/cosine_scaled_reward": -0.057475125417113304, "rewards/format_reward": 0.9375, "step": 286 }, { "advantage_max": 1.9195478856563568, "advantage_mean": 1.6142925107764938e-08, "advantage_min": -0.7916990965604782, "advantage_std": 0.9998394548892975, "completion_length": 1224.8958625793457, "epoch": 0.328, "grad_norm": 0.29854050278663635, "kl": 0.0105438232421875, "lambda_div_used": 0.5, "learning_rate": 5.123449705004581e-07, "loss": 0.0004, "reward": 0.09376479079946876, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.09376479079946876, "reward_after_std": 0.7171185463666916, "reward_before_mean": 0.7573619782924652, "reward_before_std": 0.5836118310689926, "reward_change_max": 0.0, "reward_change_mean": -0.66359718516469, "reward_change_min": -1.1100826933979988, "reward_change_std": 0.4130848478525877, "reward_std": 0.7171185612678528, "rewards/cosine_scaled_reward": -0.02756902575492859, "rewards/format_reward": 0.8125000074505806, "step": 287 }, { "advantage_max": 1.9400405883789062, "advantage_mean": -1.2417632477834672e-09, "advantage_min": -0.7356844134628773, "advantage_std": 0.9998426660895348, "completion_length": 1373.5000457763672, "epoch": 0.3291428571428571, "grad_norm": 0.22060362994670868, "kl": 0.009063720703125, "lambda_div_used": 0.5, "learning_rate": 5.09215338910999e-07, "loss": 0.0004, "reward": -0.04765269602648914, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.04765269602648914, "reward_after_std": 0.7224587984383106, "reward_before_mean": 0.50075370259583, "reward_before_std": 0.622645877301693, "reward_change_max": 0.0, "reward_change_mean": -0.5484064146876335, "reward_change_min": -1.0198993384838104, "reward_change_std": 0.3653989788144827, "reward_std": 0.7224588245153427, "rewards/cosine_scaled_reward": -0.21837315894663334, "rewards/format_reward": 0.9375000074505806, "step": 288 }, { "advantage_max": 1.9686802327632904, "advantage_mean": -8.692344177774203e-09, "advantage_min": -0.7215582430362701, "advantage_std": 0.9998724237084389, "completion_length": 1411.6250381469727, "epoch": 0.3302857142857143, "grad_norm": 0.3080616891384125, "kl": 0.0121307373046875, "lambda_div_used": 0.5, "learning_rate": 5.060876951083828e-07, "loss": 0.0005, "reward": 0.1366715773474425, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1366715773474425, "reward_after_std": 0.8801541402935982, "reward_before_mean": 0.7729975432157516, "reward_before_std": 0.7023350726813078, "reward_change_max": 0.0006599947810173035, "reward_change_mean": -0.6363259479403496, "reward_change_min": -1.0092889964580536, "reward_change_std": 0.3881163038313389, "reward_std": 0.880154199898243, "rewards/cosine_scaled_reward": -0.030167920514941216, "rewards/format_reward": 0.8333333395421505, "step": 289 }, { "advantage_max": 1.9003676027059555, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.7423394173383713, "advantage_std": 0.9998863041400909, "completion_length": 1045.5416984558105, "epoch": 0.3314285714285714, "grad_norm": 0.3107107877731323, "kl": 0.0094146728515625, "lambda_div_used": 0.5, "learning_rate": 5.02962191529556e-07, "loss": 0.0004, "reward": 0.42231632210314274, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.42231632210314274, "reward_after_std": 1.0070499442517757, "reward_before_mean": 1.2733629271388054, "reward_before_std": 0.8752043275162578, "reward_change_max": 0.0, "reward_change_mean": -0.851046584546566, "reward_change_min": -1.5561627894639969, "reward_change_std": 0.5719406455755234, "reward_std": 1.007049947977066, "rewards/cosine_scaled_reward": 0.1575147584080696, "rewards/format_reward": 0.9583333358168602, "step": 290 }, { "advantage_max": 1.961781457066536, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.7081636115908623, "advantage_std": 0.9998603463172913, "completion_length": 1178.5625305175781, "epoch": 0.3325714285714286, "grad_norm": 0.2246701866388321, "kl": 0.0083160400390625, "lambda_div_used": 0.5, "learning_rate": 4.998389805071536e-07, "loss": 0.0003, "reward": 0.13762659142958, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13762659142958, "reward_after_std": 0.8188576325774193, "reward_before_mean": 0.8014777625649003, "reward_before_std": 0.6381174903362989, "reward_change_max": 0.0, "reward_change_mean": -0.66385118663311, "reward_change_min": -1.0244802385568619, "reward_change_std": 0.3820841684937477, "reward_std": 0.8188576474785805, "rewards/cosine_scaled_reward": -0.07842779252678156, "rewards/format_reward": 0.9583333358168602, "step": 291 }, { "advantage_max": 1.919129803776741, "advantage_mean": -4.113341445233232e-09, "advantage_min": -0.8278881087899208, "advantage_std": 0.9998489618301392, "completion_length": 1414.7917098999023, "epoch": 0.33371428571428574, "grad_norm": 0.2348564714193344, "kl": 0.009510040283203125, "lambda_div_used": 0.5, "learning_rate": 4.967182142620745e-07, "loss": 0.0004, "reward": 0.1004533120431006, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1004533120431006, "reward_after_std": 0.7959478050470352, "reward_before_mean": 0.7488336265087128, "reward_before_std": 0.6854219231754541, "reward_change_max": 0.0, "reward_change_mean": -0.6483803391456604, "reward_change_min": -1.0386323034763336, "reward_change_std": 0.4167475663125515, "reward_std": 0.7959478460252285, "rewards/cosine_scaled_reward": -0.07349985092878342, "rewards/format_reward": 0.8958333432674408, "step": 292 }, { "advantage_max": 1.9526210129261017, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.8244255632162094, "advantage_std": 0.9998187869787216, "completion_length": 1080.5000228881836, "epoch": 0.33485714285714285, "grad_norm": 0.24422034621238708, "kl": 0.009332656860351562, "lambda_div_used": 0.5, "learning_rate": 4.93600044896063e-07, "loss": 0.0004, "reward": 0.24010583432391286, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24010583432391286, "reward_after_std": 0.6077372506260872, "reward_before_mean": 1.0529423654079437, "reward_before_std": 0.36602981574833393, "reward_change_max": 0.0, "reward_change_mean": -0.8128365054726601, "reward_change_min": -1.1496832966804504, "reward_change_std": 0.43168095126748085, "reward_std": 0.6077372655272484, "rewards/cosine_scaled_reward": 0.03688783012330532, "rewards/format_reward": 0.9791666716337204, "step": 293 }, { "advantage_max": 1.90102319419384, "advantage_mean": 8.07146260939362e-09, "advantage_min": -0.7834720239043236, "advantage_std": 0.9998085051774979, "completion_length": 1687.7083740234375, "epoch": 0.336, "grad_norm": 0.2405546009540558, "kl": 0.01241302490234375, "lambda_div_used": 0.5, "learning_rate": 4.904846243842949e-07, "loss": 0.0005, "reward": -0.004911705851554871, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.004911705851554871, "reward_after_std": 0.7069712989032269, "reward_before_mean": 0.583304937928915, "reward_before_std": 0.6382516892626882, "reward_change_max": 0.0, "reward_change_mean": -0.5882166475057602, "reward_change_min": -1.0201143249869347, "reward_change_std": 0.40058210119605064, "reward_std": 0.7069713175296783, "rewards/cosine_scaled_reward": -0.09376422129571438, "rewards/format_reward": 0.7708333432674408, "step": 294 }, { "advantage_max": 1.8850695043802261, "advantage_mean": -1.490116174895917e-08, "advantage_min": -0.8308575078845024, "advantage_std": 0.9998729452490807, "completion_length": 1527.8542022705078, "epoch": 0.33714285714285713, "grad_norm": 0.25586745142936707, "kl": 0.01103973388671875, "lambda_div_used": 0.5, "learning_rate": 4.873721045679706e-07, "loss": 0.0004, "reward": 0.36468934011645615, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.36468934011645615, "reward_after_std": 0.9241388477385044, "reward_before_mean": 1.1961545972153544, "reward_before_std": 0.8213022798299789, "reward_change_max": 0.0, "reward_change_mean": -0.8314652666449547, "reward_change_min": -1.457765981554985, "reward_change_std": 0.5527912154793739, "reward_std": 0.9241388812661171, "rewards/cosine_scaled_reward": 0.1605772953480482, "rewards/format_reward": 0.8750000074505806, "step": 295 }, { "advantage_max": 1.8951444178819656, "advantage_mean": -1.1331091731570098e-08, "advantage_min": -0.9481487721204758, "advantage_std": 0.9998307302594185, "completion_length": 1485.3125610351562, "epoch": 0.3382857142857143, "grad_norm": 0.2452612966299057, "kl": 0.00897216796875, "lambda_div_used": 0.5, "learning_rate": 4.842626371469149e-07, "loss": 0.0004, "reward": 0.08178290724754333, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08178290724754333, "reward_after_std": 0.6763971261680126, "reward_before_mean": 0.7581503801047802, "reward_before_std": 0.6051702238619328, "reward_change_max": 0.0, "reward_change_mean": -0.6763674877583981, "reward_change_min": -1.109766460955143, "reward_change_std": 0.4316418059170246, "reward_std": 0.6763971447944641, "rewards/cosine_scaled_reward": -0.08967481926083565, "rewards/format_reward": 0.9375000149011612, "step": 296 }, { "advantage_max": 1.9076000899076462, "advantage_mean": 1.459072151988039e-08, "advantage_min": -0.8135585486888885, "advantage_std": 0.9998480305075645, "completion_length": 1972.2292022705078, "epoch": 0.3394285714285714, "grad_norm": 0.27767202258110046, "kl": 0.015995025634765625, "lambda_div_used": 0.5, "learning_rate": 4.811563736721829e-07, "loss": 0.0006, "reward": 0.09356885030865669, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09356885030865669, "reward_after_std": 0.7759539633989334, "reward_before_mean": 0.7381240706890821, "reward_before_std": 0.6730439588427544, "reward_change_max": 0.0, "reward_change_mean": -0.6445552557706833, "reward_change_min": -1.1503963880240917, "reward_change_std": 0.4329614248126745, "reward_std": 0.7759539783000946, "rewards/cosine_scaled_reward": -0.016354622319340706, "rewards/format_reward": 0.7708333488553762, "step": 297 }, { "advantage_max": 1.9864699989557266, "advantage_mean": -1.9557773955902746e-08, "advantage_min": -0.670455165207386, "advantage_std": 0.9998606741428375, "completion_length": 1538.7500534057617, "epoch": 0.3405714285714286, "grad_norm": 0.21603329479694366, "kl": 0.008434295654296875, "lambda_div_used": 0.5, "learning_rate": 4.780534655386743e-07, "loss": 0.0003, "reward": 0.1843671938404441, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1843671938404441, "reward_after_std": 0.8246655911207199, "reward_before_mean": 0.878794476389885, "reward_before_std": 0.5901540853083134, "reward_change_max": 0.0, "reward_change_mean": -0.6944273039698601, "reward_change_min": -1.0942930579185486, "reward_change_std": 0.39840497076511383, "reward_std": 0.8246655985713005, "rewards/cosine_scaled_reward": -0.029352783225476742, "rewards/format_reward": 0.9375, "step": 298 }, { "advantage_max": 1.9436794072389603, "advantage_mean": -9.313225746154785e-09, "advantage_min": -0.7619923055171967, "advantage_std": 0.9998595044016838, "completion_length": 1167.5416870117188, "epoch": 0.3417142857142857, "grad_norm": 0.22107698023319244, "kl": 0.008447647094726562, "lambda_div_used": 0.5, "learning_rate": 4.749540639777539e-07, "loss": 0.0003, "reward": 0.1298121795989573, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1298121795989573, "reward_after_std": 0.7819931656122208, "reward_before_mean": 0.803707379847765, "reward_before_std": 0.6334694363176823, "reward_change_max": 0.0, "reward_change_mean": -0.6738951951265335, "reward_change_min": -1.084967590868473, "reward_change_std": 0.4023057296872139, "reward_std": 0.781993180513382, "rewards/cosine_scaled_reward": -0.08772965613752604, "rewards/format_reward": 0.9791666716337204, "step": 299 }, { "advantage_max": 1.9793277829885483, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.6793212965130806, "advantage_std": 0.9998233988881111, "completion_length": 1506.3333702087402, "epoch": 0.34285714285714286, "grad_norm": 0.31695958971977234, "kl": 0.0128173828125, "lambda_div_used": 0.5, "learning_rate": 4.7185832004988133e-07, "loss": 0.0005, "reward": 0.02915547788143158, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.02915547788143158, "reward_after_std": 0.6308148801326752, "reward_before_mean": 0.6603523679077625, "reward_before_std": 0.42772908695042133, "reward_change_max": 0.0, "reward_change_mean": -0.6311969514936209, "reward_change_min": -0.9433031231164932, "reward_change_std": 0.35703119821846485, "reward_std": 0.6308148987591267, "rewards/cosine_scaled_reward": -0.09690714068710804, "rewards/format_reward": 0.8541666716337204, "step": 300 }, { "advantage_max": 1.9420295059680939, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.7559764087200165, "advantage_std": 0.9998279139399529, "completion_length": 1254.3125457763672, "epoch": 0.344, "grad_norm": 0.2822733223438263, "kl": 0.014049530029296875, "lambda_div_used": 0.5, "learning_rate": 4.68766384637248e-07, "loss": 0.0006, "reward": 0.03332954691722989, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.03332954691722989, "reward_after_std": 0.6856257766485214, "reward_before_mean": 0.652209609746933, "reward_before_std": 0.5623303577303886, "reward_change_max": 0.0, "reward_change_mean": -0.6188800632953644, "reward_change_min": -1.0174327939748764, "reward_change_std": 0.3780553489923477, "reward_std": 0.6856257915496826, "rewards/cosine_scaled_reward": -0.1426452063024044, "rewards/format_reward": 0.9375000149011612, "step": 301 }, { "advantage_max": 1.8940949887037277, "advantage_mean": 1.0865430444262358e-08, "advantage_min": -0.8010035902261734, "advantage_std": 0.9998576045036316, "completion_length": 1526.37504196167, "epoch": 0.34514285714285714, "grad_norm": 0.3930858075618744, "kl": 0.01357269287109375, "lambda_div_used": 0.5, "learning_rate": 4.656784084364238e-07, "loss": 0.0005, "reward": 0.1928790423553437, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1928790423553437, "reward_after_std": 0.8109844736754894, "reward_before_mean": 0.9092248368542641, "reward_before_std": 0.6983633888885379, "reward_change_max": 0.0008866190910339355, "reward_change_mean": -0.7163458056747913, "reward_change_min": -1.235180925577879, "reward_change_std": 0.4744630251079798, "reward_std": 0.8109845034778118, "rewards/cosine_scaled_reward": 0.06919574737548828, "rewards/format_reward": 0.7708333376795053, "step": 302 }, { "advantage_max": 1.8796220421791077, "advantage_mean": -3.6476800815976596e-09, "advantage_min": -0.8963010087609291, "advantage_std": 0.9998479634523392, "completion_length": 1107.0208740234375, "epoch": 0.3462857142857143, "grad_norm": 0.3062381148338318, "kl": 0.009563446044921875, "lambda_div_used": 0.5, "learning_rate": 4.6259454195101267e-07, "loss": 0.0004, "reward": 0.2196194063872099, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2196194063872099, "reward_after_std": 0.7703746408224106, "reward_before_mean": 0.9780805017799139, "reward_before_std": 0.7009712867438793, "reward_change_max": 0.0, "reward_change_mean": -0.7584610693156719, "reward_change_min": -1.1881650909781456, "reward_change_std": 0.47840745374560356, "reward_std": 0.7703746594488621, "rewards/cosine_scaled_reward": 0.009873565286397934, "rewards/format_reward": 0.9583333432674408, "step": 303 }, { "advantage_max": 1.9196258336305618, "advantage_mean": -2.6387469986843826e-09, "advantage_min": -0.7958096042275429, "advantage_std": 0.9998370930552483, "completion_length": 1217.895851135254, "epoch": 0.3474285714285714, "grad_norm": 0.27057960629463196, "kl": 0.010829925537109375, "lambda_div_used": 0.5, "learning_rate": 4.59514935484316e-07, "loss": 0.0004, "reward": 0.10770251415669918, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10770251415669918, "reward_after_std": 0.754070594906807, "reward_before_mean": 0.7725160010159016, "reward_before_std": 0.6370177734643221, "reward_change_max": 0.0, "reward_change_mean": -0.6648134812712669, "reward_change_min": -1.0958143062889576, "reward_change_std": 0.4159948546439409, "reward_std": 0.7540706358850002, "rewards/cosine_scaled_reward": -0.05124201602302492, "rewards/format_reward": 0.875, "step": 304 }, { "advantage_max": 1.9056095480918884, "advantage_mean": 1.2728075482471013e-08, "advantage_min": -0.8551415503025055, "advantage_std": 0.9998487681150436, "completion_length": 1382.4375381469727, "epoch": 0.3485714285714286, "grad_norm": 0.3034442961215973, "kl": 0.0097503662109375, "lambda_div_used": 0.5, "learning_rate": 4.5643973913200837e-07, "loss": 0.0004, "reward": 0.05656467331573367, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.05656467331573367, "reward_after_std": 0.785954438149929, "reward_before_mean": 0.6694810688495636, "reward_before_std": 0.7176000475883484, "reward_change_max": 0.0, "reward_change_mean": -0.6129163838922977, "reward_change_min": -1.1551690623164177, "reward_change_std": 0.4239853620529175, "reward_std": 0.7859544530510902, "rewards/cosine_scaled_reward": -0.10275947768241167, "rewards/format_reward": 0.8750000223517418, "step": 305 }, { "advantage_max": 1.929152637720108, "advantage_mean": -3.725290964595729e-09, "advantage_min": -0.8490537628531456, "advantage_std": 0.9998709782958031, "completion_length": 1078.0625228881836, "epoch": 0.3497142857142857, "grad_norm": 0.3089045584201813, "kl": 0.0123291015625, "lambda_div_used": 0.5, "learning_rate": 4.5336910277482155e-07, "loss": 0.0005, "reward": 0.45340352691709995, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.45340352691709995, "reward_after_std": 0.8783687688410282, "reward_before_mean": 1.3650562167167664, "reward_before_std": 0.6945241689682007, "reward_change_max": 0.0, "reward_change_mean": -0.9116526544094086, "reward_change_min": -1.4638760089874268, "reward_change_std": 0.5430373214185238, "reward_std": 0.8783687688410282, "rewards/cosine_scaled_reward": 0.20336140575818717, "rewards/format_reward": 0.9583333432674408, "step": 306 }, { "advantage_max": 1.928852841258049, "advantage_mean": -6.829699139565548e-09, "advantage_min": -0.7465885579586029, "advantage_std": 0.9998717904090881, "completion_length": 1224.1875457763672, "epoch": 0.35085714285714287, "grad_norm": 0.29672563076019287, "kl": 0.010662078857421875, "lambda_div_used": 0.5, "learning_rate": 4.503031760712397e-07, "loss": 0.0004, "reward": 0.26565046235919, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26565046235919, "reward_after_std": 0.9444398619234562, "reward_before_mean": 1.0031846947968006, "reward_before_std": 0.8245226852595806, "reward_change_max": 0.0, "reward_change_mean": -0.7375341951847076, "reward_change_min": -1.283432550728321, "reward_change_std": 0.4830533228814602, "reward_std": 0.9444398768246174, "rewards/cosine_scaled_reward": 0.03284231084398925, "rewards/format_reward": 0.9375000074505806, "step": 307 }, { "advantage_max": 1.9126890301704407, "advantage_mean": -3.1044089521259366e-09, "advantage_min": -0.7414450347423553, "advantage_std": 0.9998699352145195, "completion_length": 1937.0000610351562, "epoch": 0.352, "grad_norm": 0.21619656682014465, "kl": 0.014070510864257812, "lambda_div_used": 0.5, "learning_rate": 4.4724210845020494e-07, "loss": 0.0006, "reward": 0.15780878346413374, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.15780878346413374, "reward_after_std": 0.9465266540646553, "reward_before_mean": 0.8064774088561535, "reward_before_std": 0.8910163529217243, "reward_change_max": 0.0, "reward_change_mean": -0.6486686486750841, "reward_change_min": -1.1572811380028725, "reward_change_std": 0.45672182738780975, "reward_std": 0.9465266764163971, "rewards/cosine_scaled_reward": -0.023844645358622074, "rewards/format_reward": 0.8541666697710752, "step": 308 }, { "advantage_max": 1.9088157713413239, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.7664674371480942, "advantage_std": 0.999833844602108, "completion_length": 1647.5000457763672, "epoch": 0.35314285714285715, "grad_norm": 0.27004313468933105, "kl": 0.013214111328125, "lambda_div_used": 0.5, "learning_rate": 4.441860491038345e-07, "loss": 0.0005, "reward": 0.02324374718591571, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.02324374718591571, "reward_after_std": 0.7216699905693531, "reward_before_mean": 0.6317189037799835, "reward_before_std": 0.6191937811672688, "reward_change_max": 0.0001543685793876648, "reward_change_mean": -0.6084751673042774, "reward_change_min": -1.0604383125901222, "reward_change_std": 0.3989776838570833, "reward_std": 0.721670001745224, "rewards/cosine_scaled_reward": -0.10080722998827696, "rewards/format_reward": 0.8333333414047956, "step": 309 }, { "advantage_max": 1.9126400649547577, "advantage_mean": 4.967054212379196e-09, "advantage_min": -0.8069312274456024, "advantage_std": 0.9998295158147812, "completion_length": 1398.7917022705078, "epoch": 0.35428571428571426, "grad_norm": 0.30042949318885803, "kl": 0.01859283447265625, "lambda_div_used": 0.5, "learning_rate": 4.4113514698014953e-07, "loss": 0.0007, "reward": 0.017530305543914437, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.017530305543914437, "reward_after_std": 0.6874769181013107, "reward_before_mean": 0.626902480609715, "reward_before_std": 0.5805229172110558, "reward_change_max": 0.0, "reward_change_mean": -0.6093721650540829, "reward_change_min": -1.0393969900906086, "reward_change_std": 0.387352529913187, "reward_std": 0.6874769255518913, "rewards/cosine_scaled_reward": -0.12404878530651331, "rewards/format_reward": 0.8750000037252903, "step": 310 }, { "advantage_max": 1.9714401960372925, "advantage_mean": -1.3038516266661304e-08, "advantage_min": -0.6754600629210472, "advantage_std": 0.9998734816908836, "completion_length": 1114.9375228881836, "epoch": 0.3554285714285714, "grad_norm": 0.2700650990009308, "kl": 0.009521484375, "lambda_div_used": 0.5, "learning_rate": 4.3808955077581546e-07, "loss": 0.0004, "reward": 0.30486697098240256, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.30486697098240256, "reward_after_std": 0.9124359712004662, "reward_before_mean": 1.0774826928973198, "reward_before_std": 0.6932089999318123, "reward_change_max": 0.0, "reward_change_mean": -0.7726157456636429, "reward_change_min": -1.2057588621973991, "reward_change_std": 0.45093817077577114, "reward_std": 0.9124359861016273, "rewards/cosine_scaled_reward": 0.0491579994559288, "rewards/format_reward": 0.9791666716337204, "step": 311 }, { "advantage_max": 1.9676008075475693, "advantage_mean": -1.2417635808503746e-09, "advantage_min": -0.6602420620620251, "advantage_std": 0.9998646304011345, "completion_length": 1168.2708587646484, "epoch": 0.3565714285714286, "grad_norm": 0.23943375051021576, "kl": 0.00844573974609375, "lambda_div_used": 0.5, "learning_rate": 4.350494089288943e-07, "loss": 0.0003, "reward": 0.44616672629490495, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.44616672629490495, "reward_after_std": 0.8134817592799664, "reward_before_mean": 1.368921009125188, "reward_before_std": 0.495204322040081, "reward_change_max": 0.0, "reward_change_mean": -0.922754317522049, "reward_change_min": -1.3639105558395386, "reward_change_std": 0.5105717405676842, "reward_std": 0.8134817853569984, "rewards/cosine_scaled_reward": 0.22612717002630234, "rewards/format_reward": 0.9166666679084301, "step": 312 }, { "advantage_max": 1.9355697929859161, "advantage_mean": -1.241763458725842e-08, "advantage_min": -0.6672081351280212, "advantage_std": 0.9998272061347961, "completion_length": 1695.708366394043, "epoch": 0.3577142857142857, "grad_norm": 0.24104565382003784, "kl": 0.01529693603515625, "lambda_div_used": 0.5, "learning_rate": 4.3201486961161093e-07, "loss": 0.0006, "reward": 0.009465799666941166, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.009465799666941166, "reward_after_std": 0.7200150936841965, "reward_before_mean": 0.6010144352912903, "reward_before_std": 0.5657152414787561, "reward_change_max": 0.0007646605372428894, "reward_change_mean": -0.5915486626327038, "reward_change_min": -0.921930406242609, "reward_change_std": 0.3610827811062336, "reward_std": 0.7200151309370995, "rewards/cosine_scaled_reward": -0.06407611817121506, "rewards/format_reward": 0.7291666716337204, "step": 313 }, { "advantage_max": 1.918437272310257, "advantage_mean": -2.483526828633842e-09, "advantage_min": -0.9093864634633064, "advantage_std": 0.9998309463262558, "completion_length": 1344.1875228881836, "epoch": 0.3588571428571429, "grad_norm": 0.31180670857429504, "kl": 0.013317108154296875, "lambda_div_used": 0.5, "learning_rate": 4.2898608072313045e-07, "loss": 0.0005, "reward": 0.20816711336374283, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20816711336374283, "reward_after_std": 0.7800188288092613, "reward_before_mean": 0.9478923492133617, "reward_before_std": 0.6674759928137064, "reward_change_max": 0.0, "reward_change_mean": -0.739725261926651, "reward_change_min": -1.2057388499379158, "reward_change_std": 0.4727477263659239, "reward_std": 0.7800188288092613, "rewards/cosine_scaled_reward": 0.07811282994225621, "rewards/format_reward": 0.7916666679084301, "step": 314 }, { "advantage_max": 1.9160043001174927, "advantage_mean": 4.346172532976311e-09, "advantage_min": -0.7502782195806503, "advantage_std": 0.9998539909720421, "completion_length": 1994.2708892822266, "epoch": 0.36, "grad_norm": 0.31821468472480774, "kl": 0.029644012451171875, "lambda_div_used": 0.5, "learning_rate": 4.2596318988235037e-07, "loss": 0.0012, "reward": 0.11614364665001631, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11614364665001631, "reward_after_std": 0.8035916015505791, "reward_before_mean": 0.7763093619141728, "reward_before_std": 0.6844161916524172, "reward_change_max": 0.0012766644358634949, "reward_change_mean": -0.6601656787097454, "reward_change_min": -1.1163662187755108, "reward_change_std": 0.44189007207751274, "reward_std": 0.8035916239023209, "rewards/cosine_scaled_reward": 0.013154652551747859, "rewards/format_reward": 0.7500000111758709, "step": 315 }, { "advantage_max": 1.9073859602212906, "advantage_mean": 3.6011140736036396e-08, "advantage_min": -0.8746335953474045, "advantage_std": 0.9998039901256561, "completion_length": 2026.1042556762695, "epoch": 0.36114285714285715, "grad_norm": 0.33457982540130615, "kl": 0.023563385009765625, "lambda_div_used": 0.5, "learning_rate": 4.2294634442070553e-07, "loss": 0.0009, "reward": -0.21840902511030436, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21840902511030436, "reward_after_std": 0.5321423746645451, "reward_before_mean": 0.24390754383057356, "reward_before_std": 0.46077893674373627, "reward_change_max": 0.000392720103263855, "reward_change_mean": -0.46231655217707157, "reward_change_min": -0.7747361436486244, "reward_change_std": 0.29503229446709156, "reward_std": 0.5321423932909966, "rewards/cosine_scaled_reward": -0.24262958019971848, "rewards/format_reward": 0.7291666865348816, "step": 316 }, { "advantage_max": 1.9794776886701584, "advantage_mean": 1.1175871006408045e-08, "advantage_min": -0.7000977098941803, "advantage_std": 0.9998548403382301, "completion_length": 1663.4167175292969, "epoch": 0.36228571428571427, "grad_norm": 0.34562116861343384, "kl": 0.0205841064453125, "lambda_div_used": 0.5, "learning_rate": 4.1993569137498776e-07, "loss": 0.0008, "reward": 0.04382408410310745, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04382408410310745, "reward_after_std": 0.7862890549004078, "reward_before_mean": 0.6326780554954894, "reward_before_std": 0.602029662579298, "reward_change_max": 0.0, "reward_change_mean": -0.5888539738953114, "reward_change_min": -0.8865768276154995, "reward_change_std": 0.3419474679976702, "reward_std": 0.7862890884280205, "rewards/cosine_scaled_reward": -0.02741097833495587, "rewards/format_reward": 0.6875000074505806, "step": 317 }, { "advantage_max": 1.9762675315141678, "advantage_mean": -2.5456151353520085e-08, "advantage_min": -0.6477275937795639, "advantage_std": 0.9998506307601929, "completion_length": 1250.791706085205, "epoch": 0.36342857142857143, "grad_norm": 0.3656529188156128, "kl": 0.02227783203125, "lambda_div_used": 0.5, "learning_rate": 4.1693137748017915e-07, "loss": 0.0009, "reward": 0.05467936210334301, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05467936210334301, "reward_after_std": 0.8251577764749527, "reward_before_mean": 0.6445684731006622, "reward_before_std": 0.6484403479844332, "reward_change_max": 0.0, "reward_change_mean": -0.5898891296237707, "reward_change_min": -0.9933161847293377, "reward_change_std": 0.3516414873301983, "reward_std": 0.8251578062772751, "rewards/cosine_scaled_reward": -0.1256324439891614, "rewards/format_reward": 0.895833333954215, "step": 318 }, { "advantage_max": 1.9682789146900177, "advantage_mean": 8.07146260939362e-09, "advantage_min": -0.7039951980113983, "advantage_std": 0.9998318552970886, "completion_length": 1496.895881652832, "epoch": 0.36457142857142855, "grad_norm": 0.3081127107143402, "kl": 0.011016845703125, "lambda_div_used": 0.5, "learning_rate": 4.1393354916230005e-07, "loss": 0.0004, "reward": -0.004789367318153381, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.004789367318153381, "reward_after_std": 0.6943408586084843, "reward_before_mean": 0.5780723858624697, "reward_before_std": 0.5280365757644176, "reward_change_max": 0.0, "reward_change_mean": -0.5828617438673973, "reward_change_min": -0.9427573829889297, "reward_change_std": 0.335221491754055, "reward_std": 0.6943408660590649, "rewards/cosine_scaled_reward": -0.14846382848918438, "rewards/format_reward": 0.8750000074505806, "step": 319 }, { "advantage_max": 1.9441251009702682, "advantage_mean": -1.6142924885720333e-08, "advantage_min": -0.7833335176110268, "advantage_std": 0.9998515844345093, "completion_length": 1153.4791870117188, "epoch": 0.3657142857142857, "grad_norm": 0.3333378732204437, "kl": 0.0160675048828125, "lambda_div_used": 0.5, "learning_rate": 4.1094235253127374e-07, "loss": 0.0006, "reward": 0.20300496055278927, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20300496055278927, "reward_after_std": 0.7820871509611607, "reward_before_mean": 0.9307644553482533, "reward_before_std": 0.6208079401403666, "reward_change_max": 0.0, "reward_change_mean": -0.7277594916522503, "reward_change_min": -1.1636041551828384, "reward_change_std": 0.4276655428111553, "reward_std": 0.7820871770381927, "rewards/cosine_scaled_reward": -0.0033677939791232347, "rewards/format_reward": 0.9375, "step": 320 }, { "advantage_max": 1.9495942294597626, "advantage_mean": -1.614292477469803e-08, "advantage_min": -0.7533436268568039, "advantage_std": 0.9998676404356956, "completion_length": 1021.6666946411133, "epoch": 0.3668571428571429, "grad_norm": 0.2943817377090454, "kl": 0.00847625732421875, "lambda_div_used": 0.5, "learning_rate": 4.079579333738039e-07, "loss": 0.0003, "reward": 0.3986971661215648, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3986971661215648, "reward_after_std": 0.8560882285237312, "reward_before_mean": 1.268271841108799, "reward_before_std": 0.6319062225520611, "reward_change_max": 0.0, "reward_change_mean": -0.8695746436715126, "reward_change_min": -1.3420241624116898, "reward_change_std": 0.5012955367565155, "reward_std": 0.8560882434248924, "rewards/cosine_scaled_reward": 0.13413588888943195, "rewards/format_reward": 1.0, "step": 321 }, { "advantage_max": 1.9425024837255478, "advantage_mean": 1.0554989438027462e-08, "advantage_min": -0.6427893787622452, "advantage_std": 0.9998832494020462, "completion_length": 1422.2292137145996, "epoch": 0.368, "grad_norm": 0.36897826194763184, "kl": 0.026554107666015625, "lambda_div_used": 0.5, "learning_rate": 4.0498043714627006e-07, "loss": 0.0011, "reward": 0.06227410305291414, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06227410305291414, "reward_after_std": 1.005720667541027, "reward_before_mean": 0.6011641100049019, "reward_before_std": 0.9237909056246281, "reward_change_max": 0.0, "reward_change_mean": -0.5388900116086006, "reward_change_min": -1.156266689300537, "reward_change_std": 0.4076250493526459, "reward_std": 1.0057206749916077, "rewards/cosine_scaled_reward": -0.11608462547883391, "rewards/format_reward": 0.8333333432674408, "step": 322 }, { "advantage_max": 1.95187209546566, "advantage_mean": -2.0799538202886936e-08, "advantage_min": -0.8534364998340607, "advantage_std": 0.9998331591486931, "completion_length": 1464.7500305175781, "epoch": 0.36914285714285716, "grad_norm": 0.40443792939186096, "kl": 0.021869659423828125, "lambda_div_used": 0.5, "learning_rate": 4.020100089676376e-07, "loss": 0.0009, "reward": 0.11423857533372939, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11423857533372939, "reward_after_std": 0.6722201742231846, "reward_before_mean": 0.8066409444436431, "reward_before_std": 0.49490773119032383, "reward_change_max": 0.0024028271436691284, "reward_change_mean": -0.6924023889005184, "reward_change_min": -1.0461347699165344, "reward_change_std": 0.40142657794058323, "reward_std": 0.6722202003002167, "rewards/cosine_scaled_reward": -0.013346204534173012, "rewards/format_reward": 0.8333333395421505, "step": 323 }, { "advantage_max": 1.8911200314760208, "advantage_mean": 1.3038516377683607e-08, "advantage_min": -0.9142744615674019, "advantage_std": 0.9998573586344719, "completion_length": 1039.583366394043, "epoch": 0.3702857142857143, "grad_norm": 0.3049314618110657, "kl": 0.01213836669921875, "lambda_div_used": 0.5, "learning_rate": 3.9904679361238526e-07, "loss": 0.0005, "reward": 0.03350969776511192, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03350969776511192, "reward_after_std": 0.7819260433316231, "reward_before_mean": 0.6313423737883568, "reward_before_std": 0.7142659351229668, "reward_change_max": 0.0011547952890396118, "reward_change_mean": -0.5978326722979546, "reward_change_min": -1.0388338789343834, "reward_change_std": 0.3994863033294678, "reward_std": 0.7819260433316231, "rewards/cosine_scaled_reward": -0.1426621489226818, "rewards/format_reward": 0.9166666865348816, "step": 324 }, { "advantage_max": 1.9915322363376617, "advantage_mean": 2.980232371996294e-08, "advantage_min": -0.6144200935959816, "advantage_std": 0.9998652711510658, "completion_length": 1653.1250305175781, "epoch": 0.37142857142857144, "grad_norm": 0.26348766684532166, "kl": 0.014278411865234375, "lambda_div_used": 0.5, "learning_rate": 3.9609093550344907e-07, "loss": 0.0006, "reward": 0.17073870450258255, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17073870450258255, "reward_after_std": 0.8961613662540913, "reward_before_mean": 0.835974670946598, "reward_before_std": 0.6641275025904179, "reward_change_max": 0.0009778067469596863, "reward_change_mean": -0.6652359329164028, "reward_change_min": -1.0335832685232162, "reward_change_std": 0.38337370892986655, "reward_std": 0.8961613737046719, "rewards/cosine_scaled_reward": 0.02215397759573534, "rewards/format_reward": 0.7916666679084301, "step": 325 }, { "advantage_max": 1.895740658044815, "advantage_mean": 1.6142924996742636e-08, "advantage_min": -0.8704542517662048, "advantage_std": 0.9998702183365822, "completion_length": 1409.4375305175781, "epoch": 0.37257142857142855, "grad_norm": 0.43456870317459106, "kl": 0.022167205810546875, "lambda_div_used": 0.5, "learning_rate": 3.931425787051832e-07, "loss": 0.0009, "reward": 0.27470920979976654, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.27470920979976654, "reward_after_std": 0.9201184548437595, "reward_before_mean": 1.0299480855464935, "reward_before_std": 0.8468910120427608, "reward_change_max": 0.0, "reward_change_mean": -0.7552388813346624, "reward_change_min": -1.3375147059559822, "reward_change_std": 0.5060300789773464, "reward_std": 0.9201184548437595, "rewards/cosine_scaled_reward": 0.0983073660172522, "rewards/format_reward": 0.8333333469927311, "step": 326 }, { "advantage_max": 1.9163780510425568, "advantage_mean": -5.2774945524802774e-09, "advantage_min": -0.8060482665896416, "advantage_std": 0.9998558238148689, "completion_length": 1544.0000305175781, "epoch": 0.3737142857142857, "grad_norm": 0.24153485894203186, "kl": 0.01259613037109375, "lambda_div_used": 0.5, "learning_rate": 3.902018669163384e-07, "loss": 0.0005, "reward": 0.2656072140671313, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2656072140671313, "reward_after_std": 0.7916287072002888, "reward_before_mean": 1.0449545159935951, "reward_before_std": 0.6116082668304443, "reward_change_max": 0.0, "reward_change_mean": -0.7793473340570927, "reward_change_min": -1.2160406894981861, "reward_change_std": 0.4596743304282427, "reward_std": 0.7916287481784821, "rewards/cosine_scaled_reward": 0.11622725054621696, "rewards/format_reward": 0.8125000074505806, "step": 327 }, { "advantage_max": 1.9556438773870468, "advantage_mean": 9.623666974434286e-09, "advantage_min": -0.7209471166133881, "advantage_std": 0.999858446419239, "completion_length": 1625.8958587646484, "epoch": 0.37485714285714283, "grad_norm": 0.4186374545097351, "kl": 0.0170440673828125, "lambda_div_used": 0.5, "learning_rate": 3.872689434630585e-07, "loss": 0.0007, "reward": -0.05386994406580925, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05386994406580925, "reward_after_std": 0.8116550669074059, "reward_before_mean": 0.45288407802581787, "reward_before_std": 0.7180973216891289, "reward_change_max": 0.00030559301376342773, "reward_change_mean": -0.5067540071904659, "reward_change_min": -0.9800437428057194, "reward_change_std": 0.35215629637241364, "reward_std": 0.8116550669074059, "rewards/cosine_scaled_reward": -0.15897464100271463, "rewards/format_reward": 0.770833345130086, "step": 328 }, { "advantage_max": 1.9007090032100677, "advantage_mean": 1.4280280180578586e-08, "advantage_min": -0.9063881933689117, "advantage_std": 0.9998352080583572, "completion_length": 1053.7916793823242, "epoch": 0.376, "grad_norm": 0.29686838388442993, "kl": 0.012237548828125, "lambda_div_used": 0.5, "learning_rate": 3.843439512918949e-07, "loss": 0.0005, "reward": 0.19925944739952683, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19925944739952683, "reward_after_std": 0.7242099829018116, "reward_before_mean": 0.9510072600096464, "reward_before_std": 0.6169087514281273, "reward_change_max": 0.0, "reward_change_mean": -0.7517478205263615, "reward_change_min": -1.2093592062592506, "reward_change_std": 0.4579017572104931, "reward_std": 0.7242099903523922, "rewards/cosine_scaled_reward": -0.003663059324026108, "rewards/format_reward": 0.9583333432674408, "step": 329 }, { "advantage_max": 1.9330773949623108, "advantage_mean": -4.346172088887101e-09, "advantage_min": -0.779715783894062, "advantage_std": 0.9998436868190765, "completion_length": 1192.6667175292969, "epoch": 0.37714285714285717, "grad_norm": 0.44872358441352844, "kl": 0.020023345947265625, "lambda_div_used": 0.5, "learning_rate": 3.8142703296283953e-07, "loss": 0.0008, "reward": 0.06654795771464705, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06654795771464705, "reward_after_std": 0.7801444493234158, "reward_before_mean": 0.6826637480407953, "reward_before_std": 0.673134308308363, "reward_change_max": 0.0, "reward_change_mean": -0.616115789860487, "reward_change_min": -1.0302981063723564, "reward_change_std": 0.39053851924836636, "reward_std": 0.7801444493234158, "rewards/cosine_scaled_reward": -0.11700147949159145, "rewards/format_reward": 0.916666679084301, "step": 330 }, { "advantage_max": 1.943891003727913, "advantage_mean": 6.208816905051151e-09, "advantage_min": -0.7582138329744339, "advantage_std": 0.9998082295060158, "completion_length": 1751.770866394043, "epoch": 0.3782857142857143, "grad_norm": 0.44164517521858215, "kl": 0.018161773681640625, "lambda_div_used": 0.5, "learning_rate": 3.785183306423767e-07, "loss": 0.0007, "reward": -0.15124160097911954, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15124160097911954, "reward_after_std": 0.6162938810884953, "reward_before_mean": 0.33426031470298767, "reward_before_std": 0.49191057682037354, "reward_change_max": 0.0008933767676353455, "reward_change_mean": -0.4855019422248006, "reward_change_min": -0.7947644628584385, "reward_change_std": 0.29183477628976107, "reward_std": 0.6162938885390759, "rewards/cosine_scaled_reward": -0.18703650496900082, "rewards/format_reward": 0.7083333414047956, "step": 331 }, { "advantage_max": 1.9274078607559204, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.6919441595673561, "advantage_std": 0.9998692721128464, "completion_length": 1333.5208587646484, "epoch": 0.37942857142857145, "grad_norm": 0.3751254975795746, "kl": 0.015411376953125, "lambda_div_used": 0.5, "learning_rate": 3.7561798609655373e-07, "loss": 0.0006, "reward": 0.12781556928530335, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12781556928530335, "reward_after_std": 0.889508455991745, "reward_before_mean": 0.7664818149060011, "reward_before_std": 0.8011388294398785, "reward_change_max": 0.0011112168431282043, "reward_change_mean": -0.6386662609875202, "reward_change_min": -1.1745759025216103, "reward_change_std": 0.44522836804389954, "reward_std": 0.8895084857940674, "rewards/cosine_scaled_reward": -0.05425910046324134, "rewards/format_reward": 0.8750000037252903, "step": 332 }, { "advantage_max": 1.9375999569892883, "advantage_mean": 2.374872591637267e-08, "advantage_min": -0.7582232654094696, "advantage_std": 0.9998498037457466, "completion_length": 1146.5208740234375, "epoch": 0.38057142857142856, "grad_norm": 0.27129772305488586, "kl": 0.011959075927734375, "lambda_div_used": 0.5, "learning_rate": 3.72726140684072e-07, "loss": 0.0005, "reward": 0.1619847072288394, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1619847072288394, "reward_after_std": 0.80493438616395, "reward_before_mean": 0.8567570652812719, "reward_before_std": 0.6830338407307863, "reward_change_max": 0.0, "reward_change_mean": -0.6947723813354969, "reward_change_min": -1.1889399215579033, "reward_change_std": 0.442412793636322, "reward_std": 0.8049344308674335, "rewards/cosine_scaled_reward": -0.050788127817213535, "rewards/format_reward": 0.9583333358168602, "step": 333 }, { "advantage_max": 1.9378287494182587, "advantage_mean": 3.539025866805545e-08, "advantage_min": -0.7852133959531784, "advantage_std": 0.9998596906661987, "completion_length": 1976.9167175292969, "epoch": 0.38171428571428573, "grad_norm": 0.42113620042800903, "kl": 0.026947021484375, "lambda_div_used": 0.5, "learning_rate": 3.6984293534939737e-07, "loss": 0.0011, "reward": -0.13296086061745882, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13296086061745882, "reward_after_std": 0.834507130086422, "reward_before_mean": 0.300143308006227, "reward_before_std": 0.76963946595788, "reward_change_max": 0.0035352930426597595, "reward_change_mean": -0.43310416489839554, "reward_change_min": -0.8661820441484451, "reward_change_std": 0.3252943940460682, "reward_std": 0.8345071524381638, "rewards/cosine_scaled_reward": -0.19367835018783808, "rewards/format_reward": 0.6875000093132257, "step": 334 }, { "advantage_max": 1.897423803806305, "advantage_mean": -2.1109979209121832e-08, "advantage_min": -0.8362590447068214, "advantage_std": 0.9998617917299271, "completion_length": 1236.3750534057617, "epoch": 0.38285714285714284, "grad_norm": 0.4480448067188263, "kl": 0.0216522216796875, "lambda_div_used": 0.5, "learning_rate": 3.6696851061588994e-07, "loss": 0.0009, "reward": 0.21662542037665844, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.21662542037665844, "reward_after_std": 0.8816086277365685, "reward_before_mean": 0.9350887164473534, "reward_before_std": 0.8172469660639763, "reward_change_max": 0.0, "reward_change_mean": -0.7184633240103722, "reward_change_min": -1.356583371758461, "reward_change_std": 0.49653930589556694, "reward_std": 0.8816086612641811, "rewards/cosine_scaled_reward": 0.009211016818881035, "rewards/format_reward": 0.9166666716337204, "step": 335 }, { "advantage_max": 1.9454235136508942, "advantage_mean": -3.4148496252939253e-09, "advantage_min": -0.799261599779129, "advantage_std": 0.9998770728707314, "completion_length": 1380.0209121704102, "epoch": 0.384, "grad_norm": 0.43987566232681274, "kl": 0.016510009765625, "lambda_div_used": 0.5, "learning_rate": 3.641030065789562e-07, "loss": 0.0007, "reward": 0.2827145103365183, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2827145103365183, "reward_after_std": 0.9053149372339249, "reward_before_mean": 1.0367488861083984, "reward_before_std": 0.719157699495554, "reward_change_max": 0.0, "reward_change_mean": -0.7540343515574932, "reward_change_min": -1.2173069790005684, "reward_change_std": 0.4567888453602791, "reward_std": 0.9053149521350861, "rewards/cosine_scaled_reward": 0.10170776396989822, "rewards/format_reward": 0.8333333432674408, "step": 336 }, { "advantage_max": 1.9325546622276306, "advantage_mean": 3.1044086745701804e-09, "advantage_min": -0.7538226917386055, "advantage_std": 0.9998637139797211, "completion_length": 1289.333381652832, "epoch": 0.3851428571428571, "grad_norm": 0.3083600103855133, "kl": 0.012012481689453125, "lambda_div_used": 0.5, "learning_rate": 3.612465628992203e-07, "loss": 0.0005, "reward": 0.17610874178353697, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17610874178353697, "reward_after_std": 0.9225818663835526, "reward_before_mean": 0.8412223365157843, "reward_before_std": 0.7876620050519705, "reward_change_max": 0.0, "reward_change_mean": -0.6651136055588722, "reward_change_min": -1.143676407635212, "reward_change_std": 0.41685409285128117, "reward_std": 0.9225818961858749, "rewards/cosine_scaled_reward": -0.0689721773378551, "rewards/format_reward": 0.9791666716337204, "step": 337 }, { "advantage_max": 1.9723588973283768, "advantage_mean": -1.8626452602532595e-08, "advantage_min": -0.734128400683403, "advantage_std": 0.9998419284820557, "completion_length": 1366.9375457763672, "epoch": 0.3862857142857143, "grad_norm": 0.34599220752716064, "kl": 0.01892852783203125, "lambda_div_used": 0.5, "learning_rate": 3.5839931879571725e-07, "loss": 0.0008, "reward": 0.2359230676665902, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2359230676665902, "reward_after_std": 0.7228243090212345, "reward_before_mean": 1.0056013464927673, "reward_before_std": 0.4889524318277836, "reward_change_max": 0.001291126012802124, "reward_change_mean": -0.7696783049032092, "reward_change_min": -1.112348735332489, "reward_change_std": 0.42689187824726105, "reward_std": 0.7228243388235569, "rewards/cosine_scaled_reward": 0.05488398531451821, "rewards/format_reward": 0.895833333954215, "step": 338 }, { "advantage_max": 1.8810593783855438, "advantage_mean": 1.2417634476236117e-08, "advantage_min": -0.88125079870224, "advantage_std": 0.9998427778482437, "completion_length": 1647.0000305175781, "epoch": 0.38742857142857146, "grad_norm": 0.32401907444000244, "kl": 0.026760101318359375, "lambda_div_used": 0.5, "learning_rate": 3.555614130391079e-07, "loss": 0.0011, "reward": 0.010632646270096302, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.010632646270096302, "reward_after_std": 0.7342008873820305, "reward_before_mean": 0.5953217758797109, "reward_before_std": 0.6697694882750511, "reward_change_max": 0.0, "reward_change_mean": -0.5846891477704048, "reward_change_min": -0.9830212518572807, "reward_change_std": 0.38205210864543915, "reward_std": 0.7342009283602238, "rewards/cosine_scaled_reward": -0.12942244857549667, "rewards/format_reward": 0.8541666865348816, "step": 339 }, { "advantage_max": 1.9713415503501892, "advantage_mean": 2.4835267176115394e-09, "advantage_min": -0.7005641125142574, "advantage_std": 0.9998614490032196, "completion_length": 1372.1875305175781, "epoch": 0.38857142857142857, "grad_norm": 0.24274250864982605, "kl": 0.012508392333984375, "lambda_div_used": 0.5, "learning_rate": 3.5273298394491515e-07, "loss": 0.0005, "reward": 0.056589219719171524, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.056589219719171524, "reward_after_std": 0.856462549418211, "reward_before_mean": 0.6404929962009192, "reward_before_std": 0.7010131273418665, "reward_change_max": 0.0, "reward_change_mean": -0.5839037746191025, "reward_change_min": -0.9369787387549877, "reward_change_std": 0.3534848652780056, "reward_std": 0.8564625568687916, "rewards/cosine_scaled_reward": -0.11725351912900805, "rewards/format_reward": 0.8750000055879354, "step": 340 }, { "advantage_max": 1.9696991741657257, "advantage_mean": -1.9247334503980085e-08, "advantage_min": -0.6668812446296215, "advantage_std": 0.9998695105314255, "completion_length": 1139.9792022705078, "epoch": 0.38971428571428574, "grad_norm": 0.27058425545692444, "kl": 0.01445770263671875, "lambda_div_used": 0.5, "learning_rate": 3.4991416936678276e-07, "loss": 0.0006, "reward": 0.24952181614935398, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24952181614935398, "reward_after_std": 0.9109466038644314, "reward_before_mean": 0.9754927828907967, "reward_before_std": 0.6992852129042149, "reward_change_max": 0.0, "reward_change_mean": -0.725970946252346, "reward_change_min": -1.183552272617817, "reward_change_std": 0.4466256331652403, "reward_std": 0.9109466522932053, "rewards/cosine_scaled_reward": 0.03982970770448446, "rewards/format_reward": 0.8958333358168602, "step": 341 }, { "advantage_max": 1.898781567811966, "advantage_mean": -1.2417635808503746e-09, "advantage_min": -0.8452117443084717, "advantage_std": 0.9998630881309509, "completion_length": 1458.0833435058594, "epoch": 0.39085714285714285, "grad_norm": 0.3426169157028198, "kl": 0.0243072509765625, "lambda_div_used": 0.5, "learning_rate": 3.471051066897562e-07, "loss": 0.001, "reward": 0.13850300945341587, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13850300945341587, "reward_after_std": 0.8392068706452847, "reward_before_mean": 0.800941426306963, "reward_before_std": 0.7777084633708, "reward_change_max": 8.752942085266113e-05, "reward_change_mean": -0.662438403815031, "reward_change_min": -1.175816796720028, "reward_change_std": 0.4522414803504944, "reward_std": 0.8392068967223167, "rewards/cosine_scaled_reward": -0.047445970587432384, "rewards/format_reward": 0.8958333432674408, "step": 342 }, { "advantage_max": 1.9382910281419754, "advantage_mean": -1.117587122845265e-08, "advantage_min": -0.7976772412657738, "advantage_std": 0.9998707920312881, "completion_length": 1346.5000305175781, "epoch": 0.392, "grad_norm": 0.32439133524894714, "kl": 0.014873504638671875, "lambda_div_used": 0.5, "learning_rate": 3.4430593282358777e-07, "loss": 0.0006, "reward": 0.21847632061690092, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.21847632061690092, "reward_after_std": 0.9025693461298943, "reward_before_mean": 0.9282007124274969, "reward_before_std": 0.7880385238677263, "reward_change_max": 0.0, "reward_change_mean": -0.7097243778407574, "reward_change_min": -1.207001969218254, "reward_change_std": 0.4577541835606098, "reward_std": 0.9025693461298943, "rewards/cosine_scaled_reward": -0.00464966893196106, "rewards/format_reward": 0.9375000149011612, "step": 343 }, { "advantage_max": 1.971865400671959, "advantage_mean": -3.414849514271623e-08, "advantage_min": -0.7260220609605312, "advantage_std": 0.9998649135231972, "completion_length": 1340.458351135254, "epoch": 0.3931428571428571, "grad_norm": 0.25376710295677185, "kl": 0.017496109008789062, "lambda_div_used": 0.5, "learning_rate": 3.4151678419606233e-07, "loss": 0.0007, "reward": 0.4441331517882645, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4441331517882645, "reward_after_std": 0.8279833234846592, "reward_before_mean": 1.3637418150901794, "reward_before_std": 0.5514478012919426, "reward_change_max": 0.0, "reward_change_mean": -0.9196087047457695, "reward_change_min": -1.3315959051251411, "reward_change_std": 0.5144347231835127, "reward_std": 0.8279833309352398, "rewards/cosine_scaled_reward": 0.2339542363770306, "rewards/format_reward": 0.895833333954215, "step": 344 }, { "advantage_max": 1.8874634951353073, "advantage_mean": -1.4901161637936866e-08, "advantage_min": -0.8752384632825851, "advantage_std": 0.9998615980148315, "completion_length": 1664.3125610351562, "epoch": 0.3942857142857143, "grad_norm": 0.4946304261684418, "kl": 0.0244293212890625, "lambda_div_used": 0.5, "learning_rate": 3.387377967463493e-07, "loss": 0.001, "reward": 0.14852892188355327, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.14852892188355327, "reward_after_std": 0.8501206710934639, "reward_before_mean": 0.8226657584309578, "reward_before_std": 0.7914411835372448, "reward_change_max": 0.0013569816946983337, "reward_change_mean": -0.6741368658840656, "reward_change_min": -1.1830965094268322, "reward_change_std": 0.467490840703249, "reward_std": 0.8501207120716572, "rewards/cosine_scaled_reward": 0.0050828717648983, "rewards/format_reward": 0.812500013038516, "step": 345 }, { "advantage_max": 1.9018942415714264, "advantage_mean": 6.208816238917336e-10, "advantage_min": -0.8773997947573662, "advantage_std": 0.9998459815979004, "completion_length": 1397.2500228881836, "epoch": 0.3954285714285714, "grad_norm": 0.2937193214893341, "kl": 0.01808929443359375, "lambda_div_used": 0.5, "learning_rate": 3.359691059183761e-07, "loss": 0.0007, "reward": 0.0976130670751445, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0976130670751445, "reward_after_std": 0.7293423525989056, "reward_before_mean": 0.7637905050069094, "reward_before_std": 0.6217326875776052, "reward_change_max": 0.00037025660276412964, "reward_change_mean": -0.6661774702370167, "reward_change_min": -1.1007812693715096, "reward_change_std": 0.4196817334741354, "reward_std": 0.729342382401228, "rewards/cosine_scaled_reward": -0.07643807306885719, "rewards/format_reward": 0.9166666865348816, "step": 346 }, { "advantage_max": 1.9513923674821854, "advantage_mean": 6.3640377412355065e-09, "advantage_min": -0.766438364982605, "advantage_std": 0.9998519346117973, "completion_length": 1442.7083587646484, "epoch": 0.3965714285714286, "grad_norm": 0.3007684051990509, "kl": 0.01589202880859375, "lambda_div_used": 0.5, "learning_rate": 3.3321084665422803e-07, "loss": 0.0006, "reward": 0.014781358651816845, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.014781358651816845, "reward_after_std": 0.7751567997038364, "reward_before_mean": 0.5895684882998466, "reward_before_std": 0.6202553771436214, "reward_change_max": 0.001266680657863617, "reward_change_mean": -0.5747871249914169, "reward_change_min": -0.8554559350013733, "reward_change_std": 0.33717326261103153, "reward_std": 0.7751568369567394, "rewards/cosine_scaled_reward": -0.17396577447652817, "rewards/format_reward": 0.9375000149011612, "step": 347 }, { "advantage_max": 1.9060746431350708, "advantage_mean": 1.0477379408513343e-08, "advantage_min": -0.8555322960019112, "advantage_std": 0.9998508542776108, "completion_length": 1647.4792251586914, "epoch": 0.3977142857142857, "grad_norm": 0.518498420715332, "kl": 0.033458709716796875, "lambda_div_used": 0.5, "learning_rate": 3.3046315338757026e-07, "loss": 0.0013, "reward": 0.14236977510154247, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14236977510154247, "reward_after_std": 0.7683847695589066, "reward_before_mean": 0.834890816360712, "reward_before_std": 0.6821971833705902, "reward_change_max": 0.0012637749314308167, "reward_change_mean": -0.6925210300832987, "reward_change_min": -1.134917676448822, "reward_change_std": 0.45355450361967087, "reward_std": 0.7683847993612289, "rewards/cosine_scaled_reward": 0.04244539514183998, "rewards/format_reward": 0.750000013038516, "step": 348 }, { "advantage_max": 1.8973789811134338, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.843739926815033, "advantage_std": 0.9998513013124466, "completion_length": 1307.8125305175781, "epoch": 0.39885714285714285, "grad_norm": 0.30172500014305115, "kl": 0.019256591796875, "lambda_div_used": 0.5, "learning_rate": 3.2772616003709616e-07, "loss": 0.0008, "reward": 0.258987728622742, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.258987728622742, "reward_after_std": 0.80018550157547, "reward_before_mean": 1.0375699400901794, "reward_before_std": 0.7033422328531742, "reward_change_max": 0.0, "reward_change_mean": -0.7785822227597237, "reward_change_min": -1.2887407094240189, "reward_change_std": 0.4977263957262039, "reward_std": 0.80018550157547, "rewards/cosine_scaled_reward": 0.03961828793399036, "rewards/format_reward": 0.9583333432674408, "step": 349 }, { "advantage_max": 1.965741217136383, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.6967620141804218, "advantage_std": 0.9998733699321747, "completion_length": 961.3958740234375, "epoch": 0.4, "grad_norm": 0.3641263246536255, "kl": 0.0122528076171875, "lambda_div_used": 0.5, "learning_rate": 3.250000000000001e-07, "loss": 0.0005, "reward": 0.1289899628609419, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1289899628609419, "reward_after_std": 0.9607405439019203, "reward_before_mean": 0.7350360294803977, "reward_before_std": 0.7971515450626612, "reward_change_max": 0.0033571943640708923, "reward_change_mean": -0.6060460731387138, "reward_change_min": -1.0657524466514587, "reward_change_std": 0.3789667785167694, "reward_std": 0.9607405923306942, "rewards/cosine_scaled_reward": -0.12206532340496778, "rewards/format_reward": 0.9791666716337204, "step": 350 }, { "advantage_max": 1.9018863588571548, "advantage_mean": 9.235616160729876e-09, "advantage_min": -0.7558177262544632, "advantage_std": 0.999875046312809, "completion_length": 1295.708366394043, "epoch": 0.40114285714285713, "grad_norm": 0.340808242559433, "kl": 0.018463134765625, "lambda_div_used": 0.5, "learning_rate": 3.222848061454764e-07, "loss": 0.0007, "reward": 0.23417375516146421, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23417375516146421, "reward_after_std": 0.9156857281923294, "reward_before_mean": 0.9571765847504139, "reward_before_std": 0.858966302126646, "reward_change_max": 0.0011729896068572998, "reward_change_mean": -0.7230028323829174, "reward_change_min": -1.339646216481924, "reward_change_std": 0.5007602255791426, "reward_std": 0.9156857430934906, "rewards/cosine_scaled_reward": 0.04108827468007803, "rewards/format_reward": 0.8750000055879354, "step": 351 }, { "advantage_max": 1.9002157002687454, "advantage_mean": 9.934107758624577e-09, "advantage_min": -0.7841279283165932, "advantage_std": 0.9998011961579323, "completion_length": 1612.3125305175781, "epoch": 0.4022857142857143, "grad_norm": 0.6093102097511292, "kl": 0.02661895751953125, "lambda_div_used": 0.5, "learning_rate": 3.195807108082429e-07, "loss": 0.0011, "reward": -0.020187399117276073, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.020187399117276073, "reward_after_std": 0.5602889284491539, "reward_before_mean": 0.5988853393937461, "reward_before_std": 0.4055903349071741, "reward_change_max": 0.0, "reward_change_mean": -0.6190727520734072, "reward_change_min": -0.973639614880085, "reward_change_std": 0.3668802008032799, "reward_std": 0.5602889433503151, "rewards/cosine_scaled_reward": -0.06514065247029066, "rewards/format_reward": 0.7291666697710752, "step": 352 }, { "advantage_max": 1.9760214239358902, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.647670142352581, "advantage_std": 0.9998504817485809, "completion_length": 980.2083702087402, "epoch": 0.4034285714285714, "grad_norm": 0.40995076298713684, "kl": 0.00830078125, "lambda_div_used": 0.5, "learning_rate": 3.168878457820915e-07, "loss": 0.0003, "reward": 0.362104510422796, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.362104510422796, "reward_after_std": 0.7626262679696083, "reward_before_mean": 1.230547845363617, "reward_before_std": 0.48653218522667885, "reward_change_max": 0.0, "reward_change_mean": -0.8684433326125145, "reward_change_min": -1.3135455027222633, "reward_change_std": 0.4772064797580242, "reward_std": 0.7626262977719307, "rewards/cosine_scaled_reward": 0.11527392640709877, "rewards/format_reward": 1.0, "step": 353 }, { "advantage_max": 1.9692391902208328, "advantage_mean": -1.241763691872677e-09, "advantage_min": -0.7419257685542107, "advantage_std": 0.9998397529125214, "completion_length": 1016.020866394043, "epoch": 0.4045714285714286, "grad_norm": 0.2783952057361603, "kl": 0.013134002685546875, "lambda_div_used": 0.5, "learning_rate": 3.142063423134644e-07, "loss": 0.0005, "reward": 0.2590313320979476, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2590313320979476, "reward_after_std": 0.7170304656028748, "reward_before_mean": 1.0561553873121738, "reward_before_std": 0.4702160977758467, "reward_change_max": 0.0, "reward_change_mean": -0.7971240431070328, "reward_change_min": -1.1697766482830048, "reward_change_std": 0.4459417313337326, "reward_std": 0.7170304767787457, "rewards/cosine_scaled_reward": 0.05932767526246607, "rewards/format_reward": 0.9375000149011612, "step": 354 }, { "advantage_max": 1.9122939109802246, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.7931451685726643, "advantage_std": 0.9998869970440865, "completion_length": 1000.5625305175781, "epoch": 0.4057142857142857, "grad_norm": 0.2880876064300537, "kl": 0.010530471801757812, "lambda_div_used": 0.5, "learning_rate": 3.115363310950578e-07, "loss": 0.0004, "reward": 0.256419240264222, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.256419240264222, "reward_after_std": 0.9639424160122871, "reward_before_mean": 0.9679636843502522, "reward_before_std": 0.8596122078597546, "reward_change_max": 0.0, "reward_change_mean": -0.7115443907678127, "reward_change_min": -1.2375174909830093, "reward_change_std": 0.46681670658290386, "reward_std": 0.9639424830675125, "rewards/cosine_scaled_reward": -0.005601532757282257, "rewards/format_reward": 0.9791666716337204, "step": 355 }, { "advantage_max": 1.9162172675132751, "advantage_mean": -1.8626451603331873e-08, "advantage_min": -0.8011289536952972, "advantage_std": 0.999864473938942, "completion_length": 1473.395851135254, "epoch": 0.40685714285714286, "grad_norm": 0.27427616715431213, "kl": 0.026065826416015625, "lambda_div_used": 0.5, "learning_rate": 3.0887794225945143e-07, "loss": 0.001, "reward": 0.15733138285577297, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.15733138285577297, "reward_after_std": 0.8644469156861305, "reward_before_mean": 0.8291059145703912, "reward_before_std": 0.7782079391181469, "reward_change_max": 0.0, "reward_change_mean": -0.6717745885252953, "reward_change_min": -1.1893984526395798, "reward_change_std": 0.45820096507668495, "reward_std": 0.8644469529390335, "rewards/cosine_scaled_reward": -0.022947038523852825, "rewards/format_reward": 0.875, "step": 356 }, { "advantage_max": 1.9351384490728378, "advantage_mean": 2.1730860333413204e-08, "advantage_min": -0.8235296234488487, "advantage_std": 0.9998565465211868, "completion_length": 1561.2500457763672, "epoch": 0.408, "grad_norm": 0.3253626823425293, "kl": 0.024013519287109375, "lambda_div_used": 0.5, "learning_rate": 3.062313053727671e-07, "loss": 0.001, "reward": -0.026991624385118484, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.026991624385118484, "reward_after_std": 0.8327379450201988, "reward_before_mean": 0.4940991383045912, "reward_before_std": 0.7405038252472878, "reward_change_max": 0.0, "reward_change_mean": -0.5210907440632582, "reward_change_min": -0.9241106547415257, "reward_change_std": 0.3496807739138603, "reward_std": 0.83273795992136, "rewards/cosine_scaled_reward": -0.13836710306350142, "rewards/format_reward": 0.7708333488553762, "step": 357 }, { "advantage_max": 1.906896635890007, "advantage_mean": 3.104408619059029e-09, "advantage_min": -0.7559650018811226, "advantage_std": 0.9998820126056671, "completion_length": 1381.4166793823242, "epoch": 0.40914285714285714, "grad_norm": 0.24099020659923553, "kl": 0.015361785888671875, "lambda_div_used": 0.5, "learning_rate": 3.0359654942835247e-07, "loss": 0.0006, "reward": 0.367542517837137, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.367542517837137, "reward_after_std": 0.971061497926712, "reward_before_mean": 1.1805765418102965, "reward_before_std": 0.8555642701685429, "reward_change_max": 0.0, "reward_change_mean": -0.8130340054631233, "reward_change_min": -1.4749982208013535, "reward_change_std": 0.5291310101747513, "reward_std": 0.9710615314543247, "rewards/cosine_scaled_reward": 0.12153824418783188, "rewards/format_reward": 0.9375, "step": 358 }, { "advantage_max": 1.9519437849521637, "advantage_mean": 9.313226023710541e-09, "advantage_min": -0.7854745984077454, "advantage_std": 0.9998278766870499, "completion_length": 960.1666946411133, "epoch": 0.4102857142857143, "grad_norm": 0.4961448311805725, "kl": 0.017177581787109375, "lambda_div_used": 0.5, "learning_rate": 3.0097380284049523e-07, "loss": 0.0007, "reward": 0.07578269951045513, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07578269951045513, "reward_after_std": 0.6794970296323299, "reward_before_mean": 0.7309711538255215, "reward_before_std": 0.5242785401642323, "reward_change_max": 0.0, "reward_change_mean": -0.6551884561777115, "reward_change_min": -0.9929383620619774, "reward_change_std": 0.38128501921892166, "reward_std": 0.679497055709362, "rewards/cosine_scaled_reward": -0.12409775704145432, "rewards/format_reward": 0.9791666716337204, "step": 359 }, { "advantage_max": 1.921691581606865, "advantage_mean": -3.849466723160333e-08, "advantage_min": -0.8046199455857277, "advantage_std": 0.9998729974031448, "completion_length": 1303.9375381469727, "epoch": 0.4114285714285714, "grad_norm": 0.33698728680610657, "kl": 0.0240478515625, "lambda_div_used": 0.5, "learning_rate": 2.9836319343816397e-07, "loss": 0.001, "reward": 0.4048253740184009, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4048253740184009, "reward_after_std": 0.8957697823643684, "reward_before_mean": 1.270486131310463, "reward_before_std": 0.7351746968924999, "reward_change_max": 0.0, "reward_change_mean": -0.865660771727562, "reward_change_min": -1.4209284782409668, "reward_change_std": 0.535039871931076, "reward_std": 0.895769789814949, "rewards/cosine_scaled_reward": 0.14565971928823274, "rewards/format_reward": 0.9791666716337204, "step": 360 }, { "advantage_max": 1.9461275935173035, "advantage_mean": -2.0178656301439446e-08, "advantage_min": -0.6842218115925789, "advantage_std": 0.9998639598488808, "completion_length": 1284.0625305175781, "epoch": 0.4125714285714286, "grad_norm": 0.2980879545211792, "kl": 0.020538330078125, "lambda_div_used": 0.5, "learning_rate": 2.9576484845877793e-07, "loss": 0.0008, "reward": 0.20743734575808048, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20743734575808048, "reward_after_std": 0.8618561178445816, "reward_before_mean": 0.9113494399935007, "reward_before_std": 0.6808569990098476, "reward_change_max": 0.0015008747577667236, "reward_change_mean": -0.7039121389389038, "reward_change_min": -1.143224611878395, "reward_change_std": 0.4279782176017761, "reward_std": 0.8618561401963234, "rewards/cosine_scaled_reward": -0.013075282797217369, "rewards/format_reward": 0.9375000074505806, "step": 361 }, { "advantage_max": 1.9330978840589523, "advantage_mean": 1.8626450937198058e-09, "advantage_min": -0.8092173300683498, "advantage_std": 0.9998206868767738, "completion_length": 880.7500190734863, "epoch": 0.4137142857142857, "grad_norm": 0.3951859474182129, "kl": 0.01638031005859375, "lambda_div_used": 0.5, "learning_rate": 2.931788945420058e-07, "loss": 0.0007, "reward": 0.1559063233435154, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1559063233435154, "reward_after_std": 0.647941593080759, "reward_before_mean": 0.8897985154762864, "reward_before_std": 0.4787643002346158, "reward_change_max": 0.0, "reward_change_mean": -0.7338921837508678, "reward_change_min": -1.1151539906859398, "reward_change_std": 0.4269598387181759, "reward_std": 0.6479416117072105, "rewards/cosine_scaled_reward": -0.023850757628679276, "rewards/format_reward": 0.9375, "step": 362 }, { "advantage_max": 1.9025491178035736, "advantage_mean": -3.476937759927523e-08, "advantage_min": -0.7924175783991814, "advantage_std": 0.9998592659831047, "completion_length": 936.8750152587891, "epoch": 0.41485714285714287, "grad_norm": 0.35128331184387207, "kl": 0.013484954833984375, "lambda_div_used": 0.5, "learning_rate": 2.9060545772359305e-07, "loss": 0.0005, "reward": 0.3590309312567115, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3590309312567115, "reward_after_std": 0.7893019616603851, "reward_before_mean": 1.216321088373661, "reward_before_std": 0.6109752170741558, "reward_change_max": 0.0, "reward_change_mean": -0.8572902157902718, "reward_change_min": -1.3023911118507385, "reward_change_std": 0.5073912441730499, "reward_std": 0.7893019765615463, "rewards/cosine_scaled_reward": 0.11857721768319607, "rewards/format_reward": 0.9791666716337204, "step": 363 }, { "advantage_max": 1.9401460587978363, "advantage_mean": 1.3969839451899446e-09, "advantage_min": -0.7830497920513153, "advantage_std": 0.9998208284378052, "completion_length": 1427.3542098999023, "epoch": 0.416, "grad_norm": 0.41800156235694885, "kl": 0.02063751220703125, "lambda_div_used": 0.5, "learning_rate": 2.8804466342921987e-07, "loss": 0.0008, "reward": -0.08546914509497583, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08546914509497583, "reward_after_std": 0.6147534213960171, "reward_before_mean": 0.45134393498301506, "reward_before_std": 0.49978536926209927, "reward_change_max": 0.0, "reward_change_mean": -0.5368130728602409, "reward_change_min": -0.8912122398614883, "reward_change_std": 0.32553112506866455, "reward_std": 0.6147534511983395, "rewards/cosine_scaled_reward": -0.2326613813638687, "rewards/format_reward": 0.916666679084301, "step": 364 }, { "advantage_max": 1.9052964746952057, "advantage_mean": -1.2417632477834672e-09, "advantage_min": -0.7776142209768295, "advantage_std": 0.9998699054121971, "completion_length": 1549.3750381469727, "epoch": 0.41714285714285715, "grad_norm": 0.5369855165481567, "kl": 0.035430908203125, "lambda_div_used": 0.5, "learning_rate": 2.854966364683872e-07, "loss": 0.0014, "reward": 0.005877653602510691, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.005877653602510691, "reward_after_std": 0.8726274520158768, "reward_before_mean": 0.5426001232117414, "reward_before_std": 0.8404004983603954, "reward_change_max": 0.0007102638483047485, "reward_change_mean": -0.5367224644869566, "reward_change_min": -1.1121207065880299, "reward_change_std": 0.41939173452556133, "reward_std": 0.8726274818181992, "rewards/cosine_scaled_reward": -0.0932832807302475, "rewards/format_reward": 0.7291666697710752, "step": 365 }, { "advantage_max": 1.9043861776590347, "advantage_mean": 6.208818126296478e-09, "advantage_min": -0.8616622500121593, "advantage_std": 0.9998663142323494, "completion_length": 1251.1667022705078, "epoch": 0.41828571428571426, "grad_norm": 0.3164905607700348, "kl": 0.0154266357421875, "lambda_div_used": 0.5, "learning_rate": 2.829615010283344e-07, "loss": 0.0006, "reward": 0.32734458870254457, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.32734458870254457, "reward_after_std": 0.8601884730160236, "reward_before_mean": 1.138159309513867, "reward_before_std": 0.7124536950141191, "reward_change_max": 0.0, "reward_change_mean": -0.8108147121965885, "reward_change_min": -1.284881740808487, "reward_change_std": 0.5022755339741707, "reward_std": 0.8601884730160236, "rewards/cosine_scaled_reward": 0.10032964125275612, "rewards/format_reward": 0.9375000074505806, "step": 366 }, { "advantage_max": 1.9257488250732422, "advantage_mean": -1.8626452602532595e-09, "advantage_min": -0.8024434819817543, "advantage_std": 0.9998431578278542, "completion_length": 1324.3958892822266, "epoch": 0.41942857142857143, "grad_norm": 0.3114745616912842, "kl": 0.015369415283203125, "lambda_div_used": 0.5, "learning_rate": 2.8043938066798645e-07, "loss": 0.0006, "reward": 0.008062966400757432, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.008062966400757432, "reward_after_std": 0.7330645881593227, "reward_before_mean": 0.593254167586565, "reward_before_std": 0.6409398391842842, "reward_change_max": 0.0, "reward_change_mean": -0.5851911939680576, "reward_change_min": -1.088577315211296, "reward_change_std": 0.3858226127922535, "reward_std": 0.7330645956099033, "rewards/cosine_scaled_reward": -0.16170626878738403, "rewards/format_reward": 0.916666679084301, "step": 367 }, { "advantage_max": 1.961761862039566, "advantage_mean": -6.208817127095756e-09, "advantage_min": -0.6810972690582275, "advantage_std": 0.9998715221881866, "completion_length": 1611.4792098999023, "epoch": 0.4205714285714286, "grad_norm": 0.6100217700004578, "kl": 0.026096343994140625, "lambda_div_used": 0.5, "learning_rate": 2.7793039831193133e-07, "loss": 0.001, "reward": 0.0674855774268508, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0674855774268508, "reward_after_std": 0.9106010496616364, "reward_before_mean": 0.6422249140887288, "reward_before_std": 0.7696337550878525, "reward_change_max": 0.0, "reward_change_mean": -0.5747393220663071, "reward_change_min": -1.0176613926887512, "reward_change_std": 0.3697497956454754, "reward_std": 0.9106010720133781, "rewards/cosine_scaled_reward": -0.10597089910879731, "rewards/format_reward": 0.8541666679084301, "step": 368 }, { "advantage_max": 1.9500256478786469, "advantage_mean": 1.7074247571358114e-09, "advantage_min": -0.7334987670183182, "advantage_std": 0.9998728185892105, "completion_length": 1466.7291946411133, "epoch": 0.4217142857142857, "grad_norm": 0.48714351654052734, "kl": 0.028980255126953125, "lambda_div_used": 0.5, "learning_rate": 2.7543467624442956e-07, "loss": 0.0012, "reward": 0.13486522855237126, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13486522855237126, "reward_after_std": 0.9019280709326267, "reward_before_mean": 0.7667449675500393, "reward_before_std": 0.7737538255751133, "reward_change_max": 0.0004113316535949707, "reward_change_mean": -0.6318797618150711, "reward_change_min": -1.1101751253008842, "reward_change_std": 0.41809099167585373, "reward_std": 0.9019280709326267, "rewards/cosine_scaled_reward": -0.06454417761415243, "rewards/format_reward": 0.8958333432674408, "step": 369 }, { "advantage_max": 2.000591605901718, "advantage_mean": -9.934107314535368e-09, "advantage_min": -0.6458289884030819, "advantage_std": 0.9998491033911705, "completion_length": 1292.6667213439941, "epoch": 0.4228571428571429, "grad_norm": 0.3272714614868164, "kl": 0.024097442626953125, "lambda_div_used": 0.5, "learning_rate": 2.729523361034538e-07, "loss": 0.001, "reward": 0.11595443380065262, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11595443380065262, "reward_after_std": 0.7612169794738293, "reward_before_mean": 0.7688500918447971, "reward_before_std": 0.5021704901009798, "reward_change_max": 0.0, "reward_change_mean": -0.6528956815600395, "reward_change_min": -0.9266734272241592, "reward_change_std": 0.3518510889261961, "reward_std": 0.7612170018255711, "rewards/cosine_scaled_reward": -0.07390830665826797, "rewards/format_reward": 0.9166666865348816, "step": 370 }, { "advantage_max": 1.9721637219190598, "advantage_mean": -2.7318796669284495e-08, "advantage_min": -0.6613831929862499, "advantage_std": 0.9998557940125465, "completion_length": 752.2500305175781, "epoch": 0.424, "grad_norm": 0.3771182894706726, "kl": 0.013050079345703125, "lambda_div_used": 0.5, "learning_rate": 2.7048349887476037e-07, "loss": 0.0005, "reward": 0.4583681761287153, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4583681761287153, "reward_after_std": 0.8134524710476398, "reward_before_mean": 1.3909370601177216, "reward_before_std": 0.5117593847680837, "reward_change_max": 0.0, "reward_change_mean": -0.9325688779354095, "reward_change_min": -1.3584751039743423, "reward_change_std": 0.524842644110322, "reward_std": 0.8134525120258331, "rewards/cosine_scaled_reward": 0.2058851895853877, "rewards/format_reward": 0.9791666716337204, "step": 371 }, { "advantage_max": 1.9488580971956253, "advantage_mean": -8.692344177774203e-09, "advantage_min": -0.7659207582473755, "advantage_std": 0.9998446479439735, "completion_length": 1342.9375381469727, "epoch": 0.42514285714285716, "grad_norm": 0.240267813205719, "kl": 0.01468658447265625, "lambda_div_used": 0.5, "learning_rate": 2.6802828488599294e-07, "loss": 0.0006, "reward": 0.24963407404720783, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24963407404720783, "reward_after_std": 0.7213221676647663, "reward_before_mean": 1.0393864251673222, "reward_before_std": 0.5199251472949982, "reward_change_max": 0.0, "reward_change_mean": -0.7897523567080498, "reward_change_min": -1.1659668758511543, "reward_change_std": 0.4499006439000368, "reward_std": 0.7213221788406372, "rewards/cosine_scaled_reward": 0.030109863728284836, "rewards/format_reward": 0.9791666716337204, "step": 372 }, { "advantage_max": 1.958022728562355, "advantage_mean": -3.104408563547878e-09, "advantage_min": -0.677016519010067, "advantage_std": 0.99983249604702, "completion_length": 759.9166946411133, "epoch": 0.42628571428571427, "grad_norm": 0.37422969937324524, "kl": 0.011119842529296875, "lambda_div_used": 0.5, "learning_rate": 2.655868138008171e-07, "loss": 0.0004, "reward": 0.19378744415007532, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19378744415007532, "reward_after_std": 0.8117514979094267, "reward_before_mean": 0.8973379731178284, "reward_before_std": 0.6298465020954609, "reward_change_max": 0.0, "reward_change_mean": -0.7035505324602127, "reward_change_min": -1.1113643571734428, "reward_change_std": 0.41324375942349434, "reward_std": 0.811751514673233, "rewards/cosine_scaled_reward": -0.05133102275431156, "rewards/format_reward": 1.0, "step": 373 }, { "advantage_max": 1.9328200817108154, "advantage_mean": -7.140139812733537e-09, "advantage_min": -0.865662969648838, "advantage_std": 0.9998394921422005, "completion_length": 1035.5416870117188, "epoch": 0.42742857142857144, "grad_norm": 0.3418057858943939, "kl": 0.011562347412109375, "lambda_div_used": 0.5, "learning_rate": 2.631592046130896e-07, "loss": 0.0005, "reward": 0.2122154445387423, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2122154445387423, "reward_after_std": 0.729767944663763, "reward_before_mean": 0.9647579044103622, "reward_before_std": 0.5690576434135437, "reward_change_max": 0.0, "reward_change_mean": -0.7525424435734749, "reward_change_min": -1.1482946649193764, "reward_change_std": 0.44476964697241783, "reward_std": 0.7297679595649242, "rewards/cosine_scaled_reward": -0.007204409688711166, "rewards/format_reward": 0.9791666716337204, "step": 374 }, { "advantage_max": 1.9218790829181671, "advantage_mean": -6.208817238118058e-09, "advantage_min": -0.7422648146748543, "advantage_std": 0.9998613074421883, "completion_length": 1885.0625228881836, "epoch": 0.42857142857142855, "grad_norm": 0.36158016324043274, "kl": 0.052459716796875, "lambda_div_used": 0.5, "learning_rate": 2.6074557564105724e-07, "loss": 0.0021, "reward": 0.16519121266901493, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16519121266901493, "reward_after_std": 0.9700677208602428, "reward_before_mean": 0.7985913008451462, "reward_before_std": 0.8461023792624474, "reward_change_max": 0.0019219592213630676, "reward_change_mean": -0.6334001235663891, "reward_change_min": -1.1537334434688091, "reward_change_std": 0.44267112016677856, "reward_std": 0.9700677394866943, "rewards/cosine_scaled_reward": 0.08679564902558923, "rewards/format_reward": 0.6250000074505806, "step": 375 }, { "advantage_max": 1.9197021126747131, "advantage_mean": -6.208817238118058e-09, "advantage_min": -0.8161247000098228, "advantage_std": 0.999818779528141, "completion_length": 1328.5625534057617, "epoch": 0.4297142857142857, "grad_norm": 0.3313137888908386, "kl": 0.02817535400390625, "lambda_div_used": 0.5, "learning_rate": 2.583460445215911e-07, "loss": 0.0011, "reward": 0.03532506921328604, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.03532506921328604, "reward_after_std": 0.5968778096139431, "reward_before_mean": 0.6952200355008245, "reward_before_std": 0.48110751807689667, "reward_change_max": 0.0, "reward_change_mean": -0.6598949693143368, "reward_change_min": -1.0436818599700928, "reward_change_std": 0.3981757313013077, "reward_std": 0.5968778170645237, "rewards/cosine_scaled_reward": -0.10030667018145323, "rewards/format_reward": 0.8958333358168602, "step": 376 }, { "advantage_max": 1.8938241600990295, "advantage_mean": 1.2728075482471013e-08, "advantage_min": -0.8486315608024597, "advantage_std": 0.9998703300952911, "completion_length": 1734.0209121704102, "epoch": 0.4308571428571429, "grad_norm": 0.3574160933494568, "kl": 0.041107177734375, "lambda_div_used": 0.5, "learning_rate": 2.5596072820445254e-07, "loss": 0.0016, "reward": 0.17134802043437958, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17134802043437958, "reward_after_std": 0.9034502916038036, "reward_before_mean": 0.8413319541141391, "reward_before_std": 0.8715251758694649, "reward_change_max": 0.0, "reward_change_mean": -0.6699839308857918, "reward_change_min": -1.2517807893455029, "reward_change_std": 0.4853241294622421, "reward_std": 0.9034503139555454, "rewards/cosine_scaled_reward": -0.006417365744709969, "rewards/format_reward": 0.854166679084301, "step": 377 }, { "advantage_max": 1.9512610882520676, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.673358790576458, "advantage_std": 0.999885655939579, "completion_length": 1136.0000228881836, "epoch": 0.432, "grad_norm": 0.3083365559577942, "kl": 0.016523361206054688, "lambda_div_used": 0.5, "learning_rate": 2.5358974294659373e-07, "loss": 0.0007, "reward": 0.431393014267087, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.431393014267087, "reward_after_std": 1.0133287981152534, "reward_before_mean": 1.2786421403288841, "reward_before_std": 0.8173676989972591, "reward_change_max": 0.0, "reward_change_mean": -0.8472491428256035, "reward_change_min": -1.4520145133137703, "reward_change_std": 0.5355722364038229, "reward_std": 1.0133288130164146, "rewards/cosine_scaled_reward": 0.1705710692331195, "rewards/format_reward": 0.9375, "step": 378 }, { "advantage_max": 1.9463636577129364, "advantage_mean": 3.725290853573426e-09, "advantage_min": -0.7703254446387291, "advantage_std": 0.9998480081558228, "completion_length": 1578.1875457763672, "epoch": 0.43314285714285716, "grad_norm": 0.33663100004196167, "kl": 0.03569793701171875, "lambda_div_used": 0.5, "learning_rate": 2.512332043064913e-07, "loss": 0.0014, "reward": 0.18515793047845364, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18515793047845364, "reward_after_std": 0.7468986734747887, "reward_before_mean": 0.9092491827905178, "reward_before_std": 0.5696240924298763, "reward_change_max": 0.0, "reward_change_mean": -0.724091213196516, "reward_change_min": -1.1145296394824982, "reward_change_std": 0.4156108219176531, "reward_std": 0.7468986958265305, "rewards/cosine_scaled_reward": -0.014125420711934566, "rewards/format_reward": 0.9375000074505806, "step": 379 }, { "advantage_max": 1.9473781883716583, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.7745387181639671, "advantage_std": 0.9998561814427376, "completion_length": 1598.6875228881836, "epoch": 0.4342857142857143, "grad_norm": 0.3826024532318115, "kl": 0.038959503173828125, "lambda_div_used": 0.5, "learning_rate": 2.488912271385139e-07, "loss": 0.0016, "reward": 0.168661929666996, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.168661929666996, "reward_after_std": 0.8322513662278652, "reward_before_mean": 0.8563592098653316, "reward_before_std": 0.6955513991415501, "reward_change_max": 0.0005630478262901306, "reward_change_mean": -0.6876972541213036, "reward_change_min": -1.0310806632041931, "reward_change_std": 0.4104973468929529, "reward_std": 0.8322513960301876, "rewards/cosine_scaled_reward": 0.0010962523519992828, "rewards/format_reward": 0.8541666772216558, "step": 380 }, { "advantage_max": 1.8988536298274994, "advantage_mean": -1.7384688355548406e-08, "advantage_min": -0.9226270765066147, "advantage_std": 0.9998243972659111, "completion_length": 1515.3542175292969, "epoch": 0.43542857142857144, "grad_norm": 0.4202130138874054, "kl": 0.038272857666015625, "lambda_div_used": 0.5, "learning_rate": 2.465639255873246e-07, "loss": 0.0015, "reward": -0.0928139602765441, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0928139602765441, "reward_after_std": 0.6203138083219528, "reward_before_mean": 0.4476607348769903, "reward_before_std": 0.5513483323156834, "reward_change_max": 0.0, "reward_change_mean": -0.5404747053980827, "reward_change_min": -0.9399674460291862, "reward_change_std": 0.35836709290742874, "reward_std": 0.6203138120472431, "rewards/cosine_scaled_reward": -0.19283631443977356, "rewards/format_reward": 0.8333333507180214, "step": 381 }, { "advantage_max": 1.9759876430034637, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.6738567687571049, "advantage_std": 0.999846376478672, "completion_length": 897.5833549499512, "epoch": 0.43657142857142855, "grad_norm": 0.30360478162765503, "kl": 0.010141372680664062, "lambda_div_used": 0.5, "learning_rate": 2.4425141308231765e-07, "loss": 0.0004, "reward": 0.05183810880407691, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.05183810880407691, "reward_after_std": 0.7484404519200325, "reward_before_mean": 0.6674462854862213, "reward_before_std": 0.5618630647659302, "reward_change_max": 0.0, "reward_change_mean": -0.6156081818044186, "reward_change_min": -0.9522461071610451, "reward_change_std": 0.34806813672184944, "reward_std": 0.7484404593706131, "rewards/cosine_scaled_reward": -0.16627686785068363, "rewards/format_reward": 1.0, "step": 382 }, { "advantage_max": 1.9031931459903717, "advantage_mean": -1.3038516488705909e-08, "advantage_min": -0.7755768671631813, "advantage_std": 0.999890647828579, "completion_length": 1310.645881652832, "epoch": 0.4377142857142857, "grad_norm": 0.4743720591068268, "kl": 0.034099578857421875, "lambda_div_used": 0.5, "learning_rate": 2.4195380233209006e-07, "loss": 0.0014, "reward": 0.3746328540146351, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3746328540146351, "reward_after_std": 1.0508150830864906, "reward_before_mean": 1.163354642689228, "reward_before_std": 0.9778174087405205, "reward_change_max": 0.0, "reward_change_mean": -0.7887218073010445, "reward_change_min": -1.4353682398796082, "reward_change_std": 0.5492018274962902, "reward_std": 1.0508150979876518, "rewards/cosine_scaled_reward": 0.1650106585584581, "rewards/format_reward": 0.8333333358168602, "step": 383 }, { "advantage_max": 1.9253188371658325, "advantage_mean": -3.414849525373853e-08, "advantage_min": -0.8059746026992798, "advantage_std": 0.9998823553323746, "completion_length": 1079.7708740234375, "epoch": 0.43885714285714283, "grad_norm": 0.352461576461792, "kl": 0.0101470947265625, "lambda_div_used": 0.5, "learning_rate": 2.3967120531894857e-07, "loss": 0.0004, "reward": 0.5411808973294683, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5411808973294683, "reward_after_std": 0.9512509405612946, "reward_before_mean": 1.5009002909064293, "reward_before_std": 0.7456874251365662, "reward_change_max": 0.008606597781181335, "reward_change_mean": -0.9597193524241447, "reward_change_min": -1.4905616790056229, "reward_change_std": 0.5804052986204624, "reward_std": 0.9512509629130363, "rewards/cosine_scaled_reward": 0.2712834384292364, "rewards/format_reward": 0.9583333432674408, "step": 384 }, { "advantage_max": 1.9382314532995224, "advantage_mean": 1.2417634531747268e-08, "advantage_min": -0.7810436561703682, "advantage_std": 0.9998589232563972, "completion_length": 1364.6666946411133, "epoch": 0.44, "grad_norm": 0.3704834282398224, "kl": 0.019073486328125, "lambda_div_used": 0.5, "learning_rate": 2.374037332934512e-07, "loss": 0.0008, "reward": 0.13947630883194506, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.13947630883194506, "reward_after_std": 0.8770334720611572, "reward_before_mean": 0.7803040593862534, "reward_before_std": 0.7611396610736847, "reward_change_max": 0.0, "reward_change_mean": -0.6408277489244938, "reward_change_min": -1.1368419975042343, "reward_change_std": 0.4232936166226864, "reward_std": 0.8770334757864475, "rewards/cosine_scaled_reward": -0.07859798357822001, "rewards/format_reward": 0.9375000074505806, "step": 385 }, { "advantage_max": 1.9047647416591644, "advantage_mean": 4.346172088887101e-09, "advantage_min": -0.8413906320929527, "advantage_std": 0.9998516365885735, "completion_length": 1433.645866394043, "epoch": 0.44114285714285717, "grad_norm": 0.41787880659103394, "kl": 0.04235076904296875, "lambda_div_used": 0.5, "learning_rate": 2.3515149676898552e-07, "loss": 0.0017, "reward": 0.30202385812299326, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.30202385812299326, "reward_after_std": 0.7839195318520069, "reward_before_mean": 1.120539478957653, "reward_before_std": 0.6518189385533333, "reward_change_max": 0.0, "reward_change_mean": -0.8185156136751175, "reward_change_min": -1.297236330807209, "reward_change_std": 0.512270912528038, "reward_std": 0.7839195430278778, "rewards/cosine_scaled_reward": 0.08110305480659008, "rewards/format_reward": 0.9583333432674408, "step": 386 }, { "advantage_max": 1.9116900265216827, "advantage_mean": 8.07146305348283e-09, "advantage_min": -0.8842339888215065, "advantage_std": 0.9998463243246078, "completion_length": 1939.6041946411133, "epoch": 0.4422857142857143, "grad_norm": 0.5493029952049255, "kl": 0.0702667236328125, "lambda_div_used": 0.5, "learning_rate": 2.3291460551638237e-07, "loss": 0.0028, "reward": 0.05763331870548427, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05763331870548427, "reward_after_std": 0.7158108092844486, "reward_before_mean": 0.6914464961737394, "reward_before_std": 0.6264186557382345, "reward_change_max": 0.000324346125125885, "reward_change_mean": -0.6338131837546825, "reward_change_min": -1.006659995764494, "reward_change_std": 0.4054126776754856, "reward_std": 0.7158108279109001, "rewards/cosine_scaled_reward": 0.0019732341170310974, "rewards/format_reward": 0.6875000167638063, "step": 387 }, { "advantage_max": 1.8982749581336975, "advantage_mean": -6.208817349140361e-10, "advantage_min": -0.867442212998867, "advantage_std": 0.9998432621359825, "completion_length": 1462.5833740234375, "epoch": 0.44342857142857145, "grad_norm": 0.3895389437675476, "kl": 0.03160667419433594, "lambda_div_used": 0.5, "learning_rate": 2.306931685585657e-07, "loss": 0.0013, "reward": 0.0997003959491849, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0997003959491849, "reward_after_std": 0.7914495207369328, "reward_before_mean": 0.7393311122432351, "reward_before_std": 0.7161987256258726, "reward_change_max": 0.002636954188346863, "reward_change_mean": -0.6396307125687599, "reward_change_min": -1.1351251155138016, "reward_change_std": 0.4340355843305588, "reward_std": 0.7914495468139648, "rewards/cosine_scaled_reward": -0.0574177885428071, "rewards/format_reward": 0.8541666753590107, "step": 388 }, { "advantage_max": 1.960044041275978, "advantage_mean": -1.6142925107764938e-08, "advantage_min": -0.6981809362769127, "advantage_std": 0.9998580366373062, "completion_length": 1519.270881652832, "epoch": 0.44457142857142856, "grad_norm": 0.3216893970966339, "kl": 0.034030914306640625, "lambda_div_used": 0.5, "learning_rate": 2.2848729416523859e-07, "loss": 0.0014, "reward": 0.06332766944251489, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06332766944251489, "reward_after_std": 0.875417023897171, "reward_before_mean": 0.6491998098790646, "reward_before_std": 0.7423716522753239, "reward_change_max": 0.0, "reward_change_mean": -0.5858721435070038, "reward_change_min": -1.0857658833265305, "reward_change_std": 0.3880383335053921, "reward_std": 0.8754170686006546, "rewards/cosine_scaled_reward": -0.1337334355339408, "rewards/format_reward": 0.9166666716337204, "step": 389 }, { "advantage_max": 1.9449383169412613, "advantage_mean": 2.220446049250313e-16, "advantage_min": -0.7815985605120659, "advantage_std": 0.9998255670070648, "completion_length": 1706.4792098999023, "epoch": 0.44571428571428573, "grad_norm": 0.25515004992485046, "kl": 0.037689208984375, "lambda_div_used": 0.5, "learning_rate": 2.2629708984760706e-07, "loss": 0.0015, "reward": 0.08829102944582701, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08829102944582701, "reward_after_std": 0.7169809453189373, "reward_before_mean": 0.7542915940284729, "reward_before_std": 0.5898414496332407, "reward_change_max": 0.0, "reward_change_mean": -0.6660005636513233, "reward_change_min": -1.0728442445397377, "reward_change_std": 0.40890334732830524, "reward_std": 0.7169809490442276, "rewards/cosine_scaled_reward": -0.029104202054440975, "rewards/format_reward": 0.8125, "step": 390 }, { "advantage_max": 1.9109623730182648, "advantage_mean": -6.2088170160734535e-09, "advantage_min": -0.8808049410581589, "advantage_std": 0.9998817071318626, "completion_length": 1412.6250267028809, "epoch": 0.44685714285714284, "grad_norm": 0.6990765929222107, "kl": 0.051448822021484375, "lambda_div_used": 0.5, "learning_rate": 2.2412266235313973e-07, "loss": 0.0021, "reward": 0.1668861098587513, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1668861098587513, "reward_after_std": 0.9577151201665401, "reward_before_mean": 0.8118191917892545, "reward_before_std": 0.8829915300011635, "reward_change_max": 0.0, "reward_change_mean": -0.644933145493269, "reward_change_min": -1.16757021099329, "reward_change_std": 0.455503998324275, "reward_std": 0.9577151350677013, "rewards/cosine_scaled_reward": -0.010757071897387505, "rewards/format_reward": 0.8333333432674408, "step": 391 }, { "advantage_max": 1.8998601883649826, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.8490258902311325, "advantage_std": 0.9998376965522766, "completion_length": 1412.666732788086, "epoch": 0.448, "grad_norm": 0.41337519884109497, "kl": 0.0423736572265625, "lambda_div_used": 0.5, "learning_rate": 2.2196411766036487e-07, "loss": 0.0017, "reward": 0.133538922178559, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.133538922178559, "reward_after_std": 0.7546857632696629, "reward_before_mean": 0.8264464624226093, "reward_before_std": 0.6899518445134163, "reward_change_max": 0.00036785751581192017, "reward_change_mean": -0.6929075047373772, "reward_change_min": -1.1742151752114296, "reward_change_std": 0.46688779070973396, "reward_std": 0.754685778170824, "rewards/cosine_scaled_reward": -0.05552679859101772, "rewards/format_reward": 0.9375000074505806, "step": 392 }, { "advantage_max": 1.9469918012619019, "advantage_mean": -9.934107536579972e-09, "advantage_min": -0.7732015550136566, "advantage_std": 0.9998616725206375, "completion_length": 1526.2500305175781, "epoch": 0.4491428571428571, "grad_norm": 0.5304766893386841, "kl": 0.03658294677734375, "lambda_div_used": 0.5, "learning_rate": 2.1982156097370557e-07, "loss": 0.0015, "reward": 0.11220249300822616, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11220249300822616, "reward_after_std": 0.8990034684538841, "reward_before_mean": 0.724617250263691, "reward_before_std": 0.7731786463409662, "reward_change_max": 0.0, "reward_change_mean": -0.6124147698283195, "reward_change_min": -0.9759417399764061, "reward_change_std": 0.3854426145553589, "reward_std": 0.8990034759044647, "rewards/cosine_scaled_reward": -0.0543580437079072, "rewards/format_reward": 0.8333333395421505, "step": 393 }, { "advantage_max": 1.9288842529058456, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -0.7776024453341961, "advantage_std": 0.9998523741960526, "completion_length": 1408.5625076293945, "epoch": 0.4502857142857143, "grad_norm": 0.5574933290481567, "kl": 0.03571319580078125, "lambda_div_used": 0.5, "learning_rate": 2.1769509671835223e-07, "loss": 0.0014, "reward": 0.016732539370423183, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.016732539370423183, "reward_after_std": 0.8311260640621185, "reward_before_mean": 0.5725190471857786, "reward_before_std": 0.745930090546608, "reward_change_max": 0.0, "reward_change_mean": -0.555786494165659, "reward_change_min": -0.9936203956604004, "reward_change_std": 0.37467138282954693, "reward_std": 0.8311261013150215, "rewards/cosine_scaled_reward": -0.11999049689620733, "rewards/format_reward": 0.8125000149011612, "step": 394 }, { "advantage_max": 1.8790218234062195, "advantage_mean": -5.587935614226325e-09, "advantage_min": -0.9174651429057121, "advantage_std": 0.9998451918363571, "completion_length": 1520.1041984558105, "epoch": 0.4514285714285714, "grad_norm": 0.4611024558544159, "kl": 0.04803466796875, "lambda_div_used": 0.5, "learning_rate": 2.1558482853517253e-07, "loss": 0.0019, "reward": 0.2775235758163035, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2775235758163035, "reward_after_std": 0.8445008955895901, "reward_before_mean": 1.0525891445577145, "reward_before_std": 0.7422155807726085, "reward_change_max": 0.0, "reward_change_mean": -0.7750656194984913, "reward_change_min": -1.3609160706400871, "reward_change_std": 0.5207080245018005, "reward_std": 0.8445009030401707, "rewards/cosine_scaled_reward": 0.14087790716439486, "rewards/format_reward": 0.770833333954215, "step": 395 }, { "advantage_max": 1.9562052339315414, "advantage_mean": -1.117587122845265e-08, "advantage_min": -0.7776111736893654, "advantage_std": 0.9998528361320496, "completion_length": 1210.6041870117188, "epoch": 0.45257142857142857, "grad_norm": 0.3211035430431366, "kl": 0.022783279418945312, "lambda_div_used": 0.5, "learning_rate": 2.134908592756607e-07, "loss": 0.0009, "reward": 0.15681475645396858, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15681475645396858, "reward_after_std": 0.8517627380788326, "reward_before_mean": 0.8255343623459339, "reward_before_std": 0.7159137614071369, "reward_change_max": 0.0029743388295173645, "reward_change_mean": -0.6687196530401707, "reward_change_min": -1.1430072411894798, "reward_change_std": 0.43173813447356224, "reward_std": 0.8517627380788326, "rewards/cosine_scaled_reward": -0.03514947555959225, "rewards/format_reward": 0.8958333432674408, "step": 396 }, { "advantage_max": 1.902836725115776, "advantage_mean": 7.450580596923828e-09, "advantage_min": -0.7827468067407608, "advantage_std": 0.9998227432370186, "completion_length": 1434.4375305175781, "epoch": 0.45371428571428574, "grad_norm": 0.43412744998931885, "kl": 0.033294677734375, "lambda_div_used": 0.5, "learning_rate": 2.1141329099692406e-07, "loss": 0.0013, "reward": -0.0025264378637075424, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.0025264378637075424, "reward_after_std": 0.7433805800974369, "reward_before_mean": 0.583407724276185, "reward_before_std": 0.7002998664975166, "reward_change_max": 0.0011475682258605957, "reward_change_mean": -0.5859341472387314, "reward_change_min": -1.1278854310512543, "reward_change_std": 0.4434995539486408, "reward_std": 0.743380606174469, "rewards/cosine_scaled_reward": -0.1145461443811655, "rewards/format_reward": 0.8125000037252903, "step": 397 }, { "advantage_max": 1.9327199161052704, "advantage_mean": 6.208817682207268e-09, "advantage_min": -0.7297875881195068, "advantage_std": 0.9998441785573959, "completion_length": 1530.7709121704102, "epoch": 0.45485714285714285, "grad_norm": 0.6134384274482727, "kl": 0.046314239501953125, "lambda_div_used": 0.5, "learning_rate": 2.0935222495670968e-07, "loss": 0.0019, "reward": -0.03442497365176678, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.03442497365176678, "reward_after_std": 0.7637092061340809, "reward_before_mean": 0.5058358758687973, "reward_before_std": 0.6379642691463232, "reward_change_max": 0.0020051226019859314, "reward_change_mean": -0.5402608290314674, "reward_change_min": -0.9148640409111977, "reward_change_std": 0.3525990601629019, "reward_std": 0.7637092582881451, "rewards/cosine_scaled_reward": -0.14291540812700987, "rewards/format_reward": 0.7916666716337204, "step": 398 }, { "advantage_max": 1.9521061778068542, "advantage_mean": -2.7318796558262193e-08, "advantage_min": -0.7446748539805412, "advantage_std": 0.9998859688639641, "completion_length": 1348.770866394043, "epoch": 0.456, "grad_norm": 0.33135783672332764, "kl": 0.03333282470703125, "lambda_div_used": 0.5, "learning_rate": 2.0730776160846853e-07, "loss": 0.0013, "reward": 0.31152006052434444, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.31152006052434444, "reward_after_std": 1.0044073984026909, "reward_before_mean": 1.0513861402869225, "reward_before_std": 0.8144489899277687, "reward_change_max": 0.00045236945152282715, "reward_change_mean": -0.739866092801094, "reward_change_min": -1.2442336976528168, "reward_change_std": 0.4599989354610443, "reward_std": 1.0044074207544327, "rewards/cosine_scaled_reward": 0.046526393853127956, "rewards/format_reward": 0.9583333432674408, "step": 399 }, { "advantage_max": 1.9472700208425522, "advantage_mean": -2.188608116959756e-08, "advantage_min": -0.8032184466719627, "advantage_std": 0.9998786151409149, "completion_length": 1004.2083587646484, "epoch": 0.45714285714285713, "grad_norm": 0.34391146898269653, "kl": 0.019023895263671875, "lambda_div_used": 0.5, "learning_rate": 2.0528000059645995e-07, "loss": 0.0008, "reward": 0.47748311748728156, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.47748311748728156, "reward_after_std": 0.9301084578037262, "reward_before_mean": 1.3930362164974213, "reward_before_std": 0.7088299039751291, "reward_change_max": 0.0, "reward_change_mean": -0.91555305570364, "reward_change_min": -1.4300957471132278, "reward_change_std": 0.5428194254636765, "reward_std": 0.9301084876060486, "rewards/cosine_scaled_reward": 0.20693476125597954, "rewards/format_reward": 0.9791666716337204, "step": 400 }, { "advantage_max": 1.918205127120018, "advantage_mean": 6.829699361610153e-09, "advantage_min": -0.8171031698584557, "advantage_std": 0.9998555332422256, "completion_length": 1798.8750305175781, "epoch": 0.4582857142857143, "grad_norm": 0.4476446211338043, "kl": 0.07358551025390625, "lambda_div_used": 0.5, "learning_rate": 2.032690407508949e-07, "loss": 0.0029, "reward": 0.11297351177199744, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11297351177199744, "reward_after_std": 0.825851283967495, "reward_before_mean": 0.752119667828083, "reward_before_std": 0.7276361435651779, "reward_change_max": 0.0, "reward_change_mean": -0.6391461752355099, "reward_change_min": -1.0911386832594872, "reward_change_std": 0.4222437683492899, "reward_std": 0.825851283967495, "rewards/cosine_scaled_reward": 0.0010598432272672653, "rewards/format_reward": 0.7500000111758709, "step": 401 }, { "advantage_max": 1.926381230354309, "advantage_mean": -1.373700808660061e-08, "advantage_min": -0.8074382990598679, "advantage_std": 0.9998680353164673, "completion_length": 1267.895881652832, "epoch": 0.4594285714285714, "grad_norm": 0.4774840474128723, "kl": 0.050079345703125, "lambda_div_used": 0.5, "learning_rate": 2.0127498008311922e-07, "loss": 0.002, "reward": 0.1499855355359614, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1499855355359614, "reward_after_std": 0.853604331612587, "reward_before_mean": 0.8121988624334335, "reward_before_std": 0.7147294506430626, "reward_change_max": 0.0, "reward_change_mean": -0.6622133255004883, "reward_change_min": -1.0627883076667786, "reward_change_std": 0.42420555651187897, "reward_std": 0.8536043539643288, "rewards/cosine_scaled_reward": -0.04181723203510046, "rewards/format_reward": 0.8958333507180214, "step": 402 }, { "advantage_max": 1.9298964142799377, "advantage_mean": -1.9247333560290514e-08, "advantage_min": -0.855850912630558, "advantage_std": 0.9998518601059914, "completion_length": 1122.0000305175781, "epoch": 0.4605714285714286, "grad_norm": 0.37893974781036377, "kl": 0.034740447998046875, "lambda_div_used": 0.5, "learning_rate": 1.9929791578083655e-07, "loss": 0.0014, "reward": 0.27812413964420557, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.27812413964420557, "reward_after_std": 0.7593046091496944, "reward_before_mean": 1.073563028126955, "reward_before_std": 0.5789857367053628, "reward_change_max": 0.0, "reward_change_mean": -0.79543886333704, "reward_change_min": -1.182148739695549, "reward_change_std": 0.46591442450881004, "reward_std": 0.7593046091496944, "rewards/cosine_scaled_reward": 0.057614823803305626, "rewards/format_reward": 0.9583333358168602, "step": 403 }, { "advantage_max": 1.9724492132663727, "advantage_mean": -2.6387474177935744e-08, "advantage_min": -0.7148680537939072, "advantage_std": 0.9998226389288902, "completion_length": 1216.8750457763672, "epoch": 0.4617142857142857, "grad_norm": 0.6647066473960876, "kl": 0.03928375244140625, "lambda_div_used": 0.5, "learning_rate": 1.9733794420337213e-07, "loss": 0.0016, "reward": 0.22924650724348794, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22924650724348794, "reward_after_std": 0.6133319586515427, "reward_before_mean": 1.0240289568901062, "reward_before_std": 0.336720185354352, "reward_change_max": 0.0, "reward_change_mean": -0.7947824373841286, "reward_change_min": -1.0856451392173767, "reward_change_std": 0.4123692698776722, "reward_std": 0.6133319735527039, "rewards/cosine_scaled_reward": 0.022431131452322006, "rewards/format_reward": 0.9791666716337204, "step": 404 }, { "advantage_max": 1.9340013265609741, "advantage_mean": -2.2584572989536866e-08, "advantage_min": -0.8358136937022209, "advantage_std": 0.9998725801706314, "completion_length": 1360.708396911621, "epoch": 0.46285714285714286, "grad_norm": 0.39807194471359253, "kl": 0.04998016357421875, "lambda_div_used": 0.5, "learning_rate": 1.9539516087697517e-07, "loss": 0.002, "reward": 0.4218410551548004, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.4218410551548004, "reward_after_std": 0.8566682934761047, "reward_before_mean": 1.3111839480698109, "reward_before_std": 0.6185014648362994, "reward_change_max": 0.0007586926221847534, "reward_change_mean": -0.8893428482115269, "reward_change_min": -1.2902274504303932, "reward_change_std": 0.5102062933146954, "reward_std": 0.8566683232784271, "rewards/cosine_scaled_reward": 0.18684194469824433, "rewards/format_reward": 0.9375000074505806, "step": 405 }, { "advantage_max": 1.9196098744869232, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.8075883463025093, "advantage_std": 0.9998895972967148, "completion_length": 1349.770881652832, "epoch": 0.464, "grad_norm": 0.4375143349170685, "kl": 0.03128814697265625, "lambda_div_used": 0.5, "learning_rate": 1.934696604901642e-07, "loss": 0.0013, "reward": 0.3450129013508558, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3450129013508558, "reward_after_std": 1.0195146724581718, "reward_before_mean": 1.1223999299108982, "reward_before_std": 0.8690014518797398, "reward_change_max": 0.0, "reward_change_mean": -0.777387011796236, "reward_change_min": -1.3255415260791779, "reward_change_std": 0.5084087513387203, "reward_std": 1.0195147022604942, "rewards/cosine_scaled_reward": 0.10286661703139544, "rewards/format_reward": 0.916666679084301, "step": 406 }, { "advantage_max": 1.9780168533325195, "advantage_mean": -3.539025983378963e-08, "advantage_min": -0.7044540494680405, "advantage_std": 0.9998190328478813, "completion_length": 1630.479206085205, "epoch": 0.46514285714285714, "grad_norm": 0.36417001485824585, "kl": 0.0523529052734375, "lambda_div_used": 0.5, "learning_rate": 1.915615368891117e-07, "loss": 0.0021, "reward": 0.1611375161446631, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1611375161446631, "reward_after_std": 0.6852178312838078, "reward_before_mean": 0.8822555118240416, "reward_before_std": 0.444427031558007, "reward_change_max": 0.0, "reward_change_mean": -0.7211180254817009, "reward_change_min": -1.059138998389244, "reward_change_std": 0.4020147733390331, "reward_std": 0.6852178387343884, "rewards/cosine_scaled_reward": 0.024461084976792336, "rewards/format_reward": 0.8333333432674408, "step": 407 }, { "advantage_max": 1.9174300134181976, "advantage_mean": 5.8983765094389184e-09, "advantage_min": -0.857127234339714, "advantage_std": 0.9998722821474075, "completion_length": 1546.0000610351562, "epoch": 0.4662857142857143, "grad_norm": 0.4313151240348816, "kl": 0.03270912170410156, "lambda_div_used": 0.5, "learning_rate": 1.8967088307307e-07, "loss": 0.0013, "reward": 0.07606437988579273, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07606437988579273, "reward_after_std": 0.8793220743536949, "reward_before_mean": 0.6768680065870285, "reward_before_std": 0.7902133017778397, "reward_change_max": 0.0029953643679618835, "reward_change_mean": -0.6008036248385906, "reward_change_min": -1.1326103135943413, "reward_change_std": 0.43921875581145287, "reward_std": 0.8793221041560173, "rewards/cosine_scaled_reward": -0.06781600136309862, "rewards/format_reward": 0.8125000223517418, "step": 408 }, { "advantage_max": 1.9145079553127289, "advantage_mean": 1.2417631367611648e-09, "advantage_min": -0.7866168022155762, "advantage_std": 0.9998549968004227, "completion_length": 1843.708381652832, "epoch": 0.4674285714285714, "grad_norm": 0.3521879017353058, "kl": 0.05733489990234375, "lambda_div_used": 0.5, "learning_rate": 1.8779779118983867e-07, "loss": 0.0023, "reward": 0.09330637939274311, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.09330637939274311, "reward_after_std": 0.9056757166981697, "reward_before_mean": 0.6961738504469395, "reward_before_std": 0.8515808396041393, "reward_change_max": 0.0, "reward_change_mean": -0.6028674840927124, "reward_change_min": -1.1532711759209633, "reward_change_std": 0.4412507191300392, "reward_std": 0.9056757390499115, "rewards/cosine_scaled_reward": -0.058163101435638964, "rewards/format_reward": 0.8125000055879354, "step": 409 }, { "advantage_max": 1.9500681310892105, "advantage_mean": -3.104408285992122e-09, "advantage_min": -0.6658405214548111, "advantage_std": 0.9998787567019463, "completion_length": 1263.2083587646484, "epoch": 0.4685714285714286, "grad_norm": 0.2878897488117218, "kl": 0.019916534423828125, "lambda_div_used": 0.5, "learning_rate": 1.8594235253127372e-07, "loss": 0.0008, "reward": 0.19085952546447515, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19085952546447515, "reward_after_std": 0.9523617699742317, "reward_before_mean": 0.8548973593860865, "reward_before_std": 0.7926736399531364, "reward_change_max": 0.0, "reward_change_mean": -0.6640378534793854, "reward_change_min": -1.1314915791153908, "reward_change_std": 0.42829754017293453, "reward_std": 0.9523618072271347, "rewards/cosine_scaled_reward": -0.030884657404385507, "rewards/format_reward": 0.916666679084301, "step": 410 }, { "advantage_max": 1.9501308053731918, "advantage_mean": 2.5456150520852816e-08, "advantage_min": -0.722774401307106, "advantage_std": 0.9998585432767868, "completion_length": 2099.645908355713, "epoch": 0.4697142857142857, "grad_norm": 0.4024723768234253, "kl": 0.06557846069335938, "lambda_div_used": 0.5, "learning_rate": 1.8410465752883758e-07, "loss": 0.0026, "reward": 0.04722657427191734, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04722657427191734, "reward_after_std": 0.8598898909986019, "reward_before_mean": 0.627219133079052, "reward_before_std": 0.742165463976562, "reward_change_max": 0.0003335103392601013, "reward_change_mean": -0.5799925178289413, "reward_change_min": -0.9638400673866272, "reward_change_std": 0.3880952801555395, "reward_std": 0.8598899245262146, "rewards/cosine_scaled_reward": -0.061390455812215805, "rewards/format_reward": 0.7500000055879354, "step": 411 }, { "advantage_max": 1.9316768646240234, "advantage_mean": 6.5192582443529545e-09, "advantage_min": -0.7628925666213036, "advantage_std": 0.999868169426918, "completion_length": 1022.5000228881836, "epoch": 0.47085714285714286, "grad_norm": 0.42367619276046753, "kl": 0.018037796020507812, "lambda_div_used": 0.5, "learning_rate": 1.822847957491922e-07, "loss": 0.0007, "reward": 0.1572526041418314, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1572526041418314, "reward_after_std": 0.8611396998167038, "reward_before_mean": 0.8268182594329119, "reward_before_std": 0.748228445649147, "reward_change_max": 0.0, "reward_change_mean": -0.6695656627416611, "reward_change_min": -1.220998875796795, "reward_change_std": 0.4380665756762028, "reward_std": 0.8611396998167038, "rewards/cosine_scaled_reward": -0.07617420144379139, "rewards/format_reward": 0.9791666716337204, "step": 412 }, { "advantage_max": 1.9105447828769684, "advantage_mean": -6.208817238118058e-09, "advantage_min": -0.7985327839851379, "advantage_std": 0.9998429268598557, "completion_length": 1399.7917022705078, "epoch": 0.472, "grad_norm": 0.4110771119594574, "kl": 0.03682518005371094, "lambda_div_used": 0.5, "learning_rate": 1.804828558898332e-07, "loss": 0.0015, "reward": 0.19052385329268873, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19052385329268873, "reward_after_std": 0.7446985505521297, "reward_before_mean": 0.9286044277250767, "reward_before_std": 0.6417394857853651, "reward_change_max": 0.0, "reward_change_mean": -0.738080620765686, "reward_change_min": -1.2469749003648758, "reward_change_std": 0.4651348330080509, "reward_std": 0.74469855427742, "rewards/cosine_scaled_reward": 0.00596888642758131, "rewards/format_reward": 0.9166666716337204, "step": 413 }, { "advantage_max": 1.9046124964952469, "advantage_mean": -5.587935447692871e-09, "advantage_min": -0.8260460719466209, "advantage_std": 0.9998347759246826, "completion_length": 1609.083396911621, "epoch": 0.47314285714285714, "grad_norm": 0.36283785104751587, "kl": 0.032196044921875, "lambda_div_used": 0.5, "learning_rate": 1.7869892577476722e-07, "loss": 0.0013, "reward": -0.037756118923425674, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.037756118923425674, "reward_after_std": 0.7010756619274616, "reward_before_mean": 0.5217839451506734, "reward_before_std": 0.6131279207766056, "reward_change_max": 0.00289192795753479, "reward_change_mean": -0.5595400780439377, "reward_change_min": -1.0230029672384262, "reward_change_std": 0.3680335786193609, "reward_std": 0.7010756768286228, "rewards/cosine_scaled_reward": -0.1661913748830557, "rewards/format_reward": 0.8541666753590107, "step": 414 }, { "advantage_max": 1.915537714958191, "advantage_mean": 2.6697914157214342e-08, "advantage_min": -0.7383338809013367, "advantage_std": 0.9998762533068657, "completion_length": 1897.583396911621, "epoch": 0.4742857142857143, "grad_norm": 0.7078869938850403, "kl": 0.08300018310546875, "lambda_div_used": 0.5, "learning_rate": 1.7693309235023127e-07, "loss": 0.0033, "reward": -0.02524216379970312, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.02524216379970312, "reward_after_std": 0.9051150903105736, "reward_before_mean": 0.4787615801615175, "reward_before_std": 0.8592314906418324, "reward_change_max": 0.0013706609606742859, "reward_change_mean": -0.5040037520229816, "reward_change_min": -1.025732345879078, "reward_change_std": 0.39429098181426525, "reward_std": 0.9051151052117348, "rewards/cosine_scaled_reward": -0.10436921380460262, "rewards/format_reward": 0.6875000093132257, "step": 415 }, { "advantage_max": 1.9446207731962204, "advantage_mean": -9.313227966600834e-10, "advantage_min": -0.7540571428835392, "advantage_std": 0.9998806416988373, "completion_length": 1226.6458740234375, "epoch": 0.4754285714285714, "grad_norm": 0.6419525146484375, "kl": 0.0194091796875, "lambda_div_used": 0.5, "learning_rate": 1.7518544168045524e-07, "loss": 0.0008, "reward": 0.34382338635623455, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.34382338635623455, "reward_after_std": 0.9477375037968159, "reward_before_mean": 1.1409937180578709, "reward_before_std": 0.7824789434671402, "reward_change_max": 0.0, "reward_change_mean": -0.7971703410148621, "reward_change_min": -1.2335463464260101, "reward_change_std": 0.475644176825881, "reward_std": 0.9477375410497189, "rewards/cosine_scaled_reward": 0.07049683481454849, "rewards/format_reward": 1.0, "step": 416 }, { "advantage_max": 1.9392386972904205, "advantage_mean": 3.1044089521259366e-10, "advantage_min": -0.8309417217969894, "advantage_std": 0.999857485294342, "completion_length": 1679.3125457763672, "epoch": 0.4765714285714286, "grad_norm": 0.41789889335632324, "kl": 0.03730010986328125, "lambda_div_used": 0.5, "learning_rate": 1.7345605894346726e-07, "loss": 0.0015, "reward": 0.18889883533120155, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18889883533120155, "reward_after_std": 0.8192192353308201, "reward_before_mean": 0.8952180985361338, "reward_before_std": 0.6806813403964043, "reward_change_max": 0.0, "reward_change_mean": -0.7063192613422871, "reward_change_min": -1.1340601965785027, "reward_change_std": 0.431840430945158, "reward_std": 0.8192192912101746, "rewards/cosine_scaled_reward": 0.010109039023518562, "rewards/format_reward": 0.8750000149011612, "step": 417 }, { "advantage_max": 1.9405268132686615, "advantage_mean": -3.725290520506519e-09, "advantage_min": -0.7386938184499741, "advantage_std": 0.9998854398727417, "completion_length": 1269.7500305175781, "epoch": 0.4777142857142857, "grad_norm": 0.4162716269493103, "kl": 0.0435333251953125, "lambda_div_used": 0.5, "learning_rate": 1.7174502842694212e-07, "loss": 0.0017, "reward": 0.416334574110806, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.416334574110806, "reward_after_std": 0.980750635266304, "reward_before_mean": 1.2588410302996635, "reward_before_std": 0.79679533559829, "reward_change_max": 0.0006934776902198792, "reward_change_mean": -0.8425064440816641, "reward_change_min": -1.3780925124883652, "reward_change_std": 0.5267263427376747, "reward_std": 0.980750672519207, "rewards/cosine_scaled_reward": 0.18150383047759533, "rewards/format_reward": 0.8958333395421505, "step": 418 }, { "advantage_max": 1.919666275382042, "advantage_mean": -1.4590721297835785e-08, "advantage_min": -0.7799930199980736, "advantage_std": 0.9998503774404526, "completion_length": 1480.1458740234375, "epoch": 0.47885714285714287, "grad_norm": 0.41415131092071533, "kl": 0.05619621276855469, "lambda_div_used": 0.5, "learning_rate": 1.7005243352409333e-07, "loss": 0.0022, "reward": 0.328931987285614, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.328931987285614, "reward_after_std": 0.8104345016181469, "reward_before_mean": 1.158056017011404, "reward_before_std": 0.640655167400837, "reward_change_max": 0.0, "reward_change_mean": -0.8291240483522415, "reward_change_min": -1.3252276219427586, "reward_change_std": 0.4964596051722765, "reward_std": 0.8104345090687275, "rewards/cosine_scaled_reward": 0.15194466523826122, "rewards/format_reward": 0.8541666716337204, "step": 419 }, { "advantage_max": 1.9785042852163315, "advantage_mean": -1.490116141589226e-08, "advantage_min": -0.6844378933310509, "advantage_std": 0.9998235180974007, "completion_length": 966.3333740234375, "epoch": 0.48, "grad_norm": 0.5011756420135498, "kl": 0.02069854736328125, "lambda_div_used": 0.5, "learning_rate": 1.6837835672960831e-07, "loss": 0.0008, "reward": 0.05833008675836027, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05833008675836027, "reward_after_std": 0.612760417163372, "reward_before_mean": 0.7137523256242275, "reward_before_std": 0.39219198003411293, "reward_change_max": 0.0, "reward_change_mean": -0.65542221814394, "reward_change_min": -0.9523419812321663, "reward_change_std": 0.34629401564598083, "reward_std": 0.6127604395151138, "rewards/cosine_scaled_reward": -0.14312385022640228, "rewards/format_reward": 1.0, "step": 420 }, { "advantage_max": 1.9388664364814758, "advantage_mean": 8.692344399818808e-09, "advantage_min": -0.7572106420993805, "advantage_std": 0.9998262673616409, "completion_length": 1458.2708854675293, "epoch": 0.48114285714285715, "grad_norm": 0.5800455212593079, "kl": 0.0557098388671875, "lambda_div_used": 0.5, "learning_rate": 1.6672287963562852e-07, "loss": 0.0022, "reward": -0.0324164031771943, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0324164031771943, "reward_after_std": 0.7277033217251301, "reward_before_mean": 0.5232896497473121, "reward_before_std": 0.6215553600341082, "reward_change_max": 0.0010679811239242554, "reward_change_mean": -0.5557060427963734, "reward_change_min": -0.9187875688076019, "reward_change_std": 0.3641525115817785, "reward_std": 0.7277033478021622, "rewards/cosine_scaled_reward": -0.1654385207220912, "rewards/format_reward": 0.854166679084301, "step": 421 }, { "advantage_max": 1.9458617120981216, "advantage_mean": -7.140140034778142e-09, "advantage_min": -0.7656347528100014, "advantage_std": 0.9998919069766998, "completion_length": 1688.0000457763672, "epoch": 0.48228571428571426, "grad_norm": 0.4291648864746094, "kl": 0.061893463134765625, "lambda_div_used": 0.5, "learning_rate": 1.6508608292777203e-07, "loss": 0.0025, "reward": 0.19870495703071356, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19870495703071356, "reward_after_std": 1.0406667441129684, "reward_before_mean": 0.8401019177399576, "reward_before_std": 0.8951727598905563, "reward_change_max": 0.0, "reward_change_mean": -0.6413969695568085, "reward_change_min": -1.067455343902111, "reward_change_std": 0.41564703918993473, "reward_std": 1.0406667664647102, "rewards/cosine_scaled_reward": 0.0033842832781374454, "rewards/format_reward": 0.8333333414047956, "step": 422 }, { "advantage_max": 1.9458054602146149, "advantage_mean": 1.2417634809303024e-08, "advantage_min": -0.7163062021136284, "advantage_std": 0.9998630881309509, "completion_length": 2009.3125610351562, "epoch": 0.48342857142857143, "grad_norm": 0.5385973453521729, "kl": 0.09122467041015625, "lambda_div_used": 0.5, "learning_rate": 1.6346804638120098e-07, "loss": 0.0037, "reward": -0.002161663491278887, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.002161663491278887, "reward_after_std": 0.8518885150551796, "reward_before_mean": 0.5340993963181973, "reward_before_std": 0.7732574082911015, "reward_change_max": 0.0011147409677505493, "reward_change_mean": -0.5362610556185246, "reward_change_min": -0.997920099645853, "reward_change_std": 0.3805042449384928, "reward_std": 0.8518885150551796, "rewards/cosine_scaled_reward": -0.08711698092520237, "rewards/format_reward": 0.708333345130086, "step": 423 }, { "advantage_max": 1.932769998908043, "advantage_mean": 1.552204320631745e-08, "advantage_min": -0.7818415835499763, "advantage_std": 0.9998092278838158, "completion_length": 1649.0000381469727, "epoch": 0.4845714285714286, "grad_norm": 0.43892186880111694, "kl": 0.048885345458984375, "lambda_div_used": 0.5, "learning_rate": 1.6186884885673413e-07, "loss": 0.002, "reward": -0.14956187270581722, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14956187270581722, "reward_after_std": 0.6577580161392689, "reward_before_mean": 0.3286280228057876, "reward_before_std": 0.5828495901077986, "reward_change_max": 0.0017966404557228088, "reward_change_mean": -0.47818990983068943, "reward_change_min": -0.839508980512619, "reward_change_std": 0.32613570243120193, "reward_std": 0.6577580310404301, "rewards/cosine_scaled_reward": -0.21068599075078964, "rewards/format_reward": 0.7500000074505806, "step": 424 }, { "advantage_max": 1.900831788778305, "advantage_mean": -3.166496842510469e-08, "advantage_min": -0.8162828199565411, "advantage_std": 0.9999080747365952, "completion_length": 1217.9167175292969, "epoch": 0.4857142857142857, "grad_norm": 0.2568250596523285, "kl": 0.0250244140625, "lambda_div_used": 0.5, "learning_rate": 1.6028856829700258e-07, "loss": 0.001, "reward": 0.6311993859708309, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6311993859708309, "reward_after_std": 1.1424714177846909, "reward_before_mean": 1.6125700250267982, "reward_before_std": 1.0077669620513916, "reward_change_max": 0.0, "reward_change_mean": -0.9813706278800964, "reward_change_min": -1.6586797833442688, "reward_change_std": 0.6413742937147617, "reward_std": 1.1424714773893356, "rewards/cosine_scaled_reward": 0.3271183331380598, "rewards/format_reward": 0.9583333358168602, "step": 425 }, { "advantage_max": 1.9204679131507874, "advantage_mean": 9.623666918923135e-09, "advantage_min": -0.820533998310566, "advantage_std": 0.999837689101696, "completion_length": 1694.708366394043, "epoch": 0.4868571428571429, "grad_norm": 0.6034704446792603, "kl": 0.09422683715820312, "lambda_div_used": 0.5, "learning_rate": 1.5872728172265146e-07, "loss": 0.0038, "reward": 0.1135131117189303, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1135131117189303, "reward_after_std": 0.767748273909092, "reward_before_mean": 0.7725034542381763, "reward_before_std": 0.6063254494220018, "reward_change_max": 0.0, "reward_change_mean": -0.6589903496205807, "reward_change_min": -1.0663022696971893, "reward_change_std": 0.4144749026745558, "reward_std": 0.7677483111619949, "rewards/cosine_scaled_reward": -0.01999828591942787, "rewards/format_reward": 0.812500013038516, "step": 426 }, { "advantage_max": 1.9189137816429138, "advantage_mean": -6.829698917520943e-09, "advantage_min": -0.8139993920922279, "advantage_std": 0.9998674094676971, "completion_length": 1720.7500534057617, "epoch": 0.488, "grad_norm": 0.48113617300987244, "kl": 0.04186248779296875, "lambda_div_used": 0.5, "learning_rate": 1.5718506522858572e-07, "loss": 0.0017, "reward": 0.171463415666949, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.171463415666949, "reward_after_std": 0.9078935757279396, "reward_before_mean": 0.8412778452038765, "reward_before_std": 0.8320760056376457, "reward_change_max": 0.0, "reward_change_mean": -0.6698144376277924, "reward_change_min": -1.2227418944239616, "reward_change_std": 0.46701946668326855, "reward_std": 0.9078936129808426, "rewards/cosine_scaled_reward": 0.014388916082680225, "rewards/format_reward": 0.8125000186264515, "step": 427 }, { "advantage_max": 1.937966212630272, "advantage_mean": 6.208817349140361e-09, "advantage_min": -0.7897857651114464, "advantage_std": 0.9998549371957779, "completion_length": 1417.1041946411133, "epoch": 0.48914285714285716, "grad_norm": 0.6769260764122009, "kl": 0.032806396484375, "lambda_div_used": 0.5, "learning_rate": 1.5566199398026147e-07, "loss": 0.0013, "reward": 0.05286524537950754, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05286524537950754, "reward_after_std": 0.8130511678755283, "reward_before_mean": 0.6512714847922325, "reward_before_std": 0.6781092509627342, "reward_change_max": 0.0005588680505752563, "reward_change_mean": -0.5984062403440475, "reward_change_min": -0.9986972808837891, "reward_change_std": 0.36525629833340645, "reward_std": 0.8130511939525604, "rewards/cosine_scaled_reward": -0.12228094134479761, "rewards/format_reward": 0.8958333432674408, "step": 428 }, { "advantage_max": 1.9107749164104462, "advantage_mean": -9.313226023710541e-09, "advantage_min": -0.7637681402266026, "advantage_std": 0.9998604357242584, "completion_length": 1157.8750495910645, "epoch": 0.49028571428571427, "grad_norm": 0.4458828866481781, "kl": 0.0574798583984375, "lambda_div_used": 0.5, "learning_rate": 1.5415814221002265e-07, "loss": 0.0023, "reward": 0.16714679636061192, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16714679636061192, "reward_after_std": 0.8375038094818592, "reward_before_mean": 0.8551300168037415, "reward_before_std": 0.7358640916645527, "reward_change_max": 0.0, "reward_change_mean": -0.6879832223057747, "reward_change_min": -1.2509738504886627, "reward_change_std": 0.4568104110658169, "reward_std": 0.8375038281083107, "rewards/cosine_scaled_reward": -0.06201833672821522, "rewards/format_reward": 0.9791666716337204, "step": 429 }, { "advantage_max": 1.9214459359645844, "advantage_mean": -3.104408619059029e-09, "advantage_min": -0.7615657895803452, "advantage_std": 0.9998621419072151, "completion_length": 1377.2708892822266, "epoch": 0.49142857142857144, "grad_norm": 0.47784435749053955, "kl": 0.054538726806640625, "lambda_div_used": 0.5, "learning_rate": 1.5267358321348285e-07, "loss": 0.0022, "reward": 0.21472038747742772, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.21472038747742772, "reward_after_std": 0.8057798445224762, "reward_before_mean": 0.9536108346655965, "reward_before_std": 0.6560990251600742, "reward_change_max": 0.0, "reward_change_mean": -0.7388904243707657, "reward_change_min": -1.1853134781122208, "reward_change_std": 0.46452105045318604, "reward_std": 0.805779866874218, "rewards/cosine_scaled_reward": 0.03930539125576615, "rewards/format_reward": 0.875, "step": 430 }, { "advantage_max": 1.9473706632852554, "advantage_mean": -5.122274382429737e-09, "advantage_min": -0.7392523810267448, "advantage_std": 0.9998333901166916, "completion_length": 1522.8333740234375, "epoch": 0.49257142857142855, "grad_norm": 0.5233393907546997, "kl": 0.08725738525390625, "lambda_div_used": 0.5, "learning_rate": 1.5120838934595337e-07, "loss": 0.0035, "reward": 0.0229043357539922, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0229043357539922, "reward_after_std": 0.6961653456091881, "reward_before_mean": 0.6314683072268963, "reward_before_std": 0.5440207023639232, "reward_change_max": 0.0, "reward_change_mean": -0.6085639595985413, "reward_change_min": -1.0415047705173492, "reward_change_std": 0.36842281371355057, "reward_std": 0.6961653828620911, "rewards/cosine_scaled_reward": -0.09051587281282991, "rewards/format_reward": 0.8125, "step": 431 }, { "advantage_max": 1.8979507982730865, "advantage_mean": -3.570069795344466e-09, "advantage_min": -0.7884392961859703, "advantage_std": 0.9998320043087006, "completion_length": 1696.8541870117188, "epoch": 0.4937142857142857, "grad_norm": 0.44175800681114197, "kl": 0.06729888916015625, "lambda_div_used": 0.5, "learning_rate": 1.4976263201891613e-07, "loss": 0.0027, "reward": -0.02060939557850361, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.02060939557850361, "reward_after_std": 0.7018601484596729, "reward_before_mean": 0.5527903139591217, "reward_before_std": 0.5883015915751457, "reward_change_max": 0.00041546672582626343, "reward_change_mean": -0.573399730026722, "reward_change_min": -0.9393155761063099, "reward_change_std": 0.35436554066836834, "reward_std": 0.7018601670861244, "rewards/cosine_scaled_reward": -0.11943817464634776, "rewards/format_reward": 0.7916666716337204, "step": 432 }, { "advantage_max": 1.964208960533142, "advantage_mean": -1.1796753240922442e-08, "advantage_min": -0.6931241601705551, "advantage_std": 0.9998587071895599, "completion_length": 1315.8333587646484, "epoch": 0.4948571428571429, "grad_norm": 0.43583711981773376, "kl": 0.021282196044921875, "lambda_div_used": 0.5, "learning_rate": 1.483363816965435e-07, "loss": 0.0009, "reward": 0.28905248921364546, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.28905248921364546, "reward_after_std": 0.7677325867116451, "reward_before_mean": 1.0941534340381622, "reward_before_std": 0.504822900518775, "reward_change_max": 0.0002372339367866516, "reward_change_mean": -0.8051009066402912, "reward_change_min": -1.2095557525753975, "reward_change_std": 0.44658362865448, "reward_std": 0.7677325941622257, "rewards/cosine_scaled_reward": 0.06791000673547387, "rewards/format_reward": 0.9583333358168602, "step": 433 }, { "advantage_max": 1.90696319937706, "advantage_mean": 3.1044085080367267e-09, "advantage_min": -0.8565580695867538, "advantage_std": 0.9998314157128334, "completion_length": 1433.1042022705078, "epoch": 0.496, "grad_norm": 0.4644394516944885, "kl": 0.045490264892578125, "lambda_div_used": 0.5, "learning_rate": 1.469297078922642e-07, "loss": 0.0018, "reward": -0.05052966655057389, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05052966655057389, "reward_after_std": 0.6372621580958366, "reward_before_mean": 0.513536169193685, "reward_before_std": 0.5464147813618183, "reward_change_max": 0.0025284886360168457, "reward_change_mean": -0.5640658438205719, "reward_change_min": -0.9717100188136101, "reward_change_std": 0.3585630767047405, "reward_std": 0.6372621655464172, "rewards/cosine_scaled_reward": -0.20156525447964668, "rewards/format_reward": 0.916666679084301, "step": 434 }, { "advantage_max": 1.9291100949048996, "advantage_mean": 3.725290353973065e-09, "advantage_min": -0.8469479605555534, "advantage_std": 0.999834306538105, "completion_length": 1254.3542098999023, "epoch": 0.49714285714285716, "grad_norm": 0.5041975975036621, "kl": 0.0749053955078125, "lambda_div_used": 0.5, "learning_rate": 1.4554267916537495e-07, "loss": 0.003, "reward": 0.05097749922424555, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05097749922424555, "reward_after_std": 0.7026257328689098, "reward_before_mean": 0.6741197109222412, "reward_before_std": 0.5542252194136381, "reward_change_max": 0.0, "reward_change_mean": -0.6231421791017056, "reward_change_min": -1.0012230202555656, "reward_change_std": 0.37164881080389023, "reward_std": 0.7026257365942001, "rewards/cosine_scaled_reward": -0.14210683782584965, "rewards/format_reward": 0.9583333432674408, "step": 435 }, { "advantage_max": 1.9043562710285187, "advantage_mean": -1.6142925440831846e-08, "advantage_min": -0.829190157353878, "advantage_std": 0.9998815432190895, "completion_length": 1361.083366394043, "epoch": 0.4982857142857143, "grad_norm": 0.6900902390480042, "kl": 0.06423187255859375, "lambda_div_used": 0.5, "learning_rate": 1.4417536311769885e-07, "loss": 0.0026, "reward": 0.2773757204413414, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2773757204413414, "reward_after_std": 0.9254966825246811, "reward_before_mean": 1.0332196983508766, "reward_before_std": 0.8475972041487694, "reward_change_max": 0.0005481541156768799, "reward_change_mean": -0.7558439932763577, "reward_change_min": -1.2967700064182281, "reward_change_std": 0.5047092605382204, "reward_std": 0.9254966899752617, "rewards/cosine_scaled_reward": 0.07910984754562378, "rewards/format_reward": 0.875, "step": 436 }, { "advantage_max": 1.9277342706918716, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.8391797617077827, "advantage_std": 0.9998452290892601, "completion_length": 1271.4167251586914, "epoch": 0.49942857142857144, "grad_norm": 0.4023401141166687, "kl": 0.03212738037109375, "lambda_div_used": 0.5, "learning_rate": 1.4282782639029128e-07, "loss": 0.0013, "reward": 0.2056784473825246, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2056784473825246, "reward_after_std": 0.7642825543880463, "reward_before_mean": 0.947747528553009, "reward_before_std": 0.6257410123944283, "reward_change_max": 0.0, "reward_change_mean": -0.7420690581202507, "reward_change_min": -1.1740047186613083, "reward_change_std": 0.45448190718889236, "reward_std": 0.7642825618386269, "rewards/cosine_scaled_reward": -0.005292933899909258, "rewards/format_reward": 0.9583333432674408, "step": 437 }, { "advantage_max": 1.9327512085437775, "advantage_mean": 2.7318795781106076e-08, "advantage_min": -0.7429120875895023, "advantage_std": 0.9998285323381424, "completion_length": 1894.7500534057617, "epoch": 0.5005714285714286, "grad_norm": 0.6199660301208496, "kl": 0.08026504516601562, "lambda_div_used": 0.5, "learning_rate": 1.4150013466019114e-07, "loss": 0.0032, "reward": -0.031040742062032223, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.031040742062032223, "reward_after_std": 0.6640561632812023, "reward_before_mean": 0.5457404367625713, "reward_before_std": 0.5272583463229239, "reward_change_max": 0.0, "reward_change_mean": -0.5767811760306358, "reward_change_min": -0.9341515824198723, "reward_change_std": 0.360694108530879, "reward_std": 0.6640561930835247, "rewards/cosine_scaled_reward": -0.10212977975606918, "rewards/format_reward": 0.7500000111758709, "step": 438 }, { "advantage_max": 1.919052854180336, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.8913252726197243, "advantage_std": 0.9998451843857765, "completion_length": 1347.8542098999023, "epoch": 0.5017142857142857, "grad_norm": 0.5470811128616333, "kl": 0.0438079833984375, "lambda_div_used": 0.5, "learning_rate": 1.4019235263722034e-07, "loss": 0.0018, "reward": 0.04901101998984814, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04901101998984814, "reward_after_std": 0.7144185900688171, "reward_before_mean": 0.6771840838191565, "reward_before_std": 0.6143386028707027, "reward_change_max": 0.002023160457611084, "reward_change_mean": -0.6281730607151985, "reward_change_min": -1.0101951658725739, "reward_change_std": 0.39536717906594276, "reward_std": 0.7144186124205589, "rewards/cosine_scaled_reward": -0.09890797361731529, "rewards/format_reward": 0.8750000149011612, "step": 439 }, { "advantage_max": 1.961982324719429, "advantage_mean": 1.2728075454715437e-08, "advantage_min": -0.7568341344594955, "advantage_std": 0.9998323395848274, "completion_length": 1275.854206085205, "epoch": 0.5028571428571429, "grad_norm": 0.45573917031288147, "kl": 0.042362213134765625, "lambda_div_used": 0.5, "learning_rate": 1.3890454406082956e-07, "loss": 0.0017, "reward": -0.05465042544528842, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05465042544528842, "reward_after_std": 0.6868558041751385, "reward_before_mean": 0.4895972586236894, "reward_before_std": 0.5479109510779381, "reward_change_max": 0.0007667094469070435, "reward_change_mean": -0.5442476458847523, "reward_change_min": -0.8939982280135155, "reward_change_std": 0.3350295424461365, "reward_std": 0.6868558302521706, "rewards/cosine_scaled_reward": -0.21353472862392664, "rewards/format_reward": 0.916666679084301, "step": 440 }, { "advantage_max": 1.9711784422397614, "advantage_mean": 8.07146260939362e-09, "advantage_min": -0.6901766732335091, "advantage_std": 0.9998734667897224, "completion_length": 1920.6667175292969, "epoch": 0.504, "grad_norm": 0.773413360118866, "kl": 0.0863800048828125, "lambda_div_used": 0.5, "learning_rate": 1.3763677169699217e-07, "loss": 0.0034, "reward": 0.05949468910694122, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05949468910694122, "reward_after_std": 0.9693707525730133, "reward_before_mean": 0.6075320821255445, "reward_before_std": 0.853855162858963, "reward_change_max": 0.0, "reward_change_mean": -0.5480373837053776, "reward_change_min": -0.9951094016432762, "reward_change_std": 0.37295518442988396, "reward_std": 0.9693707972764969, "rewards/cosine_scaled_reward": -0.06081731495214626, "rewards/format_reward": 0.7291666809469461, "step": 441 }, { "advantage_max": 1.9738472253084183, "advantage_mean": -2.483526828633842e-09, "advantage_min": -0.6937413960695267, "advantage_std": 0.9998617991805077, "completion_length": 1337.0416946411133, "epoch": 0.5051428571428571, "grad_norm": 0.5474178791046143, "kl": 0.04239082336425781, "lambda_div_used": 0.5, "learning_rate": 1.3638909733514452e-07, "loss": 0.0017, "reward": 0.2079712525010109, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2079712525010109, "reward_after_std": 0.802063100039959, "reward_before_mean": 0.9304749630391598, "reward_before_std": 0.582705058157444, "reward_change_max": 0.0, "reward_change_mean": -0.7225036658346653, "reward_change_min": -1.1098268404603004, "reward_change_std": 0.41106531769037247, "reward_std": 0.8020631074905396, "rewards/cosine_scaled_reward": 0.017320780083537102, "rewards/format_reward": 0.8958333432674408, "step": 442 }, { "advantage_max": 1.9156748950481415, "advantage_mean": 1.1175870673341137e-08, "advantage_min": -0.854647807776928, "advantage_std": 0.9998498931527138, "completion_length": 1900.0834121704102, "epoch": 0.5062857142857143, "grad_norm": 0.6384137272834778, "kl": 0.07502937316894531, "lambda_div_used": 0.5, "learning_rate": 1.351615817851748e-07, "loss": 0.003, "reward": 0.0032209542114287615, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0032209542114287615, "reward_after_std": 0.7304340079426765, "reward_before_mean": 0.5884513519704342, "reward_before_std": 0.6362341642379761, "reward_change_max": 0.0014985054731369019, "reward_change_mean": -0.5852304063737392, "reward_change_min": -0.9630409777164459, "reward_change_std": 0.37647127173841, "reward_std": 0.7304340153932571, "rewards/cosine_scaled_reward": -0.08077432494610548, "rewards/format_reward": 0.7500000111758709, "step": 443 }, { "advantage_max": 1.8896929323673248, "advantage_mean": 4.035731082652205e-09, "advantage_min": -0.808488741517067, "advantage_std": 0.9998615458607674, "completion_length": 1932.8125190734863, "epoch": 0.5074285714285715, "grad_norm": 0.7674197554588318, "kl": 0.12237548828125, "lambda_div_used": 0.5, "learning_rate": 1.3395428487445914e-07, "loss": 0.0049, "reward": -0.03767237951979041, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.03767237951979041, "reward_after_std": 0.8582460209727287, "reward_before_mean": 0.47329113259911537, "reward_before_std": 0.831679854542017, "reward_change_max": 0.001330450177192688, "reward_change_mean": -0.5109634958207607, "reward_change_min": -0.9912841245532036, "reward_change_std": 0.39683387242257595, "reward_std": 0.8582460507750511, "rewards/cosine_scaled_reward": -0.10710444860160351, "rewards/format_reward": 0.6875000111758709, "step": 444 }, { "advantage_max": 1.9227261245250702, "advantage_mean": 2.7939677571531263e-08, "advantage_min": -0.8477248698472977, "advantage_std": 0.999822311103344, "completion_length": 1753.2917175292969, "epoch": 0.5085714285714286, "grad_norm": 0.4278562366962433, "kl": 0.08892059326171875, "lambda_div_used": 0.5, "learning_rate": 1.3276726544494571e-07, "loss": 0.0036, "reward": -0.11785220762249082, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11785220762249082, "reward_after_std": 0.5929725170135498, "reward_before_mean": 0.4046326084062457, "reward_before_std": 0.47818462178111076, "reward_change_max": 0.0008755475282669067, "reward_change_mean": -0.5224847923964262, "reward_change_min": -0.8103570342063904, "reward_change_std": 0.31598146446049213, "reward_std": 0.5929725207388401, "rewards/cosine_scaled_reward": -0.2039337046444416, "rewards/format_reward": 0.8125000111758709, "step": 445 }, { "advantage_max": 1.9457627087831497, "advantage_mean": -6.208817238118058e-09, "advantage_min": -0.7387218102812767, "advantage_std": 0.9998513013124466, "completion_length": 1702.1458740234375, "epoch": 0.5097142857142857, "grad_norm": 0.5779604911804199, "kl": 0.07822990417480469, "lambda_div_used": 0.5, "learning_rate": 1.316005813502869e-07, "loss": 0.0031, "reward": 0.15322877001017332, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15322877001017332, "reward_after_std": 0.7949897982180119, "reward_before_mean": 0.8317308221012354, "reward_before_std": 0.6429251153022051, "reward_change_max": 0.0012955516576766968, "reward_change_mean": -0.6785020679235458, "reward_change_min": -1.1380436643958092, "reward_change_std": 0.4298906698822975, "reward_std": 0.794989813119173, "rewards/cosine_scaled_reward": -0.0008012736216187477, "rewards/format_reward": 0.8333333395421505, "step": 446 }, { "advantage_max": 1.9582972824573517, "advantage_mean": 1.3038516266661304e-08, "advantage_min": -0.7675222232937813, "advantage_std": 0.9998466297984123, "completion_length": 1413.083396911621, "epoch": 0.5108571428571429, "grad_norm": 0.6830373406410217, "kl": 0.084716796875, "lambda_div_used": 0.5, "learning_rate": 1.3045428945301953e-07, "loss": 0.0034, "reward": 0.09648872714024037, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.09648872714024037, "reward_after_std": 0.7626160159707069, "reward_before_mean": 0.7410371452569962, "reward_before_std": 0.5982570890337229, "reward_change_max": 0.0017501115798950195, "reward_change_mean": -0.6445484086871147, "reward_change_min": -0.991340659558773, "reward_change_std": 0.38689782470464706, "reward_std": 0.7626160383224487, "rewards/cosine_scaled_reward": -0.0982314352877438, "rewards/format_reward": 0.9375000074505806, "step": 447 }, { "advantage_max": 1.9502499103546143, "advantage_mean": 1.8626452269465688e-08, "advantage_min": -0.7153580188751221, "advantage_std": 0.999836340546608, "completion_length": 1293.833366394043, "epoch": 0.512, "grad_norm": 0.7012367844581604, "kl": 0.06674957275390625, "lambda_div_used": 0.5, "learning_rate": 1.2932844562179352e-07, "loss": 0.0027, "reward": 0.12058082967996597, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12058082967996597, "reward_after_std": 0.7431465946137905, "reward_before_mean": 0.783910283818841, "reward_before_std": 0.5813306700438261, "reward_change_max": 0.0, "reward_change_mean": -0.663329441100359, "reward_change_min": -1.070579469203949, "reward_change_std": 0.40219525434076786, "reward_std": 0.7431466057896614, "rewards/cosine_scaled_reward": -0.05596153810620308, "rewards/format_reward": 0.8958333432674408, "step": 448 }, { "advantage_max": 1.9743008613586426, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.680752731859684, "advantage_std": 0.9998580440878868, "completion_length": 1440.93754196167, "epoch": 0.5131428571428571, "grad_norm": 0.8411633372306824, "kl": 0.1081695556640625, "lambda_div_used": 0.5, "learning_rate": 1.2822310472864885e-07, "loss": 0.0043, "reward": -0.0015947124920785427, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.0015947124920785427, "reward_after_std": 0.798382893204689, "reward_before_mean": 0.5481637455523014, "reward_before_std": 0.6414137668907642, "reward_change_max": 0.003213651478290558, "reward_change_mean": -0.54975844360888, "reward_change_min": -0.9199419319629669, "reward_change_std": 0.3311528917402029, "reward_std": 0.7983829081058502, "rewards/cosine_scaled_reward": -0.16341814678162336, "rewards/format_reward": 0.8750000055879354, "step": 449 }, { "advantage_max": 1.9542711079120636, "advantage_mean": 3.414849514271623e-09, "advantage_min": -0.8025156818330288, "advantage_std": 0.9998338893055916, "completion_length": 1186.5625305175781, "epoch": 0.5142857142857142, "grad_norm": 0.6107338666915894, "kl": 0.037349700927734375, "lambda_div_used": 0.5, "learning_rate": 1.2713832064634125e-07, "loss": 0.0015, "reward": 0.14779857732355595, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14779857732355595, "reward_after_std": 0.6910727322101593, "reward_before_mean": 0.8608616031706333, "reward_before_std": 0.4924622122198343, "reward_change_max": 0.0, "reward_change_mean": -0.7130630314350128, "reward_change_min": -1.0870602205395699, "reward_change_std": 0.4094325453042984, "reward_std": 0.6910727508366108, "rewards/cosine_scaled_reward": -0.038319210056215525, "rewards/format_reward": 0.9375000149011612, "step": 450 }, { "advantage_max": 1.9720109701156616, "advantage_mean": -2.173086144363623e-08, "advantage_min": -0.7459082752466202, "advantage_std": 0.999847374856472, "completion_length": 1231.645881652832, "epoch": 0.5154285714285715, "grad_norm": 0.5005615949630737, "kl": 0.066131591796875, "lambda_div_used": 0.5, "learning_rate": 1.260741462457165e-07, "loss": 0.0026, "reward": 0.06357445800676942, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.06357445800676942, "reward_after_std": 0.7245562225580215, "reward_before_mean": 0.6849512457847595, "reward_before_std": 0.5289575774222612, "reward_change_max": 0.004507869482040405, "reward_change_mean": -0.6213767826557159, "reward_change_min": -0.9343273863196373, "reward_change_std": 0.36326562985777855, "reward_std": 0.7245562374591827, "rewards/cosine_scaled_reward": -0.11585774272680283, "rewards/format_reward": 0.9166666865348816, "step": 451 }, { "advantage_max": 1.910423904657364, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.7892148867249489, "advantage_std": 0.9998941868543625, "completion_length": 1935.5208740234375, "epoch": 0.5165714285714286, "grad_norm": 0.815445065498352, "kl": 0.12743377685546875, "lambda_div_used": 0.5, "learning_rate": 1.2503063339313356e-07, "loss": 0.0051, "reward": 0.1091558001935482, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1091558001935482, "reward_after_std": 1.0659672245383263, "reward_before_mean": 0.6719845505431294, "reward_before_std": 1.0267342068254948, "reward_change_max": 0.00015228241682052612, "reward_change_mean": -0.5628287335857749, "reward_change_min": -1.069099210202694, "reward_change_std": 0.43420621007680893, "reward_std": 1.0659672319889069, "rewards/cosine_scaled_reward": 0.013075600378215313, "rewards/format_reward": 0.6458333525806665, "step": 452 }, { "advantage_max": 1.9004657417535782, "advantage_mean": 7.450581041013038e-09, "advantage_min": -0.8262704908847809, "advantage_std": 0.9998724386096001, "completion_length": 1443.791706085205, "epoch": 0.5177142857142857, "grad_norm": 0.9717079997062683, "kl": 0.076995849609375, "lambda_div_used": 0.5, "learning_rate": 1.2400783294793668e-07, "loss": 0.0031, "reward": 0.15508080273866653, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15508080273866653, "reward_after_std": 0.8549620658159256, "reward_before_mean": 0.8279006769880652, "reward_before_std": 0.7649585343897343, "reward_change_max": 0.0015206411480903625, "reward_change_mean": -0.6728198602795601, "reward_change_min": -1.1892420575022697, "reward_change_std": 0.442673247307539, "reward_std": 0.8549620807170868, "rewards/cosine_scaled_reward": -0.002716338261961937, "rewards/format_reward": 0.8333333432674408, "step": 453 }, { "advantage_max": 1.9408542066812515, "advantage_mean": 1.4280279847511679e-08, "advantage_min": -0.7722970768809319, "advantage_std": 0.9998204112052917, "completion_length": 1339.0000534057617, "epoch": 0.5188571428571429, "grad_norm": 0.5383586883544922, "kl": 0.05023193359375, "lambda_div_used": 0.5, "learning_rate": 1.2300579475997657e-07, "loss": 0.002, "reward": -0.027189110405743122, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.027189110405743122, "reward_after_std": 0.6605150178074837, "reward_before_mean": 0.5520283579826355, "reward_before_std": 0.5278488723561168, "reward_change_max": 0.0, "reward_change_mean": -0.5792174749076366, "reward_change_min": -0.9451394081115723, "reward_change_std": 0.35080388747155666, "reward_std": 0.6605150178074837, "rewards/cosine_scaled_reward": -0.18231916427612305, "rewards/format_reward": 0.916666679084301, "step": 454 }, { "advantage_max": 1.9397364258766174, "advantage_mean": -3.104408841103634e-09, "advantage_min": -0.7429406382143497, "advantage_std": 0.9998341724276543, "completion_length": 1487.0000534057617, "epoch": 0.52, "grad_norm": 0.6561453342437744, "kl": 0.07395362854003906, "lambda_div_used": 0.5, "learning_rate": 1.220245676671809e-07, "loss": 0.003, "reward": -0.03200409188866615, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03200409188866615, "reward_after_std": 0.685878798365593, "reward_before_mean": 0.5347689781337976, "reward_before_std": 0.5669304095208645, "reward_change_max": 0.00017894059419631958, "reward_change_mean": -0.5667730458080769, "reward_change_min": -0.9794808402657509, "reward_change_std": 0.3551515229046345, "reward_std": 0.6858788095414639, "rewards/cosine_scaled_reward": -0.17011553049087524, "rewards/format_reward": 0.8750000111758709, "step": 455 }, { "advantage_max": 1.9430626034736633, "advantage_mean": 1.1486311790598336e-08, "advantage_min": -0.7365816906094551, "advantage_std": 0.9998258501291275, "completion_length": 1819.020896911621, "epoch": 0.5211428571428571, "grad_norm": 0.5760971903800964, "kl": 0.11542129516601562, "lambda_div_used": 0.5, "learning_rate": 1.2106419949317388e-07, "loss": 0.0046, "reward": -0.06889674533158541, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.06889674533158541, "reward_after_std": 0.7080087102949619, "reward_before_mean": 0.45906742848455906, "reward_before_std": 0.6147968918085098, "reward_change_max": 0.0, "reward_change_mean": -0.5279641784727573, "reward_change_min": -0.9316188842058182, "reward_change_std": 0.3530385736376047, "reward_std": 0.7080087289214134, "rewards/cosine_scaled_reward": -0.1558829639106989, "rewards/format_reward": 0.770833345130086, "step": 456 }, { "advantage_max": 1.9111386984586716, "advantage_mean": -7.761022269292539e-09, "advantage_min": -0.8441537171602249, "advantage_std": 0.9998257532715797, "completion_length": 1603.3541870117188, "epoch": 0.5222857142857142, "grad_norm": 0.42133650183677673, "kl": 0.11229705810546875, "lambda_div_used": 0.5, "learning_rate": 1.2012473704494537e-07, "loss": 0.0045, "reward": 0.11794697493314743, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11794697493314743, "reward_after_std": 0.746201453730464, "reward_before_mean": 0.7967136232182384, "reward_before_std": 0.6390935992822051, "reward_change_max": 0.0001236647367477417, "reward_change_mean": -0.6787666529417038, "reward_change_min": -1.113053236156702, "reward_change_std": 0.4282074421644211, "reward_std": 0.7462014760822058, "rewards/cosine_scaled_reward": 0.0025234604254364967, "rewards/format_reward": 0.7916666697710752, "step": 457 }, { "advantage_max": 1.9392741024494171, "advantage_mean": 1.940255400789681e-09, "advantage_min": -0.7527594342827797, "advantage_std": 0.9998307451605797, "completion_length": 1405.750015258789, "epoch": 0.5234285714285715, "grad_norm": 0.8316559195518494, "kl": 0.068511962890625, "lambda_div_used": 0.5, "learning_rate": 1.1920622611056974e-07, "loss": 0.0027, "reward": 0.012192606925964355, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.012192606925964355, "reward_after_std": 0.7541647478938103, "reward_before_mean": 0.5929140914231539, "reward_before_std": 0.6398164816200733, "reward_change_max": 0.0, "reward_change_mean": -0.5807214863598347, "reward_change_min": -0.9291879385709763, "reward_change_std": 0.3721654526889324, "reward_std": 0.7541647478938103, "rewards/cosine_scaled_reward": -0.12020963057875633, "rewards/format_reward": 0.8333333395421505, "step": 458 }, { "advantage_max": 1.9433845430612564, "advantage_mean": -8.847564902936256e-09, "advantage_min": -0.7697709649801254, "advantage_std": 0.9998654574155807, "completion_length": 1161.9583587646484, "epoch": 0.5245714285714286, "grad_norm": 0.4935070276260376, "kl": 0.054248809814453125, "lambda_div_used": 0.5, "learning_rate": 1.1830871145697412e-07, "loss": 0.0022, "reward": 0.30431293696165085, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.30431293696165085, "reward_after_std": 0.8699779510498047, "reward_before_mean": 1.0904029458761215, "reward_before_std": 0.7013438567519188, "reward_change_max": 0.0, "reward_change_mean": -0.7860900089144707, "reward_change_min": -1.30374313890934, "reward_change_std": 0.4662424735724926, "reward_std": 0.8699779585003853, "rewards/cosine_scaled_reward": 0.045201453380286694, "rewards/format_reward": 1.0, "step": 459 }, { "advantage_max": 1.894415706396103, "advantage_mean": 3.3306690738754696e-16, "advantage_min": -0.8416037708520889, "advantage_std": 0.9998557269573212, "completion_length": 1902.7708740234375, "epoch": 0.5257142857142857, "grad_norm": 0.5548562407493591, "kl": 0.100677490234375, "lambda_div_used": 0.5, "learning_rate": 1.1743223682775649e-07, "loss": 0.004, "reward": 0.12215450627263635, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12215450627263635, "reward_after_std": 0.7530257254838943, "reward_before_mean": 0.8011749666184187, "reward_before_std": 0.644983071833849, "reward_change_max": 0.0003541409969329834, "reward_change_mean": -0.6790204904973507, "reward_change_min": -1.1368926838040352, "reward_change_std": 0.43394239246845245, "reward_std": 0.7530257627367973, "rewards/cosine_scaled_reward": -0.016079182736575603, "rewards/format_reward": 0.8333333507180214, "step": 460 }, { "advantage_max": 1.961041271686554, "advantage_mean": 8.692344288796505e-09, "advantage_min": -0.7419874519109726, "advantage_std": 0.9998823627829552, "completion_length": 1571.8541946411133, "epoch": 0.5268571428571428, "grad_norm": 0.7262814044952393, "kl": 0.06304168701171875, "lambda_div_used": 0.5, "learning_rate": 1.1657684494105386e-07, "loss": 0.0025, "reward": 0.22672313824295998, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22672313824295998, "reward_after_std": 0.9908672571182251, "reward_before_mean": 0.909737903624773, "reward_before_std": 0.827164052054286, "reward_change_max": 0.0015447810292243958, "reward_change_mean": -0.6830147504806519, "reward_change_min": -1.196632169187069, "reward_change_std": 0.4549138732254505, "reward_std": 0.9908672869205475, "rewards/cosine_scaled_reward": 0.038202277704840526, "rewards/format_reward": 0.8333333507180214, "step": 461 }, { "advantage_max": 1.9567998200654984, "advantage_mean": 1.3659398168108794e-08, "advantage_min": -0.7012544423341751, "advantage_std": 0.9998421967029572, "completion_length": 1640.125015258789, "epoch": 0.528, "grad_norm": 0.7010632157325745, "kl": 0.12639236450195312, "lambda_div_used": 0.5, "learning_rate": 1.1574257748745986e-07, "loss": 0.0051, "reward": -0.08827551966533065, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08827551966533065, "reward_after_std": 0.7464075610041618, "reward_before_mean": 0.4057305008172989, "reward_before_std": 0.6126368492841721, "reward_change_max": 0.0030915439128875732, "reward_change_mean": -0.4940060283988714, "reward_change_min": -0.836365569382906, "reward_change_std": 0.3115391172468662, "reward_std": 0.7464075684547424, "rewards/cosine_scaled_reward": -0.2033847626298666, "rewards/format_reward": 0.8125000055879354, "step": 462 }, { "advantage_max": 1.9140111058950424, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.7328698076307774, "advantage_std": 0.999872162938118, "completion_length": 1977.6042022705078, "epoch": 0.5291428571428571, "grad_norm": 0.8431422710418701, "kl": 0.11789321899414062, "lambda_div_used": 0.5, "learning_rate": 1.1492947512799328e-07, "loss": 0.0047, "reward": 0.08569935825653374, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08569935825653374, "reward_after_std": 0.8853895887732506, "reward_before_mean": 0.6880028424784541, "reward_before_std": 0.7817164659500122, "reward_change_max": 0.002470634877681732, "reward_change_mean": -0.6023034751415253, "reward_change_min": -1.1469271406531334, "reward_change_std": 0.4306590985506773, "reward_std": 0.8853896260261536, "rewards/cosine_scaled_reward": -0.010165270417928696, "rewards/format_reward": 0.7083333469927311, "step": 463 }, { "advantage_max": 1.9133918732404709, "advantage_mean": 9.934107536579972e-09, "advantage_min": -0.8422577381134033, "advantage_std": 0.999857671558857, "completion_length": 1337.8125228881836, "epoch": 0.5302857142857142, "grad_norm": 0.8812828660011292, "kl": 0.09333038330078125, "lambda_div_used": 0.5, "learning_rate": 1.1413757749211602e-07, "loss": 0.0037, "reward": 0.2514381390064955, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2514381390064955, "reward_after_std": 0.7524741515517235, "reward_before_mean": 1.0340424440801144, "reward_before_std": 0.5952115915715694, "reward_change_max": 0.0028984099626541138, "reward_change_mean": -0.782604280859232, "reward_change_min": -1.204802818596363, "reward_change_std": 0.47602505423128605, "reward_std": 0.7524741850793362, "rewards/cosine_scaled_reward": 0.10035453177988529, "rewards/format_reward": 0.8333333395421505, "step": 464 }, { "advantage_max": 1.9208644330501556, "advantage_mean": 5.587935447692871e-09, "advantage_min": -0.8015524484217167, "advantage_std": 0.9998502060770988, "completion_length": 1781.9375457763672, "epoch": 0.5314285714285715, "grad_norm": 0.9359230399131775, "kl": 0.13791275024414062, "lambda_div_used": 0.5, "learning_rate": 1.1336692317580158e-07, "loss": 0.0055, "reward": -0.023814965039491653, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.023814965039491653, "reward_after_std": 0.7645898275077343, "reward_before_mean": 0.5261010061949492, "reward_before_std": 0.678345825523138, "reward_change_max": 0.0007695704698562622, "reward_change_mean": -0.5499159917235374, "reward_change_min": -0.9802470579743385, "reward_change_std": 0.3772691786289215, "reward_std": 0.764589861035347, "rewards/cosine_scaled_reward": -0.12236616667360067, "rewards/format_reward": 0.7708333432674408, "step": 465 }, { "advantage_max": 1.9382754564285278, "advantage_mean": -5.587935447692871e-09, "advantage_min": -0.7668173387646675, "advantage_std": 0.9998831152915955, "completion_length": 1432.2292098999023, "epoch": 0.5325714285714286, "grad_norm": 0.9250560402870178, "kl": 0.0701141357421875, "lambda_div_used": 0.5, "learning_rate": 1.1261754973965422e-07, "loss": 0.0028, "reward": 0.3115917220711708, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3115917220711708, "reward_after_std": 0.9879258796572685, "reward_before_mean": 1.0618979572318494, "reward_before_std": 0.8279051408171654, "reward_change_max": 0.0, "reward_change_mean": -0.7503062412142754, "reward_change_min": -1.3201973289251328, "reward_change_std": 0.48962003737688065, "reward_std": 0.9879258833825588, "rewards/cosine_scaled_reward": 0.09344895218964666, "rewards/format_reward": 0.8750000074505806, "step": 466 }, { "advantage_max": 1.8915677815675735, "advantage_mean": 9.313225746154785e-09, "advantage_min": -0.8872964084148407, "advantage_std": 0.9998693838715553, "completion_length": 1769.1667251586914, "epoch": 0.5337142857142857, "grad_norm": 0.6294421553611755, "kl": 0.1210784912109375, "lambda_div_used": 0.5, "learning_rate": 1.1188949370707787e-07, "loss": 0.0048, "reward": 0.12419883778784424, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12419883778784424, "reward_after_std": 0.903032261878252, "reward_before_mean": 0.7542639700695872, "reward_before_std": 0.8407903723418713, "reward_change_max": 0.0016758441925048828, "reward_change_mean": -0.6300651095807552, "reward_change_min": -1.129418022930622, "reward_change_std": 0.4298090599477291, "reward_std": 0.9030322767794132, "rewards/cosine_scaled_reward": -0.04995136708021164, "rewards/format_reward": 0.8541666865348816, "step": 467 }, { "advantage_max": 1.9225873202085495, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.7797147929668427, "advantage_std": 0.9998800531029701, "completion_length": 1557.770881652832, "epoch": 0.5348571428571428, "grad_norm": 0.6982704401016235, "kl": 0.06484222412109375, "lambda_div_used": 0.5, "learning_rate": 1.1118279056249653e-07, "loss": 0.0026, "reward": 0.18363811075687408, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18363811075687408, "reward_after_std": 0.9357991591095924, "reward_before_mean": 0.8452462187269703, "reward_before_std": 0.8119201101362705, "reward_change_max": 0.0, "reward_change_mean": -0.6616080775856972, "reward_change_min": -1.198854148387909, "reward_change_std": 0.4518035836517811, "reward_std": 0.9357991963624954, "rewards/cosine_scaled_reward": -0.01487693004310131, "rewards/format_reward": 0.8750000149011612, "step": 468 }, { "advantage_max": 1.890088975429535, "advantage_mean": 2.980232283178452e-08, "advantage_min": -0.8545728474855423, "advantage_std": 0.9998464807868004, "completion_length": 1714.7709007263184, "epoch": 0.536, "grad_norm": 0.9024974703788757, "kl": 0.16271209716796875, "lambda_div_used": 0.5, "learning_rate": 1.1049747474962444e-07, "loss": 0.0065, "reward": -0.060654870234429836, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.060654870234429836, "reward_after_std": 0.6992764100432396, "reward_before_mean": 0.47816853411495686, "reward_before_std": 0.6197003275156021, "reward_change_max": 0.0035224109888076782, "reward_change_mean": -0.5388233549892902, "reward_change_min": -0.932822585105896, "reward_change_std": 0.3747980333864689, "reward_std": 0.6992764323949814, "rewards/cosine_scaled_reward": -0.1567490752786398, "rewards/format_reward": 0.791666679084301, "step": 469 }, { "advantage_max": 1.9362804740667343, "advantage_mean": 4.967053435223079e-09, "advantage_min": -0.8115110918879509, "advantage_std": 0.9998445808887482, "completion_length": 2205.7083740234375, "epoch": 0.5371428571428571, "grad_norm": 1.068304181098938, "kl": 0.21833038330078125, "lambda_div_used": 0.5, "learning_rate": 1.0983357966978745e-07, "loss": 0.0087, "reward": -0.10846987180411816, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10846987180411816, "reward_after_std": 0.7410068362951279, "reward_before_mean": 0.37174488231539726, "reward_before_std": 0.6479252725839615, "reward_change_max": 0.0, "reward_change_mean": -0.48021476343274117, "reward_change_min": -0.8555741608142853, "reward_change_std": 0.31600991636514664, "reward_std": 0.7410068362951279, "rewards/cosine_scaled_reward": -0.16829423449235037, "rewards/format_reward": 0.7083333525806665, "step": 470 }, { "advantage_max": 1.8785135746002197, "advantage_mean": 3.1044086745701804e-09, "advantage_min": -0.8254240527749062, "advantage_std": 0.9998672753572464, "completion_length": 2018.7917098999023, "epoch": 0.5382857142857143, "grad_norm": 1.2153743505477905, "kl": 0.1783733367919922, "lambda_div_used": 0.5, "learning_rate": 1.0919113768029517e-07, "loss": 0.0071, "reward": 0.15960154053755105, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.15960154053755105, "reward_after_std": 0.8837853036820889, "reward_before_mean": 0.8326390506699681, "reward_before_std": 0.8361635878682137, "reward_change_max": 0.002089478075504303, "reward_change_mean": -0.6730375066399574, "reward_change_min": -1.2551670372486115, "reward_change_std": 0.519281305372715, "reward_std": 0.8837853148579597, "rewards/cosine_scaled_reward": 0.05173617601394653, "rewards/format_reward": 0.7291666828095913, "step": 471 }, { "advantage_max": 1.9424489885568619, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.680642619729042, "advantage_std": 0.9998679831624031, "completion_length": 1611.5209045410156, "epoch": 0.5394285714285715, "grad_norm": 0.6896313428878784, "kl": 0.11153030395507812, "lambda_div_used": 0.5, "learning_rate": 1.0857018009286381e-07, "loss": 0.0045, "reward": 0.03950534947216511, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03950534947216511, "reward_after_std": 0.8686452694237232, "reward_before_mean": 0.6119452454149723, "reward_before_std": 0.7634111884981394, "reward_change_max": 0.0, "reward_change_mean": -0.5724398903548717, "reward_change_min": -1.1462013721466064, "reward_change_std": 0.40038398280739784, "reward_std": 0.8686453029513359, "rewards/cosine_scaled_reward": -0.11069405497983098, "rewards/format_reward": 0.8333333358168602, "step": 472 }, { "advantage_max": 1.914126843214035, "advantage_mean": 2.2351742678949904e-08, "advantage_min": -0.8491419404745102, "advantage_std": 0.9998372197151184, "completion_length": 1868.1875762939453, "epoch": 0.5405714285714286, "grad_norm": 0.8767368793487549, "kl": 0.1471405029296875, "lambda_div_used": 0.5, "learning_rate": 1.0797073717209013e-07, "loss": 0.0059, "reward": -0.07519353553652763, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07519353553652763, "reward_after_std": 0.693619716912508, "reward_before_mean": 0.4554354092106223, "reward_before_std": 0.6181728020310402, "reward_change_max": 0.0007615610957145691, "reward_change_mean": -0.5306289177387953, "reward_change_min": -0.8539394959807396, "reward_change_std": 0.34847177751362324, "reward_std": 0.6936197318136692, "rewards/cosine_scaled_reward": -0.1472823154181242, "rewards/format_reward": 0.7500000055879354, "step": 473 }, { "advantage_max": 1.9438211470842361, "advantage_mean": -1.0554989215982857e-08, "advantage_min": -0.8066472448408604, "advantage_std": 0.9998703449964523, "completion_length": 1492.1042098999023, "epoch": 0.5417142857142857, "grad_norm": 0.5975131988525391, "kl": 0.14487838745117188, "lambda_div_used": 0.5, "learning_rate": 1.0739283813397639e-07, "loss": 0.0058, "reward": 0.5643487274646759, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5643487274646759, "reward_after_std": 0.8311857730150223, "reward_before_mean": 1.5848356559872627, "reward_before_std": 0.5555077791213989, "reward_change_max": 0.0, "reward_change_mean": -1.0204869396984577, "reward_change_min": -1.4674010053277016, "reward_change_std": 0.5802014097571373, "reward_std": 0.8311858177185059, "rewards/cosine_scaled_reward": 0.34450116008520126, "rewards/format_reward": 0.8958333395421505, "step": 474 }, { "advantage_max": 1.9134457558393478, "advantage_mean": -1.924733428193548e-08, "advantage_min": -0.7586380168795586, "advantage_std": 0.9998574778437614, "completion_length": 1629.0417022705078, "epoch": 0.5428571428571428, "grad_norm": 0.5304942727088928, "kl": 0.10820388793945312, "lambda_div_used": 0.5, "learning_rate": 1.068365111445064e-07, "loss": 0.0043, "reward": 0.18582246452569962, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18582246452569962, "reward_after_std": 0.7934923768043518, "reward_before_mean": 0.9009908102452755, "reward_before_std": 0.6717259753495455, "reward_change_max": 0.0006563067436218262, "reward_change_mean": -0.7151683606207371, "reward_change_min": -1.2400154992938042, "reward_change_std": 0.46559458039700985, "reward_std": 0.7934924028813839, "rewards/cosine_scaled_reward": 0.06507872650399804, "rewards/format_reward": 0.770833345130086, "step": 475 }, { "advantage_max": 1.9225990921258926, "advantage_mean": -1.552204331733975e-08, "advantage_min": -0.8317600563168526, "advantage_std": 0.9998689591884613, "completion_length": 1525.9792098999023, "epoch": 0.544, "grad_norm": 0.7439927458763123, "kl": 0.07448959350585938, "lambda_div_used": 0.5, "learning_rate": 1.063017833182728e-07, "loss": 0.003, "reward": 0.31535858660936356, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.31535858660936356, "reward_after_std": 0.8980049230158329, "reward_before_mean": 1.1031142249703407, "reward_before_std": 0.7534971106797457, "reward_change_max": 0.0, "reward_change_mean": -0.7877555936574936, "reward_change_min": -1.2570370063185692, "reward_change_std": 0.4909738227725029, "reward_std": 0.8980049341917038, "rewards/cosine_scaled_reward": 0.10364041291177273, "rewards/format_reward": 0.895833358168602, "step": 476 }, { "advantage_max": 1.8741195350885391, "advantage_mean": 2.220446049250313e-16, "advantage_min": -0.8873728774487972, "advantage_std": 0.9998980090022087, "completion_length": 1399.2917175292969, "epoch": 0.5451428571428572, "grad_norm": 0.583365797996521, "kl": 0.11350250244140625, "lambda_div_used": 0.5, "learning_rate": 1.0578868071715544e-07, "loss": 0.0045, "reward": 0.4705557865090668, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4705557865090668, "reward_after_std": 1.0551037564873695, "reward_before_mean": 1.347284235060215, "reward_before_std": 0.9735592044889927, "reward_change_max": 0.001387663185596466, "reward_change_mean": -0.8767284750938416, "reward_change_min": -1.5517780631780624, "reward_change_std": 0.6072789244353771, "reward_std": 1.0551037788391113, "rewards/cosine_scaled_reward": 0.23614212637767196, "rewards/format_reward": 0.8750000223517418, "step": 477 }, { "advantage_max": 1.9357862919569016, "advantage_mean": -4.967053657267684e-09, "advantage_min": -0.8154433369636536, "advantage_std": 0.9998578280210495, "completion_length": 2062.395881652832, "epoch": 0.5462857142857143, "grad_norm": 0.5441526770591736, "kl": 0.16361618041992188, "lambda_div_used": 0.5, "learning_rate": 1.0529722834905125e-07, "loss": 0.0065, "reward": 0.009207317605614662, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.009207317605614662, "reward_after_std": 0.8506947085261345, "reward_before_mean": 0.5521553927101195, "reward_before_std": 0.7576607428491116, "reward_change_max": 0.0030072256922721863, "reward_change_mean": -0.5429480616003275, "reward_change_min": -0.9525541067123413, "reward_change_std": 0.37604556791484356, "reward_std": 0.8506947420537472, "rewards/cosine_scaled_reward": -0.0780889829620719, "rewards/format_reward": 0.7083333432674408, "step": 478 }, { "advantage_max": 1.9259414672851562, "advantage_mean": -1.0554989493538613e-08, "advantage_min": -0.8462852165102959, "advantage_std": 0.9998517110943794, "completion_length": 1818.8958587646484, "epoch": 0.5474285714285714, "grad_norm": 0.8077680468559265, "kl": 0.18693161010742188, "lambda_div_used": 0.5, "learning_rate": 1.0482745016665526e-07, "loss": 0.0075, "reward": 0.08315641060471535, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08315641060471535, "reward_after_std": 0.7782744280993938, "reward_before_mean": 0.7172673875465989, "reward_before_std": 0.6326955072581768, "reward_change_max": 0.0008339658379554749, "reward_change_mean": -0.634110989049077, "reward_change_min": -1.057134885340929, "reward_change_std": 0.3948863986879587, "reward_std": 0.7782744280993938, "rewards/cosine_scaled_reward": -0.04761631414294243, "rewards/format_reward": 0.8125000055879354, "step": 479 }, { "advantage_max": 1.8997021317481995, "advantage_mean": -6.519258410886408e-09, "advantage_min": -0.8673086017370224, "advantage_std": 0.9998597353696823, "completion_length": 1601.9375534057617, "epoch": 0.5485714285714286, "grad_norm": 0.9863570928573608, "kl": 0.14463424682617188, "lambda_div_used": 0.5, "learning_rate": 1.0437936906629334e-07, "loss": 0.0058, "reward": -0.006774002220481634, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.006774002220481634, "reward_after_std": 0.8246656432747841, "reward_before_mean": 0.5343326807487756, "reward_before_std": 0.7610690146684647, "reward_change_max": 0.0005768761038780212, "reward_change_mean": -0.5411067046225071, "reward_change_min": -1.0265009850263596, "reward_change_std": 0.388399351388216, "reward_std": 0.8246656730771065, "rewards/cosine_scaled_reward": -0.13908366532996297, "rewards/format_reward": 0.8125000149011612, "step": 480 }, { "advantage_max": 1.9240192770957947, "advantage_mean": 2.7939678071131624e-09, "advantage_min": -0.8690285757184029, "advantage_std": 0.9998449757695198, "completion_length": 1750.354232788086, "epoch": 0.5497142857142857, "grad_norm": 1.2178927659988403, "kl": 0.13234710693359375, "lambda_div_used": 0.5, "learning_rate": 1.0395300688680625e-07, "loss": 0.0053, "reward": -0.010232685133814812, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.010232685133814812, "reward_after_std": 0.792898278683424, "reward_before_mean": 0.5407127062790096, "reward_before_std": 0.7064782101660967, "reward_change_max": 0.0, "reward_change_mean": -0.5509453937411308, "reward_change_min": -0.9236162602901459, "reward_change_std": 0.3657870851457119, "reward_std": 0.7928983122110367, "rewards/cosine_scaled_reward": -0.1463103265268728, "rewards/format_reward": 0.833333358168602, "step": 481 }, { "advantage_max": 1.9587009698152542, "advantage_mean": 2.483527383745354e-09, "advantage_min": -0.6773890405893326, "advantage_std": 0.9998364299535751, "completion_length": 1611.2500381469727, "epoch": 0.5508571428571428, "grad_norm": 0.6870055794715881, "kl": 0.1389312744140625, "lambda_div_used": 0.5, "learning_rate": 1.0354838440848501e-07, "loss": 0.0056, "reward": 0.06978282704949379, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06978282704949379, "reward_after_std": 0.7692562937736511, "reward_before_mean": 0.6915448009967804, "reward_before_std": 0.6328919120132923, "reward_change_max": 0.001624472439289093, "reward_change_mean": -0.6217619744129479, "reward_change_min": -1.0122926868498325, "reward_change_std": 0.38024843856692314, "reward_std": 0.7692563086748123, "rewards/cosine_scaled_reward": -0.03964426927268505, "rewards/format_reward": 0.770833333954215, "step": 482 }, { "advantage_max": 1.9146474301815033, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.8112232387065887, "advantage_std": 0.9998473078012466, "completion_length": 1654.4583740234375, "epoch": 0.552, "grad_norm": 1.1116269826889038, "kl": 0.12012290954589844, "lambda_div_used": 0.5, "learning_rate": 1.0316552135205837e-07, "loss": 0.0048, "reward": 0.06403302727267146, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06403302727267146, "reward_after_std": 0.8078042268753052, "reward_before_mean": 0.6758778803050518, "reward_before_std": 0.7425609044730663, "reward_change_max": 0.0, "reward_change_mean": -0.6118448786437511, "reward_change_min": -1.0602629855275154, "reward_change_std": 0.42626157216727734, "reward_std": 0.8078042753040791, "rewards/cosine_scaled_reward": -0.0787277314811945, "rewards/format_reward": 0.8333333395421505, "step": 483 }, { "advantage_max": 1.879467323422432, "advantage_mean": -6.208814573582799e-10, "advantage_min": -0.8875841423869133, "advantage_std": 0.9998355507850647, "completion_length": 1322.7500457763672, "epoch": 0.5531428571428572, "grad_norm": 0.5565298199653625, "kl": 0.09266090393066406, "lambda_div_used": 0.5, "learning_rate": 1.0280443637773163e-07, "loss": 0.0037, "reward": 0.12074141576886177, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12074141576886177, "reward_after_std": 0.7823374047875404, "reward_before_mean": 0.7933894079178572, "reward_before_std": 0.7216227240860462, "reward_change_max": 0.0009933337569236755, "reward_change_mean": -0.6726479884237051, "reward_change_min": -1.105917226523161, "reward_change_std": 0.4536776263266802, "reward_std": 0.7823374196887016, "rewards/cosine_scaled_reward": -0.04080530256032944, "rewards/format_reward": 0.8750000149011612, "step": 484 }, { "advantage_max": 1.9454391449689865, "advantage_mean": -1.2417635808503746e-09, "advantage_min": -0.7871048152446747, "advantage_std": 0.9998297542333603, "completion_length": 1602.2291793823242, "epoch": 0.5542857142857143, "grad_norm": 0.7432307600975037, "kl": 0.15546417236328125, "lambda_div_used": 0.5, "learning_rate": 1.0246514708427701e-07, "loss": 0.0062, "reward": 0.10268004890531301, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10268004890531301, "reward_after_std": 0.6411448940634727, "reward_before_mean": 0.7937132641673088, "reward_before_std": 0.47726221568882465, "reward_change_max": 0.0, "reward_change_mean": -0.6910331957042217, "reward_change_min": -1.0900376811623573, "reward_change_std": 0.40093186870217323, "reward_std": 0.6411449201405048, "rewards/cosine_scaled_reward": -0.061476717703044415, "rewards/format_reward": 0.916666679084301, "step": 485 }, { "advantage_max": 1.9508067667484283, "advantage_mean": 6.208817460162663e-09, "advantage_min": -0.7736483179032803, "advantage_std": 0.9998454228043556, "completion_length": 1003.0625267028809, "epoch": 0.5554285714285714, "grad_norm": 0.9682008028030396, "kl": 0.0872650146484375, "lambda_div_used": 0.5, "learning_rate": 1.0214767000817596e-07, "loss": 0.0035, "reward": 0.12867436837404966, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12867436837404966, "reward_after_std": 0.7180111818015575, "reward_before_mean": 0.8117377087473869, "reward_before_std": 0.5166822988539934, "reward_change_max": 0.0017078518867492676, "reward_change_mean": -0.6830633729696274, "reward_change_min": -1.0265196487307549, "reward_change_std": 0.3901584856212139, "reward_std": 0.7180111892521381, "rewards/cosine_scaled_reward": -0.052464481675997376, "rewards/format_reward": 0.9166666865348816, "step": 486 }, { "advantage_max": 1.9592144191265106, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.6940793693065643, "advantage_std": 0.9998834952712059, "completion_length": 1188.1041774749756, "epoch": 0.5565714285714286, "grad_norm": 0.5985164642333984, "kl": 0.08372879028320312, "lambda_div_used": 0.5, "learning_rate": 1.0185202062281336e-07, "loss": 0.0034, "reward": 0.41113134508486837, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.41113134508486837, "reward_after_std": 0.9559298977255821, "reward_before_mean": 1.257676038891077, "reward_before_std": 0.7205435046926141, "reward_change_max": 0.0, "reward_change_mean": -0.8465447202324867, "reward_change_min": -1.3234855383634567, "reward_change_std": 0.5153034143149853, "reward_std": 0.9559298977255821, "rewards/cosine_scaled_reward": 0.19133803341537714, "rewards/format_reward": 0.8750000074505806, "step": 487 }, { "advantage_max": 1.9492617100477219, "advantage_mean": 2.359350581571107e-08, "advantage_min": -0.7352440245449543, "advantage_std": 0.999819852411747, "completion_length": 1420.7708702087402, "epoch": 0.5577142857142857, "grad_norm": 0.833846390247345, "kl": 0.15200424194335938, "lambda_div_used": 0.5, "learning_rate": 1.0157821333772304e-07, "loss": 0.0061, "reward": -0.00616908073425293, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.00616908073425293, "reward_after_std": 0.6220297068357468, "reward_before_mean": 0.6039720326662064, "reward_before_std": 0.4390154229477048, "reward_change_max": 0.0, "reward_change_mean": -0.6101410929113626, "reward_change_min": -0.9060985743999481, "reward_change_std": 0.3509222362190485, "reward_std": 0.6220297180116177, "rewards/cosine_scaled_reward": -0.11468065832741559, "rewards/format_reward": 0.8333333414047956, "step": 488 }, { "advantage_max": 1.8662956207990646, "advantage_mean": 3.228584999348527e-08, "advantage_min": -0.9588077291846275, "advantage_std": 0.9998302757740021, "completion_length": 1769.5000457763672, "epoch": 0.5588571428571428, "grad_norm": 1.0365986824035645, "kl": 0.1967620849609375, "lambda_div_used": 0.5, "learning_rate": 1.013262614978859e-07, "loss": 0.0079, "reward": -0.1484443813096732, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1484443813096732, "reward_after_std": 0.6222599521279335, "reward_before_mean": 0.34515629429370165, "reward_before_std": 0.5821492820978165, "reward_change_max": 0.0, "reward_change_mean": -0.49360066652297974, "reward_change_min": -0.8139010816812515, "reward_change_std": 0.3350451663136482, "reward_std": 0.6222599819302559, "rewards/cosine_scaled_reward": -0.20242186076939106, "rewards/format_reward": 0.7500000186264515, "step": 489 }, { "advantage_max": 1.992196410894394, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.6759535558521748, "advantage_std": 0.999831385910511, "completion_length": 1338.2083740234375, "epoch": 0.56, "grad_norm": 0.7504719495773315, "kl": 0.09897232055664062, "lambda_div_used": 0.5, "learning_rate": 1.0109617738307911e-07, "loss": 0.004, "reward": 0.0957014646846801, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0957014646846801, "reward_after_std": 0.7212163619697094, "reward_before_mean": 0.7547863759100437, "reward_before_std": 0.49603763967752457, "reward_change_max": 0.0, "reward_change_mean": -0.6590849310159683, "reward_change_min": -0.9607073739171028, "reward_change_std": 0.3600234966725111, "reward_std": 0.72121636942029, "rewards/cosine_scaled_reward": -0.07052347250282764, "rewards/format_reward": 0.8958333432674408, "step": 490 }, { "advantage_max": 1.9146278351545334, "advantage_mean": -1.1175871117430347e-08, "advantage_min": -0.8270855993032455, "advantage_std": 0.9998930171132088, "completion_length": 1563.9166793823242, "epoch": 0.5611428571428572, "grad_norm": 0.5624725818634033, "kl": 0.09905624389648438, "lambda_div_used": 0.5, "learning_rate": 1.0088797220727779e-07, "loss": 0.004, "reward": 0.39007012732326984, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.39007012732326984, "reward_after_std": 1.004666656255722, "reward_before_mean": 1.2099284492433071, "reward_before_std": 0.86801877617836, "reward_change_max": 0.0, "reward_change_mean": -0.8198583498597145, "reward_change_min": -1.4678706899285316, "reward_change_std": 0.5273435860872269, "reward_std": 1.0046666860580444, "rewards/cosine_scaled_reward": 0.15704754507169127, "rewards/format_reward": 0.8958333432674408, "step": 491 }, { "advantage_max": 1.9598166197538376, "advantage_mean": 3.352761424046946e-08, "advantage_min": -0.6968494616448879, "advantage_std": 0.9997868090867996, "completion_length": 1500.0625190734863, "epoch": 0.5622857142857143, "grad_norm": 1.76079261302948, "kl": 0.13095855712890625, "lambda_div_used": 0.5, "learning_rate": 1.0070165611810855e-07, "loss": 0.0052, "reward": -0.011532854987308383, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.011532854987308383, "reward_after_std": 0.6456209793686867, "reward_before_mean": 0.5838748067617416, "reward_before_std": 0.48374018631875515, "reward_change_max": 0.0, "reward_change_mean": -0.595407678745687, "reward_change_min": -0.9056248366832733, "reward_change_std": 0.34459487441927195, "reward_std": 0.6456209793686867, "rewards/cosine_scaled_reward": -0.11431259848177433, "rewards/format_reward": 0.8125000149011612, "step": 492 }, { "advantage_max": 1.9004540145397186, "advantage_mean": 1.3038516377683607e-08, "advantage_min": -0.862372025847435, "advantage_std": 0.9998877570033073, "completion_length": 1285.2916793823242, "epoch": 0.5634285714285714, "grad_norm": 0.5592947602272034, "kl": 0.09164047241210938, "lambda_div_used": 0.5, "learning_rate": 1.005372381963547e-07, "loss": 0.0037, "reward": 0.3626462905667722, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3626462905667722, "reward_after_std": 1.0406964905560017, "reward_before_mean": 1.1492395093664527, "reward_before_std": 0.9662736691534519, "reward_change_max": 0.0, "reward_change_mean": -0.786593209952116, "reward_change_min": -1.3980813696980476, "reward_change_std": 0.5360561832785606, "reward_std": 1.0406964905560017, "rewards/cosine_scaled_reward": 0.11628641374409199, "rewards/format_reward": 0.916666679084301, "step": 493 }, { "advantage_max": 1.9788424372673035, "advantage_mean": -6.208817182606907e-09, "advantage_min": -0.7454531416296959, "advantage_std": 0.9998568743467331, "completion_length": 1362.6042022705078, "epoch": 0.5645714285714286, "grad_norm": 0.7242301106452942, "kl": 0.09954833984375, "lambda_div_used": 0.5, "learning_rate": 1.0039472645551372e-07, "loss": 0.004, "reward": 0.12059944961220026, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12059944961220026, "reward_after_std": 0.7983547747135162, "reward_before_mean": 0.7689751125872135, "reward_before_std": 0.6040369383990765, "reward_change_max": 0.0, "reward_change_mean": -0.6483756825327873, "reward_change_min": -0.9728981480002403, "reward_change_std": 0.3582359105348587, "reward_std": 0.7983547821640968, "rewards/cosine_scaled_reward": -0.08426245115697384, "rewards/format_reward": 0.9375000149011612, "step": 494 }, { "advantage_max": 1.87544085085392, "advantage_mean": -1.8626451825376478e-08, "advantage_min": -0.9081234857439995, "advantage_std": 0.9998789876699448, "completion_length": 1417.7500457763672, "epoch": 0.5657142857142857, "grad_norm": 0.6767681241035461, "kl": 0.0517578125, "lambda_div_used": 0.5, "learning_rate": 1.002741278414069e-07, "loss": 0.0021, "reward": 0.3090813576709479, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3090813576709479, "reward_after_std": 0.9749084785580635, "reward_before_mean": 1.0812942683696747, "reward_before_std": 0.9502445831894875, "reward_change_max": 0.0, "reward_change_mean": -0.7722129225730896, "reward_change_min": -1.4743258655071259, "reward_change_std": 0.5595591589808464, "reward_std": 0.9749084934592247, "rewards/cosine_scaled_reward": 0.07189711090177298, "rewards/format_reward": 0.9375000149011612, "step": 495 }, { "advantage_max": 1.9674073159694672, "advantage_mean": 1.7384688022481498e-08, "advantage_min": -0.7077538371086121, "advantage_std": 0.9998735785484314, "completion_length": 1427.5416946411133, "epoch": 0.5668571428571428, "grad_norm": 0.7200669050216675, "kl": 0.10258865356445312, "lambda_div_used": 0.5, "learning_rate": 1.0017544823184055e-07, "loss": 0.0041, "reward": 0.2750055827200413, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2750055827200413, "reward_after_std": 0.907068207859993, "reward_before_mean": 1.0247598066926003, "reward_before_std": 0.6894198805093765, "reward_change_max": 0.0008356347680091858, "reward_change_mean": -0.7497542221099138, "reward_change_min": -1.2161314561963081, "reward_change_std": 0.4413383901119232, "reward_std": 0.9070682227611542, "rewards/cosine_scaled_reward": 0.13737990334630013, "rewards/format_reward": 0.7500000037252903, "step": 496 }, { "advantage_max": 1.9448639154434204, "advantage_mean": -4.1443856013678726e-08, "advantage_min": -0.7576332688331604, "advantage_std": 0.9998561069369316, "completion_length": 1287.0208740234375, "epoch": 0.568, "grad_norm": 0.7512621879577637, "kl": 0.16324996948242188, "lambda_div_used": 0.5, "learning_rate": 1.0009869243631952e-07, "loss": 0.0065, "reward": 0.35428231628611684, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.35428231628611684, "reward_after_std": 0.787726778537035, "reward_before_mean": 1.2080298122018576, "reward_before_std": 0.5447326581925154, "reward_change_max": 0.0021923109889030457, "reward_change_mean": -0.8537475019693375, "reward_change_min": -1.3086559250950813, "reward_change_std": 0.49219193682074547, "reward_std": 0.7877267859876156, "rewards/cosine_scaled_reward": 0.13526488654315472, "rewards/format_reward": 0.9375, "step": 497 }, { "advantage_max": 1.9822159558534622, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.6983235441148281, "advantage_std": 0.9998753666877747, "completion_length": 1643.0416870117188, "epoch": 0.5691428571428572, "grad_norm": 0.4986113905906677, "kl": 0.11568832397460938, "lambda_div_used": 0.5, "learning_rate": 1.000438641958131e-07, "loss": 0.0046, "reward": 0.029189766384661198, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.029189766384661198, "reward_after_std": 1.0231481902301311, "reward_before_mean": 0.5286800367757678, "reward_before_std": 0.8786374758929014, "reward_change_max": 0.0018067136406898499, "reward_change_mean": -0.49949030485004187, "reward_change_min": -0.8799277618527412, "reward_change_std": 0.32619673386216164, "reward_std": 1.0231482051312923, "rewards/cosine_scaled_reward": -0.12107665184885263, "rewards/format_reward": 0.770833345130086, "step": 498 }, { "advantage_max": 1.921624556183815, "advantage_mean": -1.2107193636534674e-08, "advantage_min": -0.7888023443520069, "advantage_std": 0.9998480603098869, "completion_length": 1392.8333892822266, "epoch": 0.5702857142857143, "grad_norm": 0.8271004557609558, "kl": 0.08435821533203125, "lambda_div_used": 0.5, "learning_rate": 1.0001096618257236e-07, "loss": 0.0034, "reward": 0.13609783939318731, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13609783939318731, "reward_after_std": 0.7598652169108391, "reward_before_mean": 0.817673472687602, "reward_before_std": 0.6392671652138233, "reward_change_max": 0.009077653288841248, "reward_change_mean": -0.6815756633877754, "reward_change_min": -1.0989074856042862, "reward_change_std": 0.4282492324709892, "reward_std": 0.7598652243614197, "rewards/cosine_scaled_reward": -0.07032993622124195, "rewards/format_reward": 0.9583333358168602, "step": 499 }, { "advantage_max": 1.9175880998373032, "advantage_mean": 2.483526606589237e-09, "advantage_min": -0.7598303630948067, "advantage_std": 0.9998560920357704, "completion_length": 1688.8541946411133, "epoch": 0.5714285714285714, "grad_norm": 0.9154338240623474, "kl": 0.20852279663085938, "lambda_div_used": 0.5, "learning_rate": 1e-07, "loss": 0.0083, "reward": -0.03248512791469693, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03248512791469693, "reward_after_std": 0.8366284519433975, "reward_before_mean": 0.4850434511899948, "reward_before_std": 0.754515565931797, "reward_change_max": 0.001007281243801117, "reward_change_mean": -0.5175285935401917, "reward_change_min": -0.9981810115277767, "reward_change_std": 0.3586902804672718, "reward_std": 0.8366284593939781, "rewards/cosine_scaled_reward": -0.1428949534893036, "rewards/format_reward": 0.770833345130086, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.0009665181785810547, "train_runtime": 55686.6375, "train_samples_per_second": 0.431, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }