{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage_max": 1.217847466468811, "advantage_mean": -1.8936893914078823e-08, "advantage_min": -0.9552603140473366, "advantage_std": 0.7989529259502888, "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.1345997154712677, "kl": 0.0, "lambda_div_used": 0.7999999999999999, "learning_rate": 2e-08, "loss": 0.0721, "reward": 0.2781674414873123, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2781674414873123, "reward_after_std": 0.7989529222249985, "reward_before_mean": 0.4897647276520729, "reward_before_std": 0.8290339298546314, "reward_change_max": 0.00028071552515029907, "reward_change_mean": -0.2115972964093089, "reward_change_min": -0.4146200343966484, "reward_change_std": 0.16823832830414176, "reward_std": 0.7989529222249985, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "advantage_max": 0.6235709302127361, "advantage_mean": 9.313225912688239e-09, "advantage_min": -0.5223574750125408, "advantage_std": 0.4285126607865095, "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.0617876835167408, "kl": 0.0, "lambda_div_used": 0.7999999999999999, "learning_rate": 4e-08, "loss": 0.025, "reward": 0.07961943745613098, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07961943745613098, "reward_after_std": 0.42851265892386436, "reward_before_mean": 0.27539755403995514, "reward_before_std": 0.42092561535537243, "reward_change_max": 0.0006531104445457458, "reward_change_mean": -0.19577811146155, "reward_change_min": -0.3188221678137779, "reward_change_std": 0.13006281899288297, "reward_std": 0.42851267009973526, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "advantage_max": 0.7075863927602768, "advantage_mean": 3.414849569782774e-08, "advantage_min": -0.49601656943559647, "advantage_std": 0.4479621145874262, "completion_length": 3345.875015258789, "epoch": 0.0034285714285714284, "grad_norm": 0.07112511247396469, "kl": 4.579871892929077e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6e-08, "loss": 0.011, "reward": -0.4731777422130108, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4731777422130108, "reward_after_std": 0.4479621220380068, "reward_before_mean": -0.3965716063976288, "reward_before_std": 0.46891009248793125, "reward_change_max": 0.0013469904661178589, "reward_change_mean": -0.07660610042512417, "reward_change_min": -0.17605799809098244, "reward_change_std": 0.07472573383711278, "reward_std": 0.44796212762594223, "rewards/cosine_scaled_reward": -0.2607858069241047, "rewards/format_reward": 0.12500000558793545, "step": 3 }, { "advantage_max": 1.3211607113480568, "advantage_mean": -1.0244548737103898e-08, "advantage_min": -0.7489691935479641, "advantage_std": 0.7894617952406406, "completion_length": 2410.0625762939453, "epoch": 0.004571428571428572, "grad_norm": 0.13816717267036438, "kl": 2.919137477874756e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 8e-08, "loss": 0.014, "reward": 0.2465450973249972, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2465450973249972, "reward_after_std": 0.7894618026912212, "reward_before_mean": 0.44723863434046507, "reward_before_std": 0.7846085950732231, "reward_change_max": 0.0, "reward_change_mean": -0.2006935360841453, "reward_change_min": -0.4025107752531767, "reward_change_std": 0.1515410002321005, "reward_std": 0.7894618026912212, "rewards/cosine_scaled_reward": -0.08888069525710307, "rewards/format_reward": 0.6250000055879354, "step": 4 }, { "advantage_max": 1.302382729947567, "advantage_mean": -1.5522042262627878e-09, "advantage_min": -0.74482436850667, "advantage_std": 0.7666304744780064, "completion_length": 3202.354248046875, "epoch": 0.005714285714285714, "grad_norm": 0.1259206384420395, "kl": 4.661828279495239e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 1e-07, "loss": 0.0049, "reward": -0.08441077917814255, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08441077917814255, "reward_after_std": 0.7666304744780064, "reward_before_mean": 0.04866696195676923, "reward_before_std": 0.7808295674622059, "reward_change_max": 0.0007265731692314148, "reward_change_mean": -0.1330777509137988, "reward_change_min": -0.26945669017732143, "reward_change_std": 0.11292384238913655, "reward_std": 0.7666305154561996, "rewards/cosine_scaled_reward": -0.1423331880941987, "rewards/format_reward": 0.3333333432674408, "step": 5 }, { "advantage_max": 0.8823182210326195, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.4785333499312401, "advantage_std": 0.5150059200823307, "completion_length": 3064.812530517578, "epoch": 0.006857142857142857, "grad_norm": 0.0908319428563118, "kl": 4.053860902786255e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2e-07, "loss": 0.0316, "reward": -0.3478844817727804, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3478844817727804, "reward_after_std": 0.5150059126317501, "reward_before_mean": -0.2524983729235828, "reward_before_std": 0.5230560079216957, "reward_change_max": 0.0003776848316192627, "reward_change_mean": -0.09538610768504441, "reward_change_min": -0.20627478696405888, "reward_change_std": 0.08201898471452296, "reward_std": 0.5150059200823307, "rewards/cosine_scaled_reward": -0.2304158564656973, "rewards/format_reward": 0.2083333358168602, "step": 6 }, { "advantage_max": 1.367805253714323, "advantage_mean": -1.3659397890553038e-08, "advantage_min": -0.9759664312005043, "advantage_std": 0.8795518670231104, "completion_length": 3157.0000915527344, "epoch": 0.008, "grad_norm": 0.1305931657552719, "kl": 2.5913119316101074e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4e-07, "loss": 0.0259, "reward": 0.18099913746118546, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18099913746118546, "reward_after_std": 0.8795518726110458, "reward_before_mean": 0.3678276315331459, "reward_before_std": 0.9176443759351969, "reward_change_max": 0.000819183886051178, "reward_change_mean": -0.18682850850746036, "reward_change_min": -0.41373845003545284, "reward_change_std": 0.17254400812089443, "reward_std": 0.8795518912374973, "rewards/cosine_scaled_reward": -0.07650285214185715, "rewards/format_reward": 0.520833345130086, "step": 7 }, { "advantage_max": 1.0459130443632603, "advantage_mean": -1.2107194025112733e-08, "advantage_min": -0.6788318604230881, "advantage_std": 0.6345811123028398, "completion_length": 2562.125045776367, "epoch": 0.009142857142857144, "grad_norm": 0.11447706073522568, "kl": 2.5659799575805664e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6e-07, "loss": 0.0275, "reward": 0.4716091677546501, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4716091677546501, "reward_after_std": 0.634581127204001, "reward_before_mean": 0.7308037634938955, "reward_before_std": 0.596033226698637, "reward_change_max": 0.0005420669913291931, "reward_change_mean": -0.25919460941804573, "reward_change_min": -0.40141006000339985, "reward_change_std": 0.1552876008208841, "reward_std": 0.6345811542123556, "rewards/cosine_scaled_reward": 0.1154018840752542, "rewards/format_reward": 0.5000000055879354, "step": 8 }, { "advantage_max": 1.0340013727545738, "advantage_mean": -8.071462276326713e-09, "advantage_min": -0.6549094580113888, "advantage_std": 0.6379689201712608, "completion_length": 3194.3750610351562, "epoch": 0.010285714285714285, "grad_norm": 0.11988753825426102, "kl": 3.8914382457733154e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.8e-07, "loss": 0.05, "reward": -0.1560809090733528, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1560809090733528, "reward_after_std": 0.6379689201712608, "reward_before_mean": -0.026715969666838646, "reward_before_std": 0.658458199352026, "reward_change_max": 0.0005759000778198242, "reward_change_mean": -0.1293649550061673, "reward_change_min": -0.2904045693576336, "reward_change_std": 0.11258536879904568, "reward_std": 0.6379689499735832, "rewards/cosine_scaled_reward": -0.14877465018071234, "rewards/format_reward": 0.2708333432674408, "step": 9 }, { "advantage_max": 1.4368297830224037, "advantage_mean": 5.587935614226325e-09, "advantage_min": -0.8901588395237923, "advantage_std": 0.8860036619007587, "completion_length": 2562.0208587646484, "epoch": 0.011428571428571429, "grad_norm": 0.12126865983009338, "kl": 2.699345350265503e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 2e-07, "loss": 0.0312, "reward": 0.13794712629169226, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13794712629169226, "reward_after_std": 0.8860036358237267, "reward_before_mean": 0.3111814744770527, "reward_before_std": 0.9135208688676357, "reward_change_max": 0.002153046429157257, "reward_change_mean": -0.17323432816192508, "reward_change_min": -0.39752847887575626, "reward_change_std": 0.1613102899864316, "reward_std": 0.8860036619007587, "rewards/cosine_scaled_reward": -0.07357594557106495, "rewards/format_reward": 0.45833334140479565, "step": 10 }, { "advantage_max": 0.8439371325075626, "advantage_mean": 2.173086099954702e-08, "advantage_min": -0.41614095866680145, "advantage_std": 0.49213249422609806, "completion_length": 3375.2291870117188, "epoch": 0.012571428571428572, "grad_norm": 0.0826457291841507, "kl": 4.883110523223877e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.1999999999999998e-07, "loss": 0.0161, "reward": -0.5312878023833036, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5312878023833036, "reward_after_std": 0.4921324960887432, "reward_before_mean": -0.47230170108377934, "reward_before_std": 0.5070141367614269, "reward_change_max": 0.0013536438345909119, "reward_change_mean": -0.05898609710857272, "reward_change_min": -0.1467603761702776, "reward_change_std": 0.06200661789625883, "reward_std": 0.4921324998140335, "rewards/cosine_scaled_reward": -0.30906752590090036, "rewards/format_reward": 0.14583333767950535, "step": 11 }, { "advantage_max": 1.143027637153864, "advantage_mean": -2.483526873042763e-08, "advantage_min": -0.7031282149255276, "advantage_std": 0.7011090740561485, "completion_length": 2593.1458740234375, "epoch": 0.013714285714285714, "grad_norm": 0.10075537115335464, "kl": 4.231184720993042e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.4e-07, "loss": 0.0224, "reward": 0.1525727827101946, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1525727827101946, "reward_after_std": 0.7011090852320194, "reward_before_mean": 0.3418125584721565, "reward_before_std": 0.695315845310688, "reward_change_max": 0.0, "reward_change_mean": -0.18923979345709085, "reward_change_min": -0.35227457620203495, "reward_change_std": 0.13712271628901362, "reward_std": 0.7011091038584709, "rewards/cosine_scaled_reward": -0.12076039798557758, "rewards/format_reward": 0.5833333414047956, "step": 12 }, { "advantage_max": 1.1169372722506523, "advantage_mean": 5.587935614226325e-09, "advantage_min": -0.8815496191382408, "advantage_std": 0.7253670915961266, "completion_length": 2905.854217529297, "epoch": 0.014857142857142857, "grad_norm": 0.14058144390583038, "kl": 3.0654482543468475e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.6e-07, "loss": 0.0701, "reward": 0.14813854545354843, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14813854545354843, "reward_after_std": 0.7253671064972878, "reward_before_mean": 0.33676156401634216, "reward_before_std": 0.7501043267548084, "reward_change_max": 0.00045943260192871094, "reward_change_mean": -0.18862298224121332, "reward_change_min": -0.3747742120176554, "reward_change_std": 0.14826489053666592, "reward_std": 0.7253671269863844, "rewards/cosine_scaled_reward": -0.029535903362557292, "rewards/format_reward": 0.39583334513008595, "step": 13 }, { "advantage_max": 1.2915129400789738, "advantage_mean": 5.587935725248627e-09, "advantage_min": -0.827587179839611, "advantage_std": 0.811378862708807, "completion_length": 2895.562545776367, "epoch": 0.016, "grad_norm": 0.12285847961902618, "kl": 2.9393471777439117e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.8e-07, "loss": 0.0387, "reward": -0.00847603753209114, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.00847603753209114, "reward_after_std": 0.8113788478076458, "reward_before_mean": 0.13972076028585434, "reward_before_std": 0.840797882527113, "reward_change_max": 0.0006824731826782227, "reward_change_mean": -0.14819679409265518, "reward_change_min": -0.34741368889808655, "reward_change_std": 0.13997211772948503, "reward_std": 0.8113788738846779, "rewards/cosine_scaled_reward": -0.11763961799442768, "rewards/format_reward": 0.3750000074505806, "step": 14 }, { "advantage_max": 0.9405107796192169, "advantage_mean": -8.69234317857348e-09, "advantage_min": -0.5709736123681068, "advantage_std": 0.5694962106645107, "completion_length": 2810.625015258789, "epoch": 0.017142857142857144, "grad_norm": 0.07822926342487335, "kl": 2.2795051336288452e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 3e-07, "loss": 0.0109, "reward": 0.1779165519401431, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1779165519401431, "reward_after_std": 0.5694962106645107, "reward_before_mean": 0.3809913620352745, "reward_before_std": 0.5492642298340797, "reward_change_max": 0.00014159083366394043, "reward_change_mean": -0.20307481079362333, "reward_change_min": -0.3380450326949358, "reward_change_std": 0.13191879028454423, "reward_std": 0.5694962292909622, "rewards/cosine_scaled_reward": 0.002995698247104883, "rewards/format_reward": 0.3750000037252903, "step": 15 }, { "advantage_max": 1.1658824868500233, "advantage_mean": 4.967054045845742e-09, "advantage_min": -0.608432337641716, "advantage_std": 0.6757100559771061, "completion_length": 3554.750030517578, "epoch": 0.018285714285714287, "grad_norm": 0.12201931327581406, "kl": 4.0784478187561035e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.2e-07, "loss": 0.0101, "reward": -0.3156089726835489, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3156089726835489, "reward_after_std": 0.6757100522518158, "reward_before_mean": -0.2246195109328255, "reward_before_std": 0.6919358260929585, "reward_change_max": 0.00035578012466430664, "reward_change_mean": -0.09098945884034038, "reward_change_min": -0.22338230721652508, "reward_change_std": 0.09024652326479554, "reward_std": 0.6757100597023964, "rewards/cosine_scaled_reward": -0.1539764255285263, "rewards/format_reward": 0.0833333358168602, "step": 16 }, { "advantage_max": 1.2798153422772884, "advantage_mean": -2.980232360894064e-08, "advantage_min": -0.8115465976297855, "advantage_std": 0.824974924325943, "completion_length": 2312.6458587646484, "epoch": 0.019428571428571427, "grad_norm": 0.13376076519489288, "kl": 3.724917769432068e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.4000000000000003e-07, "loss": 0.0356, "reward": 0.38245512172579765, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.38245512172579765, "reward_after_std": 0.8249749541282654, "reward_before_mean": 0.6136201694607735, "reward_before_std": 0.8402673825621605, "reward_change_max": 6.746500730514526e-05, "reward_change_mean": -0.2311650700867176, "reward_change_min": -0.46010350435972214, "reward_change_std": 0.1807591523975134, "reward_std": 0.8249749913811684, "rewards/cosine_scaled_reward": 0.025560079142451286, "rewards/format_reward": 0.5625000055879354, "step": 17 }, { "advantage_max": 1.2530889064073563, "advantage_mean": -2.20413013396481e-08, "advantage_min": -0.7576796039938927, "advantage_std": 0.7551870234310627, "completion_length": 2754.041717529297, "epoch": 0.02057142857142857, "grad_norm": 0.13405290246009827, "kl": 2.1032989025115967e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.6e-07, "loss": 0.0473, "reward": 0.20277105178683996, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20277105178683996, "reward_after_std": 0.7551870420575142, "reward_before_mean": 0.39739411184564233, "reward_before_std": 0.7557785026729107, "reward_change_max": 0.00043197721242904663, "reward_change_mean": -0.19462304934859276, "reward_change_min": -0.34309362061321735, "reward_change_std": 0.13999361265450716, "reward_std": 0.7551870867609978, "rewards/cosine_scaled_reward": -0.051302953623235226, "rewards/format_reward": 0.5000000093132257, "step": 18 }, { "advantage_max": 1.6070429980754852, "advantage_mean": 7.45058070794613e-09, "advantage_min": -0.9701496809720993, "advantage_std": 0.9727808646857738, "completion_length": 2970.1250610351562, "epoch": 0.021714285714285714, "grad_norm": 0.14596253633499146, "kl": 2.6941299438476562e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.7999999999999996e-07, "loss": 0.0535, "reward": 0.32189071038737893, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32189071038737893, "reward_after_std": 0.9727808646857738, "reward_before_mean": 0.52683225274086, "reward_before_std": 0.9912216775119305, "reward_change_max": 0.000347234308719635, "reward_change_mean": -0.20494154281914234, "reward_change_min": -0.3949390184134245, "reward_change_std": 0.16167664853855968, "reward_std": 0.9727809056639671, "rewards/cosine_scaled_reward": 0.06549946335144341, "rewards/format_reward": 0.3958333432674408, "step": 19 }, { "advantage_max": 1.3724640235304832, "advantage_mean": 2.4835267176115394e-09, "advantage_min": -1.0584416389465332, "advantage_std": 0.9209586307406425, "completion_length": 2481.562530517578, "epoch": 0.022857142857142857, "grad_norm": 0.1834186613559723, "kl": 2.04332172870636e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 4e-07, "loss": 0.0747, "reward": 0.44473595824092627, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.44473595824092627, "reward_after_std": 0.920958636328578, "reward_before_mean": 0.6847866689786315, "reward_before_std": 0.9624159913510084, "reward_change_max": 0.00030393898487091064, "reward_change_mean": -0.24005071632564068, "reward_change_min": -0.45336752384901047, "reward_change_std": 0.19396351650357246, "reward_std": 0.9209586586803198, "rewards/cosine_scaled_reward": 0.029893329367041588, "rewards/format_reward": 0.6250000111758709, "step": 20 }, { "advantage_max": 1.169568408280611, "advantage_mean": -3.1044089521259366e-09, "advantage_min": -0.6813900545239449, "advantage_std": 0.7141816075891256, "completion_length": 2654.000030517578, "epoch": 0.024, "grad_norm": 0.1101561188697815, "kl": 3.5569071769714355e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.1999999999999995e-07, "loss": 0.0294, "reward": 0.1073315367102623, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1073315367102623, "reward_after_std": 0.7141816113144159, "reward_before_mean": 0.28544674441218376, "reward_before_std": 0.7177798096090555, "reward_change_max": 0.0005570873618125916, "reward_change_mean": -0.17811520351096988, "reward_change_min": -0.3612048588693142, "reward_change_std": 0.13618801487609744, "reward_std": 0.7141816504299641, "rewards/cosine_scaled_reward": -0.11769330315291882, "rewards/format_reward": 0.5208333358168602, "step": 21 }, { "advantage_max": 1.5460242331027985, "advantage_mean": -2.235174201281609e-08, "advantage_min": -0.8349691554903984, "advantage_std": 0.9228786043822765, "completion_length": 1903.9792251586914, "epoch": 0.025142857142857144, "grad_norm": 0.11138684302568436, "kl": 2.3265602067112923e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.3999999999999997e-07, "loss": 0.0047, "reward": 0.6206777030602098, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6206777030602098, "reward_after_std": 0.9228785969316959, "reward_before_mean": 0.8899027798324823, "reward_before_std": 0.897814180701971, "reward_change_max": 0.0, "reward_change_mean": -0.26922508142888546, "reward_change_min": -0.4845781698822975, "reward_change_std": 0.1846067113801837, "reward_std": 0.9228786453604698, "rewards/cosine_scaled_reward": 0.038701380137354136, "rewards/format_reward": 0.8125000037252903, "step": 22 }, { "advantage_max": 1.016646571457386, "advantage_mean": 5.587935891782081e-09, "advantage_min": -0.5664830878376961, "advantage_std": 0.6114959469996393, "completion_length": 2711.125030517578, "epoch": 0.026285714285714287, "grad_norm": 0.09723033756017685, "kl": 2.8345733880996704e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.6e-07, "loss": 0.0039, "reward": 0.07540437951683998, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07540437951683998, "reward_after_std": 0.6114959586411715, "reward_before_mean": 0.2520197443664074, "reward_before_std": 0.6060267491266131, "reward_change_max": 0.0005780011415481567, "reward_change_mean": -0.1766153446806129, "reward_change_min": -0.34339495189487934, "reward_change_std": 0.1285551063483581, "reward_std": 0.6114959623664618, "rewards/cosine_scaled_reward": -0.09274013433605433, "rewards/format_reward": 0.4375000074505806, "step": 23 }, { "advantage_max": 1.2170175276696682, "advantage_mean": -9.623666863411984e-09, "advantage_min": -1.0881583616137505, "advantage_std": 0.8654856085777283, "completion_length": 2938.187515258789, "epoch": 0.027428571428571427, "grad_norm": 0.14002735912799835, "kl": 2.0820647478103638e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.8e-07, "loss": 0.0312, "reward": 0.36279288213700056, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.36279288213700056, "reward_after_std": 0.8654856085777283, "reward_before_mean": 0.5922489315271378, "reward_before_std": 0.924515713006258, "reward_change_max": 0.00095406174659729, "reward_change_mean": -0.22945604752749205, "reward_change_min": -0.4770869705826044, "reward_change_std": 0.19559224974364042, "reward_std": 0.8654856272041798, "rewards/cosine_scaled_reward": 0.07737446296960115, "rewards/format_reward": 0.4375000111758709, "step": 24 }, { "advantage_max": 1.1091482378542423, "advantage_mean": 8.071462720415923e-09, "advantage_min": -0.8267083689570427, "advantage_std": 0.734721839427948, "completion_length": 2926.895866394043, "epoch": 0.02857142857142857, "grad_norm": 0.1179434061050415, "kl": 2.8535723686218262e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 5e-07, "loss": 0.0251, "reward": 0.0650341846048832, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0650341846048832, "reward_after_std": 0.7347218468785286, "reward_before_mean": 0.2364666946232319, "reward_before_std": 0.7685037739574909, "reward_change_max": 0.00040397047996520996, "reward_change_mean": -0.17143250722438097, "reward_change_min": -0.36208393797278404, "reward_change_std": 0.14850584138184786, "reward_std": 0.7347218468785286, "rewards/cosine_scaled_reward": -0.0900999871082604, "rewards/format_reward": 0.4166666753590107, "step": 25 }, { "advantage_max": 0.8768724426627159, "advantage_mean": 1.862645371275562e-09, "advantage_min": -0.7183658257126808, "advantage_std": 0.5621801391243935, "completion_length": 2866.312530517578, "epoch": 0.029714285714285714, "grad_norm": 0.08625730872154236, "kl": 2.384372055530548e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.2e-07, "loss": 0.022, "reward": 0.16878714319318533, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16878714319318533, "reward_after_std": 0.5621801353991032, "reward_before_mean": 0.372440692037344, "reward_before_std": 0.5589241124689579, "reward_change_max": 0.0, "reward_change_mean": -0.20365354581736028, "reward_change_min": -0.3272699285298586, "reward_change_std": 0.13190524163655937, "reward_std": 0.5621801353991032, "rewards/cosine_scaled_reward": -0.06377965398132801, "rewards/format_reward": 0.5000000055879354, "step": 26 }, { "advantage_max": 0.8886187560856342, "advantage_mean": 1.396983873025448e-08, "advantage_min": -0.8229356035590172, "advantage_std": 0.6434473972767591, "completion_length": 3056.1875915527344, "epoch": 0.030857142857142857, "grad_norm": 0.11315359175205231, "kl": 2.475641667842865e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.4e-07, "loss": 0.0579, "reward": 0.08739247173070908, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08739247173070908, "reward_after_std": 0.6434473991394043, "reward_before_mean": 0.27392338775098324, "reward_before_std": 0.6801570225507021, "reward_change_max": 0.00012511759996414185, "reward_change_mean": -0.18653089553117752, "reward_change_min": -0.3508267253637314, "reward_change_std": 0.1480486923828721, "reward_std": 0.6434474475681782, "rewards/cosine_scaled_reward": -0.10262164659798145, "rewards/format_reward": 0.47916667722165585, "step": 27 }, { "advantage_max": 1.2374885007739067, "advantage_mean": -2.1109978542988017e-08, "advantage_min": -0.9062735512852669, "advantage_std": 0.8239960931241512, "completion_length": 2890.479232788086, "epoch": 0.032, "grad_norm": 0.13564632833003998, "kl": 3.2667070627212524e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.6e-07, "loss": 0.0542, "reward": 0.1311190624255687, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1311190624255687, "reward_after_std": 0.8239960968494415, "reward_before_mean": 0.3119109673425555, "reward_before_std": 0.8713027779012918, "reward_change_max": 4.5612454414367676e-05, "reward_change_mean": -0.1807919372804463, "reward_change_min": -0.38250917196273804, "reward_change_std": 0.15657383343204856, "reward_std": 0.823996152728796, "rewards/cosine_scaled_reward": -0.03154451958835125, "rewards/format_reward": 0.37500000558793545, "step": 28 }, { "advantage_max": 0.9986958801746368, "advantage_mean": 1.428027990302283e-08, "advantage_min": -0.6387606337666512, "advantage_std": 0.6297495402395725, "completion_length": 3118.604248046875, "epoch": 0.03314285714285714, "grad_norm": 0.17339561879634857, "kl": 3.26354056596756e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.8e-07, "loss": 0.0888, "reward": -0.21651708334684372, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21651708334684372, "reward_after_std": 0.6297495476901531, "reward_before_mean": -0.09916665032505989, "reward_before_std": 0.6555814929306507, "reward_change_max": 0.0007281675934791565, "reward_change_mean": -0.11735043581575155, "reward_change_min": -0.27948011085391045, "reward_change_std": 0.11293295910581946, "reward_std": 0.6297495514154434, "rewards/cosine_scaled_reward": -0.17458332516252995, "rewards/format_reward": 0.25000000186264515, "step": 29 }, { "advantage_max": 1.3741733878850937, "advantage_mean": -1.6142924885720333e-08, "advantage_min": -0.8780368641018867, "advantage_std": 0.8414683490991592, "completion_length": 2939.979217529297, "epoch": 0.03428571428571429, "grad_norm": 0.20708495378494263, "kl": 3.0003488063812256e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6e-07, "loss": 0.0433, "reward": 0.1927310898900032, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1927310898900032, "reward_after_std": 0.841468371450901, "reward_before_mean": 0.38108139019459486, "reward_before_std": 0.8596030510962009, "reward_change_max": 0.00038279592990875244, "reward_change_mean": -0.18835033057257533, "reward_change_min": -0.41781171411275864, "reward_change_std": 0.159056534525007, "reward_std": 0.841468371450901, "rewards/cosine_scaled_reward": -0.04904262721538544, "rewards/format_reward": 0.47916667349636555, "step": 30 }, { "advantage_max": 0.9357012398540974, "advantage_mean": 5.277494247168946e-09, "advantage_min": -0.642534114420414, "advantage_std": 0.5949853956699371, "completion_length": 3138.9791870117188, "epoch": 0.03542857142857143, "grad_norm": 0.08425452560186386, "kl": 2.364814281463623e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.2e-07, "loss": 0.0358, "reward": -0.22145959362387657, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.22145959362387657, "reward_after_std": 0.5949854031205177, "reward_before_mean": -0.10198161378502846, "reward_before_std": 0.6191553995013237, "reward_change_max": 0.00035628676414489746, "reward_change_mean": -0.11947800125926733, "reward_change_min": -0.263500964269042, "reward_change_std": 0.10806434508413076, "reward_std": 0.5949854254722595, "rewards/cosine_scaled_reward": -0.15515746362507343, "rewards/format_reward": 0.20833334140479565, "step": 31 }, { "advantage_max": 1.1576483212411404, "advantage_mean": -5.587936058315535e-09, "advantage_min": -0.7510269209742546, "advantage_std": 0.7068486250936985, "completion_length": 3073.479202270508, "epoch": 0.036571428571428574, "grad_norm": 0.13992291688919067, "kl": 2.5810906663537025e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.4e-07, "loss": 0.0716, "reward": 0.07075399532914162, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07075399532914162, "reward_after_std": 0.7068486288189888, "reward_before_mean": 0.24183613061904907, "reward_before_std": 0.7166659869253635, "reward_change_max": 0.0007139146327972412, "reward_change_mean": -0.17108213249593973, "reward_change_min": -0.33107252046465874, "reward_change_std": 0.130351847037673, "reward_std": 0.7068486325442791, "rewards/cosine_scaled_reward": -0.05616527982056141, "rewards/format_reward": 0.35416667349636555, "step": 32 }, { "advantage_max": 1.3330436423420906, "advantage_mean": 1.0554989660072067e-08, "advantage_min": -0.6214881204068661, "advantage_std": 0.7514338865876198, "completion_length": 3367.8125610351562, "epoch": 0.037714285714285714, "grad_norm": 0.1050930917263031, "kl": 2.8207898139953613e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.6e-07, "loss": 0.0292, "reward": -0.3225663788616657, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3225663788616657, "reward_after_std": 0.7514338567852974, "reward_before_mean": -0.24029204389080405, "reward_before_std": 0.7680000029504299, "reward_change_max": 0.0005582571029663086, "reward_change_mean": -0.08227434731088579, "reward_change_min": -0.2104458101093769, "reward_change_std": 0.08564048493281007, "reward_std": 0.7514339052140713, "rewards/cosine_scaled_reward": -0.2243126891553402, "rewards/format_reward": 0.2083333395421505, "step": 33 }, { "advantage_max": 1.2616764344274998, "advantage_mean": -1.8626452047421083e-08, "advantage_min": -1.1620574593544006, "advantage_std": 0.853287111967802, "completion_length": 2487.250030517578, "epoch": 0.038857142857142854, "grad_norm": 0.14598548412322998, "kl": 8.910894393920898e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.800000000000001e-07, "loss": 0.0377, "reward": 0.498631214722991, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.498631214722991, "reward_after_std": 0.8532871417701244, "reward_before_mean": 0.7552381753921509, "reward_before_std": 0.8858068697154522, "reward_change_max": 0.0, "reward_change_mean": -0.2566069485619664, "reward_change_min": -0.4700592402368784, "reward_change_std": 0.18954872153699398, "reward_std": 0.8532871417701244, "rewards/cosine_scaled_reward": 0.09636908024549484, "rewards/format_reward": 0.5625000149011612, "step": 34 }, { "advantage_max": 1.4441743902862072, "advantage_mean": 1.4280279958533981e-08, "advantage_min": -0.7700625658035278, "advantage_std": 0.8615536298602819, "completion_length": 2993.5000228881836, "epoch": 0.04, "grad_norm": 0.17307226359844208, "kl": 9.259767830371857e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 7e-07, "loss": 0.0878, "reward": -0.000494837760925293, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.000494837760925293, "reward_after_std": 0.8615536224097013, "reward_before_mean": 0.14388340152800083, "reward_before_std": 0.8852962348610163, "reward_change_max": 0.0008604899048805237, "reward_change_mean": -0.14437821554020047, "reward_change_min": -0.3321828208863735, "reward_change_std": 0.13181519228965044, "reward_std": 0.8615536373108625, "rewards/cosine_scaled_reward": -0.07389164250344038, "rewards/format_reward": 0.2916666716337204, "step": 35 }, { "advantage_max": 1.457282193005085, "advantage_mean": 1.4590720354146214e-08, "advantage_min": -0.6018553003668785, "advantage_std": 0.7696587704122066, "completion_length": 3352.562530517578, "epoch": 0.04114285714285714, "grad_norm": 0.13112565875053406, "kl": 8.600763976573944e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.2e-07, "loss": 0.0186, "reward": -0.18987324181944132, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.18987324181944132, "reward_after_std": 0.7696587704122066, "reward_before_mean": -0.08600875362753868, "reward_before_std": 0.7551218904554844, "reward_change_max": 0.00021064281463623047, "reward_change_mean": -0.10386447329074144, "reward_change_min": -0.19213777408003807, "reward_change_std": 0.0765054477378726, "reward_std": 0.7696587890386581, "rewards/cosine_scaled_reward": -0.14717104472219944, "rewards/format_reward": 0.20833334140479565, "step": 36 }, { "advantage_max": 0.9269507825374603, "advantage_mean": 8.692345010441471e-09, "advantage_min": -0.6724393144249916, "advantage_std": 0.5886142291128635, "completion_length": 3394.2291870117188, "epoch": 0.04228571428571429, "grad_norm": 0.110660620033741, "kl": 3.93986701965332e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.4e-07, "loss": 0.014, "reward": -0.2714567966759205, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2714567966759205, "reward_after_std": 0.5886142235249281, "reward_before_mean": -0.16160957142710686, "reward_before_std": 0.6144555695354939, "reward_change_max": 0.001416221261024475, "reward_change_mean": -0.10984723456203938, "reward_change_min": -0.25573337636888027, "reward_change_std": 0.10645729210227728, "reward_std": 0.5886142309755087, "rewards/cosine_scaled_reward": -0.1849714510026388, "rewards/format_reward": 0.2083333395421505, "step": 37 }, { "advantage_max": 1.0253938995301723, "advantage_mean": -5.587935225648266e-09, "advantage_min": -0.7097817361354828, "advantage_std": 0.6563874427229166, "completion_length": 3116.916717529297, "epoch": 0.04342857142857143, "grad_norm": 0.12824492156505585, "kl": 6.318045780062675e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.599999999999999e-07, "loss": 0.0272, "reward": -0.105198223143816, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.105198223143816, "reward_after_std": 0.6563874669373035, "reward_before_mean": 0.034443892538547516, "reward_before_std": 0.6834176257252693, "reward_change_max": 0.0, "reward_change_mean": -0.13964211801066995, "reward_change_min": -0.32228403724730015, "reward_change_std": 0.12572575081139803, "reward_std": 0.6563874799758196, "rewards/cosine_scaled_reward": -0.09736138768494129, "rewards/format_reward": 0.2291666679084301, "step": 38 }, { "advantage_max": 0.9744839183986187, "advantage_mean": -5.551115123125783e-17, "advantage_min": -0.5584886260330677, "advantage_std": 0.5579538755118847, "completion_length": 2887.0000534057617, "epoch": 0.044571428571428574, "grad_norm": 0.0860845297574997, "kl": 5.842745304107666e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.799999999999999e-07, "loss": 0.0185, "reward": 0.2523145619779825, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2523145619779825, "reward_after_std": 0.5579538717865944, "reward_before_mean": 0.46960435644723475, "reward_before_std": 0.5118371807038784, "reward_change_max": 2.5331974029541016e-05, "reward_change_mean": -0.21728977281600237, "reward_change_min": -0.3160447757691145, "reward_change_std": 0.12963856477290392, "reward_std": 0.5579538978636265, "rewards/cosine_scaled_reward": 0.01605216972529888, "rewards/format_reward": 0.4375000074505806, "step": 39 }, { "advantage_max": 1.1473079845309258, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.8341499119997025, "advantage_std": 0.69804672524333, "completion_length": 2611.416702270508, "epoch": 0.045714285714285714, "grad_norm": 0.11654925346374512, "kl": 0.000302337110042572, "lambda_div_used": 0.7999999999999999, "learning_rate": 8e-07, "loss": 0.0178, "reward": 0.2087826275965199, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2087826275965199, "reward_after_std": 0.69804672524333, "reward_before_mean": 0.4090485665947199, "reward_before_std": 0.7019842527806759, "reward_change_max": 0.0007131621241569519, "reward_change_mean": -0.20026592072099447, "reward_change_min": -0.3661715518683195, "reward_change_std": 0.14094401663169265, "reward_std": 0.6980467662215233, "rewards/cosine_scaled_reward": -0.06630906602367759, "rewards/format_reward": 0.541666679084301, "step": 40 }, { "advantage_max": 1.4810020998120308, "advantage_mean": -1.7384688355548406e-08, "advantage_min": -0.714835986495018, "advantage_std": 0.8342695869505405, "completion_length": 3090.0833740234375, "epoch": 0.046857142857142854, "grad_norm": 0.14438189566135406, "kl": 8.465877908747643e-05, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.199999999999999e-07, "loss": 0.0487, "reward": 0.01593086402863264, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.01593086402863264, "reward_after_std": 0.8342696018517017, "reward_before_mean": 0.1621880028396845, "reward_before_std": 0.8327867910265923, "reward_change_max": 0.0006985217332839966, "reward_change_mean": -0.14625715790316463, "reward_change_min": -0.26647657714784145, "reward_change_std": 0.10887052223552018, "reward_std": 0.8342696204781532, "rewards/cosine_scaled_reward": -0.12723934021778405, "rewards/format_reward": 0.4166666753590107, "step": 41 }, { "advantage_max": 0.7178252972662449, "advantage_mean": 2.1730860721991263e-09, "advantage_min": -0.47675967030227184, "advantage_std": 0.43735584430396557, "completion_length": 2747.6458644866943, "epoch": 0.048, "grad_norm": 0.07471238076686859, "kl": 0.00010556727647781372, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.399999999999999e-07, "loss": 0.0261, "reward": -0.26368879806250334, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": -0.26368879806250334, "reward_after_std": 0.4373558573424816, "reward_before_mean": -0.144127176143229, "reward_before_std": 0.4361735610291362, "reward_change_max": 0.0002202838659286499, "reward_change_mean": -0.11956162098795176, "reward_change_min": -0.22520782612264156, "reward_change_std": 0.08732934622094035, "reward_std": 0.4373558759689331, "rewards/cosine_scaled_reward": -0.22831358574330807, "rewards/format_reward": 0.31250000186264515, "step": 42 }, { "advantage_max": 1.001953613013029, "advantage_mean": -3.1044088966147854e-09, "advantage_min": -0.7429283708333969, "advantage_std": 0.6502887606620789, "completion_length": 2913.9792404174805, "epoch": 0.04914285714285714, "grad_norm": 0.09054851531982422, "kl": 0.0001889900304377079, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.599999999999999e-07, "loss": 0.0263, "reward": -0.07786577939987183, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07786577939987183, "reward_after_std": 0.6502887643873692, "reward_before_mean": 0.06831886945292354, "reward_before_std": 0.6769301258027554, "reward_change_max": 0.0004934445023536682, "reward_change_mean": -0.14618465770035982, "reward_change_min": -0.30024728551506996, "reward_change_std": 0.12345949094742537, "reward_std": 0.6502887941896915, "rewards/cosine_scaled_reward": -0.13250724039971828, "rewards/format_reward": 0.33333333767950535, "step": 43 }, { "advantage_max": 1.0924378596246243, "advantage_mean": -6.829698584454036e-09, "advantage_min": -0.7103950157761574, "advantage_std": 0.701549582183361, "completion_length": 2740.5000228881836, "epoch": 0.05028571428571429, "grad_norm": 0.10049757361412048, "kl": 0.00028192251920700073, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.799999999999999e-07, "loss": 0.023, "reward": 0.3650970458984375, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3650970458984375, "reward_after_std": 0.7015495970845222, "reward_before_mean": 0.6003521336242557, "reward_before_std": 0.7036599554121494, "reward_change_max": 0.00017582625150680542, "reward_change_mean": -0.23525508772581816, "reward_change_min": -0.4026178065687418, "reward_change_std": 0.16161232814192772, "reward_std": 0.7015496119856834, "rewards/cosine_scaled_reward": 0.07100939005613327, "rewards/format_reward": 0.45833334140479565, "step": 44 }, { "advantage_max": 1.2479462437331676, "advantage_mean": 1.5522041152404853e-09, "advantage_min": -0.6663694195449352, "advantage_std": 0.698578417301178, "completion_length": 3311.7083435058594, "epoch": 0.05142857142857143, "grad_norm": 0.10764732211828232, "kl": 0.0002221427857875824, "lambda_div_used": 0.7999999999999999, "learning_rate": 9e-07, "loss": 0.0216, "reward": -0.09240207821130753, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.09240207821130753, "reward_after_std": 0.6985784359276295, "reward_before_mean": 0.04128584871068597, "reward_before_std": 0.6907168067991734, "reward_change_max": 0.0010211840271949768, "reward_change_mean": -0.1336879152804613, "reward_change_min": -0.23977475240826607, "reward_change_std": 0.09814670775085688, "reward_std": 0.6985784731805325, "rewards/cosine_scaled_reward": -0.08352374797686934, "rewards/format_reward": 0.20833334140479565, "step": 45 }, { "advantage_max": 0.7752528265118599, "advantage_mean": 1.5211602311104855e-08, "advantage_min": -0.46505314111709595, "advantage_std": 0.44844716414809227, "completion_length": 3227.1875, "epoch": 0.052571428571428575, "grad_norm": 0.06010043993592262, "kl": 0.000321313738822937, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.2e-07, "loss": 0.0029, "reward": -0.36389969661831856, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.36389969661831856, "reward_after_std": 0.4484471455216408, "reward_before_mean": -0.26696554012596607, "reward_before_std": 0.4430927708745003, "reward_change_max": 0.0003152713179588318, "reward_change_mean": -0.09693415294168517, "reward_change_min": -0.1783353015780449, "reward_change_std": 0.07187035365495831, "reward_std": 0.44844716414809227, "rewards/cosine_scaled_reward": -0.20639943529386073, "rewards/format_reward": 0.14583333395421505, "step": 46 }, { "advantage_max": 1.1925027072429657, "advantage_mean": 1.2417634698280722e-08, "advantage_min": -1.0193697214126587, "advantage_std": 0.8486138731241226, "completion_length": 2712.2500381469727, "epoch": 0.053714285714285714, "grad_norm": 0.1182965636253357, "kl": 0.0005766120739281178, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.399999999999999e-07, "loss": 0.0368, "reward": 0.4009051900357008, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4009051900357008, "reward_after_std": 0.8486138731241226, "reward_before_mean": 0.6394723262637854, "reward_before_std": 0.8996778801083565, "reward_change_max": 0.00031866878271102905, "reward_change_mean": -0.23856708593666553, "reward_change_min": -0.46208037063479424, "reward_change_std": 0.19554195366799831, "reward_std": 0.8486139066517353, "rewards/cosine_scaled_reward": 0.05931948032230139, "rewards/format_reward": 0.5208333395421505, "step": 47 }, { "advantage_max": 0.9778347946703434, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -0.6665457040071487, "advantage_std": 0.5934563688933849, "completion_length": 2835.291717529297, "epoch": 0.054857142857142854, "grad_norm": 0.10436692088842392, "kl": 0.00103018619120121, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.6e-07, "loss": -0.0174, "reward": -0.07658016681671143, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07658016681671143, "reward_after_std": 0.5934563763439655, "reward_before_mean": 0.07092782482504845, "reward_before_std": 0.5976646542549133, "reward_change_max": 0.0015726611018180847, "reward_change_mean": -0.1475080167874694, "reward_change_min": -0.2721630148589611, "reward_change_std": 0.11156754847615957, "reward_std": 0.5934563837945461, "rewards/cosine_scaled_reward": -0.16245274990797043, "rewards/format_reward": 0.39583333767950535, "step": 48 }, { "advantage_max": 1.3865382000803947, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.7386491149663925, "advantage_std": 0.8016112931072712, "completion_length": 2492.8125610351562, "epoch": 0.056, "grad_norm": 0.11641683429479599, "kl": 0.0006042793393135071, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.8e-07, "loss": 0.0575, "reward": 0.07680624630302191, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07680624630302191, "reward_after_std": 0.8016112931072712, "reward_before_mean": 0.24038056656718254, "reward_before_std": 0.7961763925850391, "reward_change_max": 0.0004246160387992859, "reward_change_mean": -0.16357432678341866, "reward_change_min": -0.3487722110003233, "reward_change_std": 0.13505406212061644, "reward_std": 0.8016113005578518, "rewards/cosine_scaled_reward": -0.15064305157284252, "rewards/format_reward": 0.5416666716337204, "step": 49 }, { "advantage_max": 0.9396346509456635, "advantage_mean": -6.208816238917336e-10, "advantage_min": -0.6733806990087032, "advantage_std": 0.6209905967116356, "completion_length": 2896.000030517578, "epoch": 0.05714285714285714, "grad_norm": 0.10556609183549881, "kl": 0.0005448460578918457, "lambda_div_used": 0.7999999999999999, "learning_rate": 1e-06, "loss": 0.0318, "reward": 0.1492229625582695, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1492229625582695, "reward_after_std": 0.6209906078875065, "reward_before_mean": 0.34668817464262247, "reward_before_std": 0.6258712746202946, "reward_change_max": 0.0001476779580116272, "reward_change_mean": -0.19746522279456258, "reward_change_min": -0.38045971281826496, "reward_change_std": 0.15171885024756193, "reward_std": 0.6209906190633774, "rewards/cosine_scaled_reward": -0.0037392508238554, "rewards/format_reward": 0.3541666679084301, "step": 50 }, { "advantage_max": 1.068737480789423, "advantage_mean": 9.313230187046884e-10, "advantage_min": -0.7626629211008549, "advantage_std": 0.6865537837147713, "completion_length": 2353.854202270508, "epoch": 0.05828571428571429, "grad_norm": 0.13578957319259644, "kl": 0.0013544261455535889, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.999890338174275e-07, "loss": 0.0685, "reward": 0.1607223842293024, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1607223842293024, "reward_after_std": 0.6865537688136101, "reward_before_mean": 0.35505437199026346, "reward_before_std": 0.7007402144372463, "reward_change_max": 0.00023029744625091553, "reward_change_mean": -0.19433199614286423, "reward_change_min": -0.3676822017878294, "reward_change_std": 0.14552877843379974, "reward_std": 0.6865537911653519, "rewards/cosine_scaled_reward": -0.09330614097416401, "rewards/format_reward": 0.5416666734963655, "step": 51 }, { "advantage_max": 1.9176241979002953, "advantage_mean": -4.967053768289986e-09, "advantage_min": -1.0883928909897804, "advantage_std": 1.1372978007420897, "completion_length": 3029.875030517578, "epoch": 0.05942857142857143, "grad_norm": 0.15842178463935852, "kl": 0.0014321208000183105, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.999561358041868e-07, "loss": 0.0321, "reward": 0.38214721204712987, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.38214721204712987, "reward_after_std": 1.1372978007420897, "reward_before_mean": 0.5866053029894829, "reward_before_std": 1.1619214955717325, "reward_change_max": 0.0004972591996192932, "reward_change_mean": -0.20445812959223986, "reward_change_min": -0.44848377257585526, "reward_change_std": 0.17908359714783728, "reward_std": 1.1372978081926703, "rewards/cosine_scaled_reward": 0.0745526600512676, "rewards/format_reward": 0.4375000037252903, "step": 52 }, { "advantage_max": 1.7308011874556541, "advantage_mean": -6.208817182606907e-09, "advantage_min": -0.8654607012867928, "advantage_std": 1.007324229925871, "completion_length": 2824.854202270508, "epoch": 0.060571428571428575, "grad_norm": 0.1322852522134781, "kl": 0.0005290806293487549, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.999013075636804e-07, "loss": 0.0116, "reward": 0.11934852181002498, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11934852181002498, "reward_after_std": 1.007324229925871, "reward_before_mean": 0.27733908654772677, "reward_before_std": 1.0268505774438381, "reward_change_max": 0.00027889758348464966, "reward_change_mean": -0.15799055946990848, "reward_change_min": -0.35294494964182377, "reward_change_std": 0.13888469664379954, "reward_std": 1.0073242671787739, "rewards/cosine_scaled_reward": -0.06966379517689347, "rewards/format_reward": 0.4166666753590107, "step": 53 }, { "advantage_max": 1.3603114746510983, "advantage_mean": -1.3038516599728212e-08, "advantage_min": -0.9828024581074715, "advantage_std": 0.9257772825658321, "completion_length": 2847.8333892822266, "epoch": 0.061714285714285715, "grad_norm": 0.15449616312980652, "kl": 0.0002994425594806671, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.998245517681593e-07, "loss": 0.0899, "reward": 0.4370972993783653, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4370972993783653, "reward_after_std": 0.9257773272693157, "reward_before_mean": 0.6762383666355163, "reward_before_std": 0.9667139928787947, "reward_change_max": 0.00035862624645233154, "reward_change_mean": -0.23914109449833632, "reward_change_min": -0.4856687095016241, "reward_change_std": 0.19955481635406613, "reward_std": 0.9257773421704769, "rewards/cosine_scaled_reward": 0.10895252972841263, "rewards/format_reward": 0.45833334140479565, "step": 54 }, { "advantage_max": 1.2908118069171906, "advantage_mean": -9.934107814135729e-09, "advantage_min": -0.8041025288403034, "advantage_std": 0.8428567498922348, "completion_length": 3042.5209197998047, "epoch": 0.06285714285714286, "grad_norm": 0.12463443726301193, "kl": 0.0011940151453018188, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.997258721585931e-07, "loss": 0.0166, "reward": 0.18334292899817228, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18334292899817228, "reward_after_std": 0.84285675175488, "reward_before_mean": 0.37264115549623966, "reward_before_std": 0.8806403130292892, "reward_change_max": 0.0001153796911239624, "reward_change_mean": -0.1892982078716159, "reward_change_min": -0.45491757802665234, "reward_change_std": 0.1718091967049986, "reward_std": 0.8428567964583635, "rewards/cosine_scaled_reward": -0.011596103897318244, "rewards/format_reward": 0.39583333767950535, "step": 55 }, { "advantage_max": 0.8434501104056835, "advantage_mean": 1.365939800157534e-08, "advantage_min": -0.6591260954737663, "advantage_std": 0.5589417479932308, "completion_length": 3003.479202270508, "epoch": 0.064, "grad_norm": 0.08141618967056274, "kl": 0.0008943080902099609, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.996052735444862e-07, "loss": 0.0338, "reward": 0.1315749492496252, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1315749492496252, "reward_after_std": 0.5589417517185211, "reward_before_mean": 0.3291710652410984, "reward_before_std": 0.5639232508838177, "reward_change_max": 0.0011869743466377258, "reward_change_mean": -0.19759611319750547, "reward_change_min": -0.3288336955010891, "reward_change_std": 0.13926798570901155, "reward_std": 0.5589417666196823, "rewards/cosine_scaled_reward": -0.054164471104741096, "rewards/format_reward": 0.43750001303851604, "step": 56 }, { "advantage_max": 1.089033953845501, "advantage_mean": -4.656612206943578e-09, "advantage_min": -0.49081940576434135, "advantage_std": 0.6278907656669617, "completion_length": 3390.875, "epoch": 0.06514285714285714, "grad_norm": 0.09139782935380936, "kl": 0.0003075897693634033, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.994627618036452e-07, "loss": 0.004, "reward": -0.013246476650238037, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.013246476650238037, "reward_after_std": 0.6278907507658005, "reward_before_mean": 0.14421947114169598, "reward_before_std": 0.6174965221434832, "reward_change_max": 0.0004798024892807007, "reward_change_mean": -0.15746596161625348, "reward_change_min": -0.30477464385330677, "reward_change_std": 0.11690208106301725, "reward_std": 0.6278907507658005, "rewards/cosine_scaled_reward": -0.09455694165080786, "rewards/format_reward": 0.33333334140479565, "step": 57 }, { "advantage_max": 1.414633721113205, "advantage_mean": -1.8005570256995895e-08, "advantage_min": -0.7923170626163483, "advantage_std": 0.8777213655412197, "completion_length": 2450.2709045410156, "epoch": 0.06628571428571428, "grad_norm": 0.14690345525741577, "kl": 0.005219757556915283, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.992983438818915e-07, "loss": 0.0509, "reward": 0.4775881599634886, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4775881599634886, "reward_after_std": 0.87772136926651, "reward_before_mean": 0.7227345779538155, "reward_before_std": 0.8774426877498627, "reward_change_max": 0.0, "reward_change_mean": -0.24514640774577856, "reward_change_min": -0.48581127263605595, "reward_change_std": 0.18520902004092932, "reward_std": 0.8777213841676712, "rewards/cosine_scaled_reward": 0.05928393825888634, "rewards/format_reward": 0.6041666734963655, "step": 58 }, { "advantage_max": 0.8441104851663113, "advantage_mean": 4.9670538238011375e-09, "advantage_min": -0.4297443553805351, "advantage_std": 0.4777205139398575, "completion_length": 3034.8958587646484, "epoch": 0.06742857142857143, "grad_norm": 0.07107719033956528, "kl": 0.0010358989238739014, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.991120277927223e-07, "loss": 0.0185, "reward": -0.050955090671777725, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.050955090671777725, "reward_after_std": 0.4777205176651478, "reward_before_mean": 0.10805439203977585, "reward_before_std": 0.4464508257806301, "reward_change_max": 0.0, "reward_change_mean": -0.15900947572663426, "reward_change_min": -0.2582306321710348, "reward_change_std": 0.10319915041327477, "reward_std": 0.4777205251157284, "rewards/cosine_scaled_reward": -0.0918061351403594, "rewards/format_reward": 0.2916666679084301, "step": 59 }, { "advantage_max": 0.9713038932532072, "advantage_mean": -1.2107193969601582e-08, "advantage_min": -0.7591291554272175, "advantage_std": 0.6690347250550985, "completion_length": 3079.166702270508, "epoch": 0.06857142857142857, "grad_norm": 0.09675233066082001, "kl": 0.0005219876766204834, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.989038226169207e-07, "loss": 0.0241, "reward": 0.009758900851011276, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.009758900851011276, "reward_after_std": 0.6690347408875823, "reward_before_mean": 0.17622682079672813, "reward_before_std": 0.7048610388301313, "reward_change_max": 0.00023337453603744507, "reward_change_mean": -0.16646793112158775, "reward_change_min": -0.3498587552458048, "reward_change_std": 0.1466082762926817, "reward_std": 0.6690347520634532, "rewards/cosine_scaled_reward": -0.08896992891095579, "rewards/format_reward": 0.35416667349636555, "step": 60 }, { "advantage_max": 1.0436152890324593, "advantage_mean": 1.0244548459548142e-08, "advantage_min": -0.800739947706461, "advantage_std": 0.7454509846866131, "completion_length": 3407.500030517578, "epoch": 0.06971428571428571, "grad_norm": 0.1349206119775772, "kl": 0.0010833144187927246, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.98673738502114e-07, "loss": 0.042, "reward": -0.02641453593969345, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.02641453593969345, "reward_after_std": 0.7454509921371937, "reward_before_mean": 0.12944853049702942, "reward_before_std": 0.8043582141399384, "reward_change_max": 0.0004320964217185974, "reward_change_mean": -0.1558630745857954, "reward_change_min": -0.3698664717376232, "reward_change_std": 0.1559902993030846, "reward_std": 0.7454509995877743, "rewards/cosine_scaled_reward": -0.0915257316082716, "rewards/format_reward": 0.31250000558793545, "step": 61 }, { "advantage_max": 1.419227011501789, "advantage_mean": -1.0554989049449404e-08, "advantage_min": -0.8647123798727989, "advantage_std": 0.876304779201746, "completion_length": 2578.2917404174805, "epoch": 0.07085714285714285, "grad_norm": 0.13538189232349396, "kl": 0.002939268946647644, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.98421786662277e-07, "loss": 0.0381, "reward": 0.6961326766759157, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6961326766759157, "reward_after_std": 0.876304779201746, "reward_before_mean": 0.9869858250021935, "reward_before_std": 0.8612634465098381, "reward_change_max": 0.00021763890981674194, "reward_change_mean": -0.29085311479866505, "reward_change_min": -0.5205560252070427, "reward_change_std": 0.20150170475244522, "reward_std": 0.8763048034161329, "rewards/cosine_scaled_reward": 0.21224289759993553, "rewards/format_reward": 0.5625000018626451, "step": 62 }, { "advantage_max": 1.0165626257658005, "advantage_mean": -1.614292566287645e-08, "advantage_min": -0.9420815035700798, "advantage_std": 0.6958711137995124, "completion_length": 2271.916748046875, "epoch": 0.072, "grad_norm": 0.13307465612888336, "kl": 0.0019051730632781982, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.981479793771866e-07, "loss": 0.06, "reward": 0.590453858487308, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.590453858487308, "reward_after_std": 0.6958711436018348, "reward_before_mean": 0.8764481358230114, "reward_before_std": 0.697451738640666, "reward_change_max": 0.0, "reward_change_mean": -0.2859942736104131, "reward_change_min": -0.47236127965152264, "reward_change_std": 0.1900925375521183, "reward_std": 0.695871158502996, "rewards/cosine_scaled_reward": 0.0736407395452261, "rewards/format_reward": 0.729166679084301, "step": 63 }, { "advantage_max": 1.2714868448674679, "advantage_mean": 3.104408563547878e-09, "advantage_min": -0.8228170573711395, "advantage_std": 0.8133545722812414, "completion_length": 3238.791717529297, "epoch": 0.07314285714285715, "grad_norm": 0.14776557683944702, "kl": 0.0017735958099365234, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.97852329991824e-07, "loss": 0.0858, "reward": -0.023980196565389633, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.023980196565389633, "reward_after_std": 0.8133545592427254, "reward_before_mean": 0.12304234690964222, "reward_before_std": 0.8517117649316788, "reward_change_max": 0.00044614821672439575, "reward_change_mean": -0.1470225341618061, "reward_change_min": -0.35077216662466526, "reward_change_std": 0.14567347755655646, "reward_std": 0.8133545778691769, "rewards/cosine_scaled_reward": -0.06347882654517889, "rewards/format_reward": 0.2500000074505806, "step": 64 }, { "advantage_max": 1.1495383195579052, "advantage_mean": -8.071462553882469e-09, "advantage_min": -0.699254222214222, "advantage_std": 0.7099717799574137, "completion_length": 2805.708366394043, "epoch": 0.07428571428571429, "grad_norm": 0.11072947084903717, "kl": 0.0030024051666259766, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.975348529157229e-07, "loss": 0.0085, "reward": 0.17770222015678883, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17770222015678883, "reward_after_std": 0.7099717650562525, "reward_before_mean": 0.3720225850120187, "reward_before_std": 0.7126198261976242, "reward_change_max": 0.000616706907749176, "reward_change_mean": -0.19432037486694753, "reward_change_min": -0.40575859509408474, "reward_change_std": 0.15412551956251264, "reward_std": 0.7099717818200588, "rewards/cosine_scaled_reward": -0.04315537726506591, "rewards/format_reward": 0.4583333395421505, "step": 65 }, { "advantage_max": 0.7441215887665749, "advantage_mean": -2.1730860277902053e-08, "advantage_min": -0.38368740398436785, "advantage_std": 0.42277249693870544, "completion_length": 2110.9583473205566, "epoch": 0.07542857142857143, "grad_norm": 0.050503313541412354, "kl": 0.004761695861816406, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.971955636222684e-07, "loss": 0.0023, "reward": 0.23776982817798853, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23776982817798853, "reward_after_std": 0.42277250438928604, "reward_before_mean": 0.4610498445108533, "reward_before_std": 0.3607384217903018, "reward_change_max": 0.0003524795174598694, "reward_change_mean": -0.22328003775328398, "reward_change_min": -0.32576517947018147, "reward_change_std": 0.12836134992539883, "reward_std": 0.42277251556515694, "rewards/cosine_scaled_reward": -0.019475062377750874, "rewards/format_reward": 0.5, "step": 66 }, { "advantage_max": 0.7842137180268764, "advantage_mean": 1.5211602949483094e-08, "advantage_min": -0.3861103430390358, "advantage_std": 0.44835129380226135, "completion_length": 3445.0, "epoch": 0.07657142857142857, "grad_norm": 0.06733589619398117, "kl": 0.0021102428436279297, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.968344786479415e-07, "loss": -0.0079, "reward": -0.5162655003368855, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5162655003368855, "reward_after_std": 0.44835130125284195, "reward_before_mean": -0.45139221101999283, "reward_before_std": 0.4538105260580778, "reward_change_max": 0.000628940761089325, "reward_change_mean": -0.06487329420633614, "reward_change_min": -0.1644220780581236, "reward_change_std": 0.06198963453061879, "reward_std": 0.44835131987929344, "rewards/cosine_scaled_reward": -0.2777794389985502, "rewards/format_reward": 0.10416666977107525, "step": 67 }, { "advantage_max": 1.448514811694622, "advantage_mean": 1.3659397612997282e-08, "advantage_min": -0.8833390064537525, "advantage_std": 0.9293830282986164, "completion_length": 2356.3958702087402, "epoch": 0.07771428571428571, "grad_norm": 0.13976342976093292, "kl": 0.008285045623779297, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.964516155915151e-07, "loss": 0.0897, "reward": 0.24067923799157143, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24067923799157143, "reward_after_std": 0.9293830282986164, "reward_before_mean": 0.4350328594446182, "reward_before_std": 0.9595834240317345, "reward_change_max": 0.0008330345153808594, "reward_change_mean": -0.1943535995669663, "reward_change_min": -0.4173274524509907, "reward_change_std": 0.17343836603686213, "reward_std": 0.9293830730021, "rewards/cosine_scaled_reward": -0.07415024330839515, "rewards/format_reward": 0.5833333358168602, "step": 68 }, { "advantage_max": 0.5551154017448425, "advantage_mean": 8.692344399818808e-09, "advantage_min": -0.36992673948407173, "advantage_std": 0.3697303757071495, "completion_length": 2848.250030517578, "epoch": 0.07885714285714286, "grad_norm": 0.06397375464439392, "kl": 0.004971027374267578, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.960469931131936e-07, "loss": 0.0073, "reward": -0.3054785691201687, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3054785691201687, "reward_after_std": 0.3697303719818592, "reward_before_mean": -0.18777675181627274, "reward_before_std": 0.37409957498311996, "reward_change_max": 0.00036262720823287964, "reward_change_mean": -0.11770181031897664, "reward_change_min": -0.22330834157764912, "reward_change_std": 0.09037601854652166, "reward_std": 0.36973038874566555, "rewards/cosine_scaled_reward": -0.23972170613706112, "rewards/format_reward": 0.2916666679084301, "step": 69 }, { "advantage_max": 0.8655475974082947, "advantage_mean": 5.587935669737476e-09, "advantage_min": -0.4706757254898548, "advantage_std": 0.5253105983138084, "completion_length": 3049.5209045410156, "epoch": 0.08, "grad_norm": 0.07061664760112762, "kl": 0.0020711421966552734, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.956206309337066e-07, "loss": 0.0209, "reward": -0.25117893144488335, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.25117893144488335, "reward_after_std": 0.5253106132149696, "reward_before_mean": -0.1343372669070959, "reward_before_std": 0.5308738593012094, "reward_change_max": 0.00114479660987854, "reward_change_mean": -0.11684167291969061, "reward_change_min": -0.2418693620711565, "reward_change_std": 0.09811591915786266, "reward_std": 0.5253106243908405, "rewards/cosine_scaled_reward": -0.23383529856801033, "rewards/format_reward": 0.3333333358168602, "step": 70 }, { "advantage_max": 1.0452404916286469, "advantage_mean": 1.1796752852344383e-08, "advantage_min": -0.5799026042222977, "advantage_std": 0.6007254403084517, "completion_length": 2697.875030517578, "epoch": 0.08114285714285714, "grad_norm": 0.17639409005641937, "kl": 0.004536479711532593, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.951725498333448e-07, "loss": 0.0052, "reward": -0.1075789425522089, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1075789425522089, "reward_after_std": 0.6007254216820002, "reward_before_mean": 0.03112965077161789, "reward_before_std": 0.589833190664649, "reward_change_max": 0.000692419707775116, "reward_change_mean": -0.13870856910943985, "reward_change_min": -0.26774679869413376, "reward_change_std": 0.10308503685519099, "reward_std": 0.6007254421710968, "rewards/cosine_scaled_reward": -0.17193518299609423, "rewards/format_reward": 0.3750000037252903, "step": 71 }, { "advantage_max": 0.9104671142995358, "advantage_mean": 2.7939678071131624e-09, "advantage_min": -0.6509746797382832, "advantage_std": 0.607373371720314, "completion_length": 3048.750030517578, "epoch": 0.08228571428571428, "grad_norm": 0.10005640983581543, "kl": 0.006662249565124512, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.947027716509488e-07, "loss": 0.0559, "reward": -0.16898642992600799, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.16898642992600799, "reward_after_std": 0.6073733530938625, "reward_before_mean": -0.037444956600666046, "reward_before_std": 0.6409455519169569, "reward_change_max": 4.38690185546875e-05, "reward_change_mean": -0.1315414784476161, "reward_change_min": -0.31572069227695465, "reward_change_std": 0.12350317602977157, "reward_std": 0.6073733605444431, "rewards/cosine_scaled_reward": -0.18538915179669857, "rewards/format_reward": 0.3333333432674408, "step": 72 }, { "advantage_max": 1.0058673024177551, "advantage_mean": 2.1420419549222913e-08, "advantage_min": -0.5559167042374611, "advantage_std": 0.6044680792838335, "completion_length": 3541.625, "epoch": 0.08342857142857144, "grad_norm": 0.10550221800804138, "kl": 0.0011322498321533203, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.942113192828444e-07, "loss": 0.0199, "reward": -0.3414469100534916, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3414469100534916, "reward_after_std": 0.6044680699706078, "reward_before_mean": -0.24936814978718758, "reward_before_std": 0.6242033317685127, "reward_change_max": 0.0004212036728858948, "reward_change_mean": -0.09207874466665089, "reward_change_min": -0.24085674434900284, "reward_change_std": 0.09360063704662025, "reward_std": 0.6044680774211884, "rewards/cosine_scaled_reward": -0.17676742002367973, "rewards/format_reward": 0.1041666679084301, "step": 73 }, { "advantage_max": 1.1884920187294483, "advantage_mean": 6.2088177377184195e-09, "advantage_min": -0.542400136590004, "advantage_std": 0.679960060864687, "completion_length": 3246.7083740234375, "epoch": 0.08457142857142858, "grad_norm": 0.13327006995677948, "kl": 0.0024330615997314453, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.93698216681727e-07, "loss": 0.046, "reward": -0.062173645943403244, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.062173645943403244, "reward_after_std": 0.6799600645899773, "reward_before_mean": 0.08113120123744011, "reward_before_std": 0.6675333846360445, "reward_change_max": 0.0006315037608146667, "reward_change_mean": -0.14330484648235142, "reward_change_min": -0.29839462600648403, "reward_change_std": 0.1192407829221338, "reward_std": 0.6799600906670094, "rewards/cosine_scaled_reward": -0.08443439565598965, "rewards/format_reward": 0.2500000037252903, "step": 74 }, { "advantage_max": 0.8372598998248577, "advantage_mean": 8.692344177774203e-09, "advantage_min": -0.620490986853838, "advantage_std": 0.5647232867777348, "completion_length": 3057.2708892822266, "epoch": 0.08571428571428572, "grad_norm": 0.09116575866937637, "kl": 0.003495454788208008, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.931634888554935e-07, "loss": 0.0227, "reward": 0.29398803785443306, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.29398803785443306, "reward_after_std": 0.5647232793271542, "reward_before_mean": 0.525806671474129, "reward_before_std": 0.5725712291896343, "reward_change_max": 0.00021129846572875977, "reward_change_mean": -0.23181860987097025, "reward_change_min": -0.3965703621506691, "reward_change_std": 0.15494680870324373, "reward_std": 0.5647233091294765, "rewards/cosine_scaled_reward": 0.02331998385488987, "rewards/format_reward": 0.4791666716337204, "step": 75 }, { "advantage_max": 1.0369278863072395, "advantage_mean": 9.623666863411984e-09, "advantage_min": -0.6456486731767654, "advantage_std": 0.6547755803912878, "completion_length": 2789.937530517578, "epoch": 0.08685714285714285, "grad_norm": 0.1379275918006897, "kl": 0.0013328194618225098, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.926071618660237e-07, "loss": 0.0419, "reward": -0.07144338451325893, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07144338451325893, "reward_after_std": 0.6547755543142557, "reward_before_mean": 0.07493393309414387, "reward_before_std": 0.6736296722665429, "reward_change_max": 0.0004098638892173767, "reward_change_mean": -0.14637731108814478, "reward_change_min": -0.32003118097782135, "reward_change_std": 0.1269817417487502, "reward_std": 0.6547755654901266, "rewards/cosine_scaled_reward": -0.18128304183483124, "rewards/format_reward": 0.4375000037252903, "step": 76 }, { "advantage_max": 0.9112666845321655, "advantage_mean": -1.862645149230957e-09, "advantage_min": -0.6346181519329548, "advantage_std": 0.5672171823680401, "completion_length": 3220.854248046875, "epoch": 0.088, "grad_norm": 0.16787603497505188, "kl": 0.0013133883476257324, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.9202926282791e-07, "loss": 0.0348, "reward": -0.10176217183470726, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10176217183470726, "reward_after_std": 0.567217193543911, "reward_before_mean": 0.04407870303839445, "reward_before_std": 0.5773847512900829, "reward_change_max": 0.00022549927234649658, "reward_change_mean": -0.14584088558331132, "reward_change_min": -0.2816579360514879, "reward_change_std": 0.11306928284466267, "reward_std": 0.5672171972692013, "rewards/cosine_scaled_reward": -0.13421065732836723, "rewards/format_reward": 0.3125000111758709, "step": 77 }, { "advantage_max": 1.0467328131198883, "advantage_mean": -2.3593506148777976e-08, "advantage_min": -0.7479342110455036, "advantage_std": 0.6737900376319885, "completion_length": 3131.541717529297, "epoch": 0.08914285714285715, "grad_norm": 0.10370709747076035, "kl": 0.001988053321838379, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.91429819907136e-07, "loss": 0.0261, "reward": -0.08144053816795349, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08144053816795349, "reward_after_std": 0.6737900450825691, "reward_before_mean": 0.06362230330705643, "reward_before_std": 0.703643836081028, "reward_change_max": 0.0002675279974937439, "reward_change_mean": -0.14506285823881626, "reward_change_min": -0.29590629041194916, "reward_change_std": 0.12511227140203118, "reward_std": 0.6737900525331497, "rewards/cosine_scaled_reward": -0.09318885393440723, "rewards/format_reward": 0.2500000037252903, "step": 78 }, { "advantage_max": 1.1895506829023361, "advantage_mean": 1.7384688355548406e-08, "advantage_min": -0.6651065498590469, "advantage_std": 0.7007867246866226, "completion_length": 2376.875030517578, "epoch": 0.09028571428571429, "grad_norm": 0.09153755754232407, "kl": 0.003210306167602539, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.908088623197048e-07, "loss": -0.0116, "reward": -0.0072834547609090805, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0072834547609090805, "reward_after_std": 0.700786679983139, "reward_before_mean": 0.1455509290099144, "reward_before_std": 0.7070570960640907, "reward_change_max": 0.0011262595653533936, "reward_change_mean": -0.15283435536548495, "reward_change_min": -0.3172120712697506, "reward_change_std": 0.12133174622431397, "reward_std": 0.7007867284119129, "rewards/cosine_scaled_reward": -0.18764122016727924, "rewards/format_reward": 0.5208333395421505, "step": 79 }, { "advantage_max": 1.1785830929875374, "advantage_mean": -8.692344399818808e-09, "advantage_min": -0.7640922516584396, "advantage_std": 0.7338002119213343, "completion_length": 3214.1041870117188, "epoch": 0.09142857142857143, "grad_norm": 0.12460608035326004, "kl": 0.003239154815673828, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.901664203302124e-07, "loss": 0.0364, "reward": 0.04392119310796261, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04392119310796261, "reward_after_std": 0.733800208196044, "reward_before_mean": 0.20913729257881641, "reward_before_std": 0.7534278109669685, "reward_change_max": 0.0006359443068504333, "reward_change_mean": -0.16521610878407955, "reward_change_min": -0.37309172563254833, "reward_change_std": 0.14184686355292797, "reward_std": 0.7338002119213343, "rewards/cosine_scaled_reward": -0.08293136395514011, "rewards/format_reward": 0.3750000074505806, "step": 80 }, { "advantage_max": 0.9061555825173855, "advantage_mean": 2.1730862664881556e-09, "advantage_min": -0.5620998814702034, "advantage_std": 0.5434018056839705, "completion_length": 3120.0833587646484, "epoch": 0.09257142857142857, "grad_norm": 0.1448298990726471, "kl": 0.0063190460205078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.895025252503755e-07, "loss": 0.0269, "reward": -0.2227240139618516, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2227240139618516, "reward_after_std": 0.543401800096035, "reward_before_mean": -0.1021307734772563, "reward_before_std": 0.5511450357735157, "reward_change_max": 0.00051078200340271, "reward_change_mean": -0.12059322092682123, "reward_change_min": -0.24367467127740383, "reward_change_std": 0.09504873026162386, "reward_std": 0.5434018205851316, "rewards/cosine_scaled_reward": -0.19689873605966568, "rewards/format_reward": 0.29166667349636555, "step": 81 }, { "advantage_max": 0.9381827842444181, "advantage_mean": -2.173086960377546e-09, "advantage_min": -0.6656824350357056, "advantage_std": 0.6018352564424276, "completion_length": 2795.8541870117188, "epoch": 0.09371428571428571, "grad_norm": 0.12910029292106628, "kl": 0.004484236240386963, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.888172094375033e-07, "loss": 0.0464, "reward": 0.21840301156044006, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21840301156044006, "reward_after_std": 0.6018352564424276, "reward_before_mean": 0.42964954674243927, "reward_before_std": 0.5974211236461997, "reward_change_max": 0.0009202435612678528, "reward_change_mean": -0.21124652586877346, "reward_change_min": -0.3764022495597601, "reward_change_std": 0.14769506314769387, "reward_std": 0.6018352713435888, "rewards/cosine_scaled_reward": 0.02732476219534874, "rewards/format_reward": 0.375, "step": 82 }, { "advantage_max": 1.3297571428120136, "advantage_mean": -1.055498977109437e-08, "advantage_min": -0.615177258849144, "advantage_std": 0.7736387737095356, "completion_length": 2872.4583740234375, "epoch": 0.09485714285714286, "grad_norm": 0.12672345340251923, "kl": 0.0023288726806640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.881105062929221e-07, "loss": 0.0173, "reward": 0.10566996037960052, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10566996037960052, "reward_after_std": 0.7736387699842453, "reward_before_mean": 0.2762736207805574, "reward_before_std": 0.7674045320600271, "reward_change_max": 0.0004971548914909363, "reward_change_mean": -0.17060369392856956, "reward_change_min": -0.367542028427124, "reward_change_std": 0.14439437165856361, "reward_std": 0.7736387774348259, "rewards/cosine_scaled_reward": -0.038946520537137985, "rewards/format_reward": 0.35416666977107525, "step": 83 }, { "advantage_max": 1.311264593154192, "advantage_mean": 6.829698806498641e-09, "advantage_min": -0.9606313481926918, "advantage_std": 0.8567360378801823, "completion_length": 3125.854217529297, "epoch": 0.096, "grad_norm": 0.1274482011795044, "kl": 0.001814126968383789, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.873824502603459e-07, "loss": 0.0282, "reward": 0.2880218755453825, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2880218755453825, "reward_after_std": 0.8567360173910856, "reward_before_mean": 0.4982416070997715, "reward_before_std": 0.8921538908034563, "reward_change_max": 0.00035124272108078003, "reward_change_mean": -0.21021969307912514, "reward_change_min": -0.4141238369047642, "reward_change_std": 0.16916781209874898, "reward_std": 0.8567360378801823, "rewards/cosine_scaled_reward": 0.061620788648724556, "rewards/format_reward": 0.37500000558793545, "step": 84 }, { "advantage_max": 1.2005193419754505, "advantage_mean": 1.2417634698280722e-08, "advantage_min": -0.8150865212082863, "advantage_std": 0.7814747169613838, "completion_length": 3178.0625610351562, "epoch": 0.09714285714285714, "grad_norm": 0.12010905146598816, "kl": 0.003203749656677246, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.866330768241983e-07, "loss": 0.006, "reward": 0.03627629950642586, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.03627629950642586, "reward_after_std": 0.7814747169613838, "reward_before_mean": 0.19885517843067646, "reward_before_std": 0.8199738580733538, "reward_change_max": 0.0005654171109199524, "reward_change_mean": -0.16257886961102486, "reward_change_min": -0.34483438916504383, "reward_change_std": 0.14743108581751585, "reward_std": 0.7814747374504805, "rewards/cosine_scaled_reward": -0.0984890783438459, "rewards/format_reward": 0.3958333395421505, "step": 85 }, { "advantage_max": 0.9351522885262966, "advantage_mean": -6.208810132690701e-10, "advantage_min": -0.6892301812767982, "advantage_std": 0.6031467914581299, "completion_length": 3064.458366394043, "epoch": 0.09828571428571428, "grad_norm": 0.07509732246398926, "kl": 0.0036211013793945312, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.85862422507884e-07, "loss": 0.0156, "reward": 0.08452938497066498, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.08452938497066498, "reward_after_std": 0.6031467877328396, "reward_before_mean": 0.2683639934984967, "reward_before_std": 0.6118773873895407, "reward_change_max": 0.0009405314922332764, "reward_change_mean": -0.1838346249423921, "reward_change_min": -0.3337151203304529, "reward_change_std": 0.1313472967594862, "reward_std": 0.6031468138098717, "rewards/cosine_scaled_reward": -0.06373466986406129, "rewards/format_reward": 0.3958333432674408, "step": 86 }, { "advantage_max": 1.0808392018079758, "advantage_mean": -1.9557774205702927e-08, "advantage_min": -0.8707231245934963, "advantage_std": 0.7867112383246422, "completion_length": 2901.500030517578, "epoch": 0.09942857142857142, "grad_norm": 0.16579511761665344, "kl": 0.008483409881591797, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.850705248720068e-07, "loss": 0.0474, "reward": 0.08398793265223503, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08398793265223503, "reward_after_std": 0.7867112308740616, "reward_before_mean": 0.2611438991734758, "reward_before_std": 0.8513057716190815, "reward_change_max": 0.00133620947599411, "reward_change_mean": -0.17715601343661547, "reward_change_min": -0.41611758805811405, "reward_change_std": 0.17166244937106967, "reward_std": 0.7867112457752228, "rewards/cosine_scaled_reward": -0.08817804581485689, "rewards/format_reward": 0.43750000931322575, "step": 87 }, { "advantage_max": 1.0569001287221909, "advantage_mean": -2.2972624524886243e-08, "advantage_min": -0.7176317572593689, "advantage_std": 0.6587415039539337, "completion_length": 2967.9583435058594, "epoch": 0.10057142857142858, "grad_norm": 0.1312066912651062, "kl": 0.007180690765380859, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.8425742251254e-07, "loss": 0.0486, "reward": 0.2663197033107281, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2663197033107281, "reward_after_std": 0.6587415114045143, "reward_before_mean": 0.4824961042031646, "reward_before_std": 0.6546656489372253, "reward_change_max": 0.001102931797504425, "reward_change_mean": -0.21617642883211374, "reward_change_min": -0.39005475491285324, "reward_change_std": 0.15425753220915794, "reward_std": 0.6587415598332882, "rewards/cosine_scaled_reward": -0.019168607890605927, "rewards/format_reward": 0.5208333395421505, "step": 88 }, { "advantage_max": 1.1422345601022243, "advantage_mean": 6.208819014474898e-10, "advantage_min": -0.8242807537317276, "advantage_std": 0.7421968802809715, "completion_length": 3296.562530517578, "epoch": 0.10171428571428572, "grad_norm": 0.12065722793340683, "kl": 0.0038585662841796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.83423155058946e-07, "loss": 0.0029, "reward": 0.009984794072806835, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.009984794072806835, "reward_after_std": 0.7421968914568424, "reward_before_mean": 0.16941174305975437, "reward_before_std": 0.7764574587345123, "reward_change_max": 0.0010293349623680115, "reward_change_mean": -0.1594269503839314, "reward_change_min": -0.3495678976178169, "reward_change_std": 0.14754267875105143, "reward_std": 0.742196898907423, "rewards/cosine_scaled_reward": -0.0819607978919521, "rewards/format_reward": 0.3333333395421505, "step": 89 }, { "advantage_max": 0.6904804185032845, "advantage_mean": -4.346172088887101e-09, "advantage_min": -0.46621549502015114, "advantage_std": 0.43648741766810417, "completion_length": 2591.8750228881836, "epoch": 0.10285714285714286, "grad_norm": 0.06959541141986847, "kl": 0.011201858520507812, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.825677631722435e-07, "loss": 0.011, "reward": -0.18421894684433937, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18421894684433937, "reward_after_std": 0.4364874064922333, "reward_before_mean": -0.04731912910938263, "reward_before_std": 0.44100410491228104, "reward_change_max": 0.0005378872156143188, "reward_change_mean": -0.13689983123913407, "reward_change_min": -0.2534129451960325, "reward_change_std": 0.0994924996048212, "reward_std": 0.43648741766810417, "rewards/cosine_scaled_reward": -0.252826239913702, "rewards/format_reward": 0.45833333395421505, "step": 90 }, { "advantage_max": 1.3217405453324318, "advantage_mean": 1.6453366030733108e-08, "advantage_min": -0.7791223339736462, "advantage_std": 0.8055871687829494, "completion_length": 3244.1875610351562, "epoch": 0.104, "grad_norm": 0.14869704842567444, "kl": 0.004951953887939453, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.816912885430258e-07, "loss": 0.016, "reward": -0.017406470142304897, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.017406470142304897, "reward_after_std": 0.8055871650576591, "reward_before_mean": 0.12868821853771806, "reward_before_std": 0.8286459427326918, "reward_change_max": 0.000985272228717804, "reward_change_mean": -0.14609465654939413, "reward_change_min": -0.3366597071290016, "reward_change_std": 0.13514686562120914, "reward_std": 0.8055871836841106, "rewards/cosine_scaled_reward": -0.09190590120851994, "rewards/format_reward": 0.3125000111758709, "step": 91 }, { "advantage_max": 0.9895059671252966, "advantage_mean": -2.0178655718572358e-08, "advantage_min": -0.5419396162033081, "advantage_std": 0.5604732185602188, "completion_length": 2903.000045776367, "epoch": 0.10514285714285715, "grad_norm": 0.16299159824848175, "kl": 0.029235363006591797, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.807937738894303e-07, "loss": -0.0178, "reward": 0.014140639454126358, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.014140639454126358, "reward_after_std": 0.5604731980711222, "reward_before_mean": 0.18168883956968784, "reward_before_std": 0.5389804840087891, "reward_change_max": 0.0002861618995666504, "reward_change_mean": -0.1675482182763517, "reward_change_min": -0.28213073685765266, "reward_change_std": 0.11067371582612395, "reward_std": 0.5604731999337673, "rewards/cosine_scaled_reward": -0.11748891929164529, "rewards/format_reward": 0.41666667349636555, "step": 92 }, { "advantage_max": 0.5988105162978172, "advantage_mean": 1.428027990302283e-08, "advantage_min": -0.41282227262854576, "advantage_std": 0.38750826194882393, "completion_length": 3500.6666870117188, "epoch": 0.10628571428571429, "grad_norm": 0.0707600861787796, "kl": 0.0052585601806640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.798752629550546e-07, "loss": 0.0215, "reward": -0.5209208391606808, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5209208391606808, "reward_after_std": 0.3875082656741142, "reward_before_mean": -0.44965457171201706, "reward_before_std": 0.4064562898129225, "reward_change_max": 0.0006816089153289795, "reward_change_mean": -0.07126627350226045, "reward_change_min": -0.16244689002633095, "reward_change_std": 0.06904846569523215, "reward_std": 0.3875082768499851, "rewards/cosine_scaled_reward": -0.24566061422228813, "rewards/format_reward": 0.0416666679084301, "step": 93 }, { "advantage_max": 0.7165923677384853, "advantage_mean": 8.071462442860167e-09, "advantage_min": -0.47233113646507263, "advantage_std": 0.44313961267471313, "completion_length": 3208.6666870117188, "epoch": 0.10742857142857143, "grad_norm": 0.07605665177106857, "kl": 0.008792877197265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.78935800506826e-07, "loss": 0.0127, "reward": -0.19248150289058685, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19248150289058685, "reward_after_std": 0.44313961267471313, "reward_before_mean": -0.056962043046951294, "reward_before_std": 0.4448035843670368, "reward_change_max": 6.378442049026489e-05, "reward_change_mean": -0.13551945402286947, "reward_change_min": -0.2403192762285471, "reward_change_std": 0.0976343797519803, "reward_std": 0.44313962385058403, "rewards/cosine_scaled_reward": -0.16389768943190575, "rewards/format_reward": 0.2708333395421505, "step": 94 }, { "advantage_max": 0.8746747858822346, "advantage_mean": 4.967053546245381e-09, "advantage_min": -0.5384437739849091, "advantage_std": 0.522600032389164, "completion_length": 3483.1875610351562, "epoch": 0.10857142857142857, "grad_norm": 0.07552602887153625, "kl": 0.0022268295288085938, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.779754323328192e-07, "loss": 0.0126, "reward": -0.33064037561416626, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.33064037561416626, "reward_after_std": 0.5226000137627125, "reward_before_mean": -0.23070255480706692, "reward_before_std": 0.5317417494952679, "reward_change_max": 0.0007640570402145386, "reward_change_mean": -0.09993783105164766, "reward_change_min": -0.19900760054588318, "reward_change_std": 0.08017310034483671, "reward_std": 0.5226000174880028, "rewards/cosine_scaled_reward": -0.19868461415171623, "rewards/format_reward": 0.1666666716337204, "step": 95 }, { "advantage_max": 1.5345038175582886, "advantage_mean": 5.5879355587151736e-09, "advantage_min": -0.955367024987936, "advantage_std": 0.9416514039039612, "completion_length": 3175.000030517578, "epoch": 0.10971428571428571, "grad_norm": 0.1472436934709549, "kl": 0.007169246673583984, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.769942052400235e-07, "loss": 0.0333, "reward": 0.18423494324088097, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18423494324088097, "reward_after_std": 0.9416513778269291, "reward_before_mean": 0.3641625363379717, "reward_before_std": 0.9746041018515825, "reward_change_max": 0.0007186755537986755, "reward_change_mean": -0.1799275849480182, "reward_change_min": -0.38189585879445076, "reward_change_std": 0.15464499220252037, "reward_std": 0.9416514132171869, "rewards/cosine_scaled_reward": 0.004997933050617576, "rewards/format_reward": 0.35416667349636555, "step": 96 }, { "advantage_max": 0.7897094935178757, "advantage_mean": 3.3527613019224134e-08, "advantage_min": -0.8142013549804688, "advantage_std": 0.6035727635025978, "completion_length": 3356.687530517578, "epoch": 0.11085714285714286, "grad_norm": 0.12530912458896637, "kl": 0.004159212112426758, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.759921670520634e-07, "loss": 0.0325, "reward": -0.10107455402612686, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10107455402612686, "reward_after_std": 0.6035727448761463, "reward_before_mean": 0.050840720534324646, "reward_before_std": 0.6585263684391975, "reward_change_max": 0.0007024109363555908, "reward_change_mean": -0.1519152638502419, "reward_change_min": -0.30695683509111404, "reward_change_std": 0.1367809739895165, "reward_std": 0.6035727486014366, "rewards/cosine_scaled_reward": -0.06832962296903133, "rewards/format_reward": 0.1875000074505806, "step": 97 }, { "advantage_max": 1.0006159357726574, "advantage_mean": 7.1401400625337175e-09, "advantage_min": -0.7299479097127914, "advantage_std": 0.6192986331880093, "completion_length": 3245.3125610351562, "epoch": 0.112, "grad_norm": 0.10221297293901443, "kl": 0.0038628578186035156, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.749693666068663e-07, "loss": 0.0334, "reward": -0.058832570910453796, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.058832570910453796, "reward_after_std": 0.6192986331880093, "reward_before_mean": 0.09190889494493604, "reward_before_std": 0.629094023257494, "reward_change_max": 0.000854790210723877, "reward_change_mean": -0.1507414490915835, "reward_change_min": -0.26700772903859615, "reward_change_std": 0.11300534615293145, "reward_std": 0.6192986629903316, "rewards/cosine_scaled_reward": -0.09987888857722282, "rewards/format_reward": 0.291666679084301, "step": 98 }, { "advantage_max": 1.2627913691103458, "advantage_mean": 7.761021492136422e-09, "advantage_min": -0.7912185974419117, "advantage_std": 0.7411760687828064, "completion_length": 2877.354202270508, "epoch": 0.11314285714285714, "grad_norm": 0.13032735884189606, "kl": 0.00640869140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.739258537542835e-07, "loss": 0.0187, "reward": 0.1233619935810566, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1233619935810566, "reward_after_std": 0.741176063194871, "reward_before_mean": 0.3012175422627479, "reward_before_std": 0.7417186517268419, "reward_change_max": 3.135204315185547e-05, "reward_change_mean": -0.17785556078888476, "reward_change_min": -0.32209550961852074, "reward_change_std": 0.12937596580013633, "reward_std": 0.7411760911345482, "rewards/cosine_scaled_reward": -0.02647455967962742, "rewards/format_reward": 0.35416667722165585, "step": 99 }, { "advantage_max": 1.175136137753725, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.99284578114748, "advantage_std": 0.8207744583487511, "completion_length": 2967.187545776367, "epoch": 0.11428571428571428, "grad_norm": 0.16071321070194244, "kl": 0.0092315673828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.728616793536587e-07, "loss": 0.0903, "reward": 0.28339227894321084, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.28339227894321084, "reward_after_std": 0.8207744546234608, "reward_before_mean": 0.49859905429184437, "reward_before_std": 0.8708522617816925, "reward_change_max": 6.333738565444946e-05, "reward_change_mean": -0.21520677860826254, "reward_change_min": -0.43512601405382156, "reward_change_std": 0.17898594215512276, "reward_std": 0.8207744881510735, "rewards/cosine_scaled_reward": 0.030549529939889908, "rewards/format_reward": 0.4375000111758709, "step": 100 }, { "advantage_max": 0.9483831934630871, "advantage_mean": -9.623666363811623e-09, "advantage_min": -0.5487977117300034, "advantage_std": 0.5574415624141693, "completion_length": 2825.750015258789, "epoch": 0.11542857142857142, "grad_norm": 0.09306998550891876, "kl": 0.004563093185424805, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.717768952713511e-07, "loss": 0.0092, "reward": 0.11698936205357313, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11698936205357313, "reward_after_std": 0.5574415847659111, "reward_before_mean": 0.3067358136177063, "reward_before_std": 0.5346883460879326, "reward_change_max": 8.66129994392395e-05, "reward_change_mean": -0.1897464576177299, "reward_change_min": -0.31265771202743053, "reward_change_std": 0.12219382403418422, "reward_std": 0.5574416108429432, "rewards/cosine_scaled_reward": -0.04454876575618982, "rewards/format_reward": 0.39583333767950535, "step": 101 }, { "advantage_max": 1.3877725079655647, "advantage_mean": -1.676380706472358e-08, "advantage_min": -0.7309272214770317, "advantage_std": 0.7833095956593752, "completion_length": 2827.354217529297, "epoch": 0.11657142857142858, "grad_norm": 0.4577586054801941, "kl": 0.0829916000366211, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.706715543782064e-07, "loss": 0.0496, "reward": 0.027024696115404367, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.027024696115404367, "reward_after_std": 0.78330959379673, "reward_before_mean": 0.17954866343643516, "reward_before_std": 0.776120014488697, "reward_change_max": 0.00019662082195281982, "reward_change_mean": -0.15252398420125246, "reward_change_min": -0.2790621221065521, "reward_change_std": 0.11413990310393274, "reward_std": 0.7833096235990524, "rewards/cosine_scaled_reward": -0.14980901218950748, "rewards/format_reward": 0.47916667722165585, "step": 102 }, { "advantage_max": 1.2712066285312176, "advantage_mean": -1.2417643024953406e-09, "advantage_min": -0.841466061770916, "advantage_std": 0.8277627564966679, "completion_length": 3121.7708892822266, "epoch": 0.11771428571428572, "grad_norm": 0.17055128514766693, "kl": 0.010271072387695312, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.695457105469804e-07, "loss": 0.0962, "reward": -0.0551725160330534, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0551725160330534, "reward_after_std": 0.8277627415955067, "reward_before_mean": 0.08446851873304695, "reward_before_std": 0.8774662502110004, "reward_change_max": 0.0010539069771766663, "reward_change_mean": -0.13964102417230606, "reward_change_min": -0.36716344580054283, "reward_change_std": 0.1503495373763144, "reward_std": 0.8277627639472485, "rewards/cosine_scaled_reward": -0.12443241663277149, "rewards/format_reward": 0.33333334140479565, "step": 103 }, { "advantage_max": 1.0146032497286797, "advantage_mean": 5.587935669737476e-09, "advantage_min": -0.6235432531684637, "advantage_std": 0.6274013724178076, "completion_length": 2856.875030517578, "epoch": 0.11885714285714286, "grad_norm": 0.215767040848732, "kl": 0.00616455078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.683994186497132e-07, "loss": 0.0432, "reward": -0.11946433084085584, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11946433084085584, "reward_after_std": 0.6274013575166464, "reward_before_mean": 0.01864840214693686, "reward_before_std": 0.6406649835407734, "reward_change_max": 7.314980030059814e-05, "reward_change_mean": -0.13811274012550712, "reward_change_min": -0.26797983795404434, "reward_change_std": 0.11577287875115871, "reward_std": 0.6274013724178076, "rewards/cosine_scaled_reward": -0.17817580047994852, "rewards/format_reward": 0.3750000037252903, "step": 104 }, { "advantage_max": 1.1567305587232113, "advantage_mean": -3.725290298461914e-09, "advantage_min": -0.9195349849760532, "advantage_std": 0.7848116252571344, "completion_length": 2741.8125, "epoch": 0.12, "grad_norm": 0.12767690420150757, "kl": 0.006846427917480469, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.672327345550543e-07, "loss": 0.0243, "reward": 0.27163759991526604, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.27163759991526604, "reward_after_std": 0.7848116103559732, "reward_before_mean": 0.4845798406749964, "reward_before_std": 0.8154449053108692, "reward_change_max": 0.0009339377284049988, "reward_change_mean": -0.21294222678989172, "reward_change_min": -0.39509972743690014, "reward_change_std": 0.17022682540118694, "reward_std": 0.7848116252571344, "rewards/cosine_scaled_reward": 0.04437324404716492, "rewards/format_reward": 0.3958333358168602, "step": 105 }, { "advantage_max": 1.062649704515934, "advantage_mean": 7.450581263057643e-09, "advantage_min": -0.7678385637700558, "advantage_std": 0.7214131653308868, "completion_length": 2290.083366394043, "epoch": 0.12114285714285715, "grad_norm": 0.10329294949769974, "kl": 0.006995201110839844, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.66045715125541e-07, "loss": 0.0404, "reward": 0.7429984174668789, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7429984174668789, "reward_after_std": 0.7214131690561771, "reward_before_mean": 1.058710291981697, "reward_before_std": 0.6990874614566565, "reward_change_max": 0.0, "reward_change_mean": -0.31571184657514095, "reward_change_min": -0.5282324738800526, "reward_change_std": 0.21634249202907085, "reward_std": 0.7214132063090801, "rewards/cosine_scaled_reward": 0.1856051298091188, "rewards/format_reward": 0.6875000037252903, "step": 106 }, { "advantage_max": 0.8142878897488117, "advantage_mean": 2.4835259959665734e-09, "advantage_min": -0.6240261495113373, "advantage_std": 0.5540393777191639, "completion_length": 2874.9583892822266, "epoch": 0.12228571428571429, "grad_norm": 0.10811487585306168, "kl": 0.005537986755371094, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.648384182148252e-07, "loss": 0.0619, "reward": 0.2032775196712464, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2032775196712464, "reward_after_std": 0.5540393590927124, "reward_before_mean": 0.41669320687651634, "reward_before_std": 0.5620136149227619, "reward_change_max": 0.0003005489706993103, "reward_change_mean": -0.21341570327058434, "reward_change_min": -0.35812640748918056, "reward_change_std": 0.1473505743779242, "reward_std": 0.5540393758565187, "rewards/cosine_scaled_reward": -0.03123672492802143, "rewards/format_reward": 0.47916666977107525, "step": 107 }, { "advantage_max": 1.1528266817331314, "advantage_mean": -2.514571040279634e-08, "advantage_min": -0.9793357066810131, "advantage_std": 0.7934195697307587, "completion_length": 2877.0208892822266, "epoch": 0.12342857142857143, "grad_norm": 1.8859726190567017, "kl": 0.1377577781677246, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.636109026648554e-07, "loss": 0.097, "reward": 0.13686267286539078, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13686267286539078, "reward_after_std": 0.793419573456049, "reward_before_mean": 0.32153723450755933, "reward_before_std": 0.8392018117010593, "reward_change_max": 0.0004731789231300354, "reward_change_mean": -0.184674559161067, "reward_change_min": -0.3883530702441931, "reward_change_std": 0.16422407794743776, "reward_std": 0.7934196069836617, "rewards/cosine_scaled_reward": -0.016314731910824776, "rewards/format_reward": 0.3541666716337204, "step": 108 }, { "advantage_max": 1.2956867851316929, "advantage_mean": -6.829699139565548e-09, "advantage_min": -0.5799591094255447, "advantage_std": 0.6968509145081043, "completion_length": 2976.812515258789, "epoch": 0.12457142857142857, "grad_norm": 0.11160595715045929, "kl": 0.0044231414794921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.623632283030077e-07, "loss": 0.0137, "reward": 0.05842187628149986, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05842187628149986, "reward_after_std": 0.6968508996069431, "reward_before_mean": 0.22159072943031788, "reward_before_std": 0.6728788614273071, "reward_change_max": 0.0004415437579154968, "reward_change_mean": -0.16316887410357594, "reward_change_min": -0.2775291074067354, "reward_change_std": 0.10812568850815296, "reward_std": 0.6968509182333946, "rewards/cosine_scaled_reward": -0.06628797389566898, "rewards/format_reward": 0.35416666977107525, "step": 109 }, { "advantage_max": 1.2137416154146194, "advantage_mean": -2.2662182685984078e-08, "advantage_min": -1.076897133141756, "advantage_std": 0.834989856928587, "completion_length": 2995.979202270508, "epoch": 0.12571428571428572, "grad_norm": 0.12926240265369415, "kl": 0.006874561309814453, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.610954559391704e-07, "loss": 0.019, "reward": 0.32111069560050964, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.32111069560050964, "reward_after_std": 0.8349898457527161, "reward_before_mean": 0.5422417484223843, "reward_before_std": 0.8741515278816223, "reward_change_max": 0.0012059956789016724, "reward_change_mean": -0.221131079364568, "reward_change_min": -0.42351557686924934, "reward_change_std": 0.18153371335938573, "reward_std": 0.8349898532032967, "rewards/cosine_scaled_reward": 0.01070421189069748, "rewards/format_reward": 0.5208333432674408, "step": 110 }, { "advantage_max": 1.2839822471141815, "advantage_mean": -1.0554989438027462e-08, "advantage_min": -0.8430443927645683, "advantage_std": 0.8239332940429449, "completion_length": 3269.4166870117188, "epoch": 0.12685714285714286, "grad_norm": 0.1600443720817566, "kl": 0.00861358642578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.598076473627796e-07, "loss": 0.0999, "reward": -0.031158728525042534, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.031158728525042534, "reward_after_std": 0.8239332847297192, "reward_before_mean": 0.11378072574734688, "reward_before_std": 0.8664110209792852, "reward_change_max": 0.0004683658480644226, "reward_change_mean": -0.14493947010487318, "reward_change_min": -0.3175403233617544, "reward_change_std": 0.1375368507578969, "reward_std": 0.8239333350211382, "rewards/cosine_scaled_reward": -0.06810962222516537, "rewards/format_reward": 0.2500000074505806, "step": 111 }, { "advantage_max": 1.390791840851307, "advantage_mean": -1.4280279847511679e-08, "advantage_min": -0.8436776027083397, "advantage_std": 0.9095302019268274, "completion_length": 3292.416717529297, "epoch": 0.128, "grad_norm": 0.18827910721302032, "kl": 0.004608154296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.58499865339809e-07, "loss": 0.0404, "reward": 0.3022013884037733, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3022013884037733, "reward_after_std": 0.9095302168279886, "reward_before_mean": 0.5123886037617922, "reward_before_std": 0.9429038781672716, "reward_change_max": 0.0006921738386154175, "reward_change_mean": -0.21018722327426076, "reward_change_min": -0.4561909269541502, "reward_change_std": 0.1870468370616436, "reward_std": 0.9095302261412144, "rewards/cosine_scaled_reward": 0.047860970720648766, "rewards/format_reward": 0.41666667349636555, "step": 112 }, { "advantage_max": 1.4620042815804482, "advantage_mean": 1.7384688244526103e-08, "advantage_min": -0.9851342514157295, "advantage_std": 0.9557592868804932, "completion_length": 2622.1042404174805, "epoch": 0.12914285714285714, "grad_norm": 0.20259544253349304, "kl": 0.0076427459716796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.571721736097088e-07, "loss": 0.1053, "reward": 0.30275487527251244, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.30275487527251244, "reward_after_std": 0.9557593017816544, "reward_before_mean": 0.5097140893340111, "reward_before_std": 0.9958821460604668, "reward_change_max": 0.0005864575505256653, "reward_change_mean": -0.20695918891578913, "reward_change_min": -0.45230502262711525, "reward_change_std": 0.18196550523862243, "reward_std": 0.9557593166828156, "rewards/cosine_scaled_reward": -0.04722629580646753, "rewards/format_reward": 0.6041666809469461, "step": 113 }, { "advantage_max": 0.7880322486162186, "advantage_mean": -2.793967918135465e-09, "advantage_min": -0.4691660702228546, "advantage_std": 0.4585419502109289, "completion_length": 2494.250015258789, "epoch": 0.13028571428571428, "grad_norm": 0.056887462735176086, "kl": 0.005835533142089844, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.55824636882301e-07, "loss": -0.0059, "reward": 0.10360929928719997, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10360929928719997, "reward_after_std": 0.45854195207357407, "reward_before_mean": 0.2977067567408085, "reward_before_std": 0.43085470236837864, "reward_change_max": 5.520880222320557e-06, "reward_change_mean": -0.19409746601013467, "reward_change_min": -0.30172265134751797, "reward_change_std": 0.11335213592974469, "reward_std": 0.45854196697473526, "rewards/cosine_scaled_reward": -0.1740632876753807, "rewards/format_reward": 0.645833333954215, "step": 114 }, { "advantage_max": 0.8312573879957199, "advantage_mean": 2.1730861277102775e-09, "advantage_min": -0.5887445993721485, "advantage_std": 0.5362998899072409, "completion_length": 2865.9166870117188, "epoch": 0.13142857142857142, "grad_norm": 0.07487498223781586, "kl": 0.0052642822265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.54457320834625e-07, "loss": 0.0144, "reward": -0.032614946365356445, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.032614946365356445, "reward_after_std": 0.5362998805940151, "reward_before_mean": 0.1314664650708437, "reward_before_std": 0.5410970067605376, "reward_change_max": 0.00015585869550704956, "reward_change_mean": -0.16408138908445835, "reward_change_min": -0.322182085365057, "reward_change_std": 0.12360355304554105, "reward_std": 0.536299891769886, "rewards/cosine_scaled_reward": -0.10093345306813717, "rewards/format_reward": 0.3333333358168602, "step": 115 }, { "advantage_max": 0.8675027787685394, "advantage_mean": -1.5522046703519976e-09, "advantage_min": -0.6971545293927193, "advantage_std": 0.58235864341259, "completion_length": 3352.625, "epoch": 0.13257142857142856, "grad_norm": 0.1078818291425705, "kl": 0.006398200988769531, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.530702921077358e-07, "loss": 0.0225, "reward": -0.13937662541866302, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13937662541866302, "reward_after_std": 0.5823586620390415, "reward_before_mean": 0.0006389021873474121, "reward_before_std": 0.6112870834767818, "reward_change_max": 0.0004796534776687622, "reward_change_mean": -0.140015542274341, "reward_change_min": -0.2930273860692978, "reward_change_std": 0.12012169591616839, "reward_std": 0.5823586657643318, "rewards/cosine_scaled_reward": -0.11426387913525105, "rewards/format_reward": 0.2291666753590107, "step": 116 }, { "advantage_max": 0.8436762727797031, "advantage_mean": 1.614292502449821e-08, "advantage_min": -0.5272083412855864, "advantage_std": 0.49669332057237625, "completion_length": 3044.291717529297, "epoch": 0.1337142857142857, "grad_norm": 0.08327826112508774, "kl": 0.008813858032226562, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.516636183034564e-07, "loss": 0.0261, "reward": -0.23562454991042614, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.23562454991042614, "reward_after_std": 0.4966933410614729, "reward_before_mean": -0.11536954622715712, "reward_before_std": 0.4920260915532708, "reward_change_max": 0.0011560618877410889, "reward_change_mean": -0.1202550008893013, "reward_change_min": -0.20540722459554672, "reward_change_std": 0.08698108419775963, "reward_std": 0.4966933485120535, "rewards/cosine_scaled_reward": -0.22435144521296024, "rewards/format_reward": 0.3333333395421505, "step": 117 }, { "advantage_max": 1.2969117499887943, "advantage_mean": -1.4901161471403412e-08, "advantage_min": -1.0903200656175613, "advantage_std": 0.9161137863993645, "completion_length": 2885.6458587646484, "epoch": 0.13485714285714287, "grad_norm": 0.13571742177009583, "kl": 0.004894256591796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.502373679810839e-07, "loss": 0.0147, "reward": 0.48230795189738274, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.48230795189738274, "reward_after_std": 0.9161137938499451, "reward_before_mean": 0.7343571186065674, "reward_before_std": 0.971574455499649, "reward_change_max": 0.0007158666849136353, "reward_change_mean": -0.2520492020994425, "reward_change_min": -0.5032246150076389, "reward_change_std": 0.21108578890562057, "reward_std": 0.9161138087511063, "rewards/cosine_scaled_reward": 0.11717856023460627, "rewards/format_reward": 0.5000000055879354, "step": 118 }, { "advantage_max": 0.9215016141533852, "advantage_mean": -5.401671004934272e-08, "advantage_min": -0.7312754169106483, "advantage_std": 0.6038780957460403, "completion_length": 2581.3542098999023, "epoch": 0.136, "grad_norm": 0.3118216097354889, "kl": 0.09250736236572266, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.487916106540465e-07, "loss": 0.0175, "reward": 0.38770658522844315, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.38770658522844315, "reward_after_std": 0.6038780771195889, "reward_before_mean": 0.6343068927526474, "reward_before_std": 0.593263041228056, "reward_change_max": 0.0005638003349304199, "reward_change_mean": -0.24660037737339735, "reward_change_min": -0.4107240866869688, "reward_change_std": 0.16336056124418974, "reward_std": 0.6038780957460403, "rewards/cosine_scaled_reward": 0.03590344078838825, "rewards/format_reward": 0.5625000055879354, "step": 119 }, { "advantage_max": 1.1629462502896786, "advantage_mean": 1.3038516155639002e-08, "advantage_min": -0.6762507557868958, "advantage_std": 0.7126640006899834, "completion_length": 2743.000030517578, "epoch": 0.13714285714285715, "grad_norm": 0.17088720202445984, "kl": 0.009153366088867188, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.473264167865171e-07, "loss": 0.0105, "reward": 0.23766697943210602, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23766697943210602, "reward_after_std": 0.712664008140564, "reward_before_mean": 0.4430948309600353, "reward_before_std": 0.7097529098391533, "reward_change_max": 0.0, "reward_change_mean": -0.20542785804718733, "reward_change_min": -0.3797203078866005, "reward_change_std": 0.1507191490381956, "reward_std": 0.7126640230417252, "rewards/cosine_scaled_reward": 0.013214093632996082, "rewards/format_reward": 0.4166666753590107, "step": 120 }, { "advantage_max": 1.0060521364212036, "advantage_mean": -6.20881956958641e-10, "advantage_min": -0.49471021071076393, "advantage_std": 0.5474960952997208, "completion_length": 2073.1667251586914, "epoch": 0.1382857142857143, "grad_norm": 0.13649925589561462, "kl": 0.010579109191894531, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.458418577899774e-07, "loss": 0.0152, "reward": 0.4631114602088928, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4631114602088928, "reward_after_std": 0.5474960878491402, "reward_before_mean": 0.7205968461930752, "reward_before_std": 0.4769848696887493, "reward_change_max": 0.0005392804741859436, "reward_change_mean": -0.2574853505939245, "reward_change_min": -0.386174064129591, "reward_change_std": 0.14462041202932596, "reward_std": 0.5474961064755917, "rewards/cosine_scaled_reward": 0.016548408661037683, "rewards/format_reward": 0.6875000018626451, "step": 121 }, { "advantage_max": 1.0412322394549847, "advantage_mean": -2.5145710097485008e-08, "advantage_min": -0.7833246439695358, "advantage_std": 0.6966988109052181, "completion_length": 2927.7917098999023, "epoch": 0.13942857142857143, "grad_norm": 0.13041254878044128, "kl": 0.0074462890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.443380060197385e-07, "loss": 0.034, "reward": 0.26743900775909424, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26743900775909424, "reward_after_std": 0.6966988146305084, "reward_before_mean": 0.48478084057569504, "reward_before_std": 0.7121212631464005, "reward_change_max": 0.0002805739641189575, "reward_change_mean": -0.2173418691381812, "reward_change_min": -0.42399162985384464, "reward_change_std": 0.16811825148761272, "reward_std": 0.696698822081089, "rewards/cosine_scaled_reward": 0.03405708260834217, "rewards/format_reward": 0.41666666977107525, "step": 122 }, { "advantage_max": 1.255372829735279, "advantage_mean": 1.862645593320167e-09, "advantage_min": -0.7953049875795841, "advantage_std": 0.7709114924073219, "completion_length": 3003.104202270508, "epoch": 0.14057142857142857, "grad_norm": 0.12199567258358002, "kl": 0.006504058837890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.428149347714143e-07, "loss": 0.0595, "reward": -0.03525662235915661, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.03525662235915661, "reward_after_std": 0.770911518484354, "reward_before_mean": 0.10965576581656933, "reward_before_std": 0.7936905510723591, "reward_change_max": 0.0006286948919296265, "reward_change_mean": -0.14491238072514534, "reward_change_min": -0.3238669876009226, "reward_change_std": 0.13130992534570396, "reward_std": 0.7709115408360958, "rewards/cosine_scaled_reward": -0.1326721184886992, "rewards/format_reward": 0.37500000558793545, "step": 123 }, { "advantage_max": 0.9072084166109562, "advantage_mean": 7.761021658669875e-09, "advantage_min": -0.6190587878227234, "advantage_std": 0.5967953819781542, "completion_length": 2339.3333740234375, "epoch": 0.1417142857142857, "grad_norm": 0.10634805262088776, "kl": 0.0068912506103515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.412727182773486e-07, "loss": 0.0291, "reward": 0.35492085479199886, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.35492085479199886, "reward_after_std": 0.5967953782528639, "reward_before_mean": 0.5955427903681993, "reward_before_std": 0.5878977961838245, "reward_change_max": 0.0002144947648048401, "reward_change_mean": -0.240621916949749, "reward_change_min": -0.41652823612093925, "reward_change_std": 0.16582451574504375, "reward_std": 0.596795380115509, "rewards/cosine_scaled_reward": -0.014728618785738945, "rewards/format_reward": 0.6250000074505806, "step": 124 }, { "advantage_max": 1.146369557827711, "advantage_mean": -9.934108091691485e-09, "advantage_min": -0.7243319489061832, "advantage_std": 0.6979638151824474, "completion_length": 2861.2291717529297, "epoch": 0.14285714285714285, "grad_norm": 0.0976700559258461, "kl": 0.0049991607666015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.397114317029974e-07, "loss": 0.0162, "reward": 0.23180764354765415, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23180764354765415, "reward_after_std": 0.6979638002812862, "reward_before_mean": 0.4370589777827263, "reward_before_std": 0.6945686750113964, "reward_change_max": 0.0, "reward_change_mean": -0.20525133749470115, "reward_change_min": -0.34768545627593994, "reward_change_std": 0.14167729078326374, "reward_std": 0.6979638077318668, "rewards/cosine_scaled_reward": 0.051862819120287895, "rewards/format_reward": 0.33333333395421505, "step": 125 }, { "advantage_max": 1.2939909808337688, "advantage_mean": -1.4280280624667796e-08, "advantage_min": -0.8710135444998741, "advantage_std": 0.812971830368042, "completion_length": 2863.395866394043, "epoch": 0.144, "grad_norm": 0.1510373055934906, "kl": 0.00441741943359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.381311511432658e-07, "loss": 0.0453, "reward": 0.351866427809, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.351866427809, "reward_after_std": 0.8129718378186226, "reward_before_mean": 0.5765400063246489, "reward_before_std": 0.8266580477356911, "reward_change_max": 0.0007203668355941772, "reward_change_mean": -0.22467358130961657, "reward_change_min": -0.4323025830090046, "reward_change_std": 0.17490686709061265, "reward_std": 0.8129718489944935, "rewards/cosine_scaled_reward": 0.03826999478042126, "rewards/format_reward": 0.5000000093132257, "step": 126 }, { "advantage_max": 0.8919055536389351, "advantage_mean": -6.829698695476338e-09, "advantage_min": -0.605189211666584, "advantage_std": 0.5651932023465633, "completion_length": 3023.2500610351562, "epoch": 0.14514285714285713, "grad_norm": 0.0999368354678154, "kl": 0.00724029541015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.36531953618799e-07, "loss": 0.0569, "reward": -0.23997146729379892, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.23997146729379892, "reward_after_std": 0.5651932060718536, "reward_before_mean": -0.12315090373158455, "reward_before_std": 0.5872822925448418, "reward_change_max": 0.0016008764505386353, "reward_change_mean": -0.11682056915014982, "reward_change_min": -0.24330477230250835, "reward_change_std": 0.10519890394061804, "reward_std": 0.5651932395994663, "rewards/cosine_scaled_reward": -0.24907546117901802, "rewards/format_reward": 0.37500000931322575, "step": 127 }, { "advantage_max": 1.403386164456606, "advantage_mean": -4.967053435223079e-09, "advantage_min": -1.042650755494833, "advantage_std": 0.9171682633459568, "completion_length": 2825.8750610351562, "epoch": 0.1462857142857143, "grad_norm": 0.17191162705421448, "kl": 0.0064716339111328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.34913917072228e-07, "loss": 0.0513, "reward": 0.5329655185341835, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5329655185341835, "reward_after_std": 0.9171682372689247, "reward_before_mean": 0.7907154634594917, "reward_before_std": 0.936184398829937, "reward_change_max": 0.0001185983419418335, "reward_change_mean": -0.2577499356120825, "reward_change_min": -0.5034973677247763, "reward_change_std": 0.2020270312204957, "reward_std": 0.9171682521700859, "rewards/cosine_scaled_reward": 0.13494104286655784, "rewards/format_reward": 0.5208333432674408, "step": 128 }, { "advantage_max": 1.209057331085205, "advantage_mean": 8.692344177774203e-09, "advantage_min": -0.6348308697342873, "advantage_std": 0.6960801109671593, "completion_length": 3328.5833435058594, "epoch": 0.14742857142857144, "grad_norm": 0.128533273935318, "kl": 0.007480621337890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.332771203643714e-07, "loss": 0.0069, "reward": -0.1649972628802061, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1649972628802061, "reward_after_std": 0.6960801407694817, "reward_before_mean": -0.044744182378053665, "reward_before_std": 0.7030451036989689, "reward_change_max": 0.0, "reward_change_mean": -0.12025306816212833, "reward_change_min": -0.23371572606265545, "reward_change_std": 0.09795773914083838, "reward_std": 0.6960801407694817, "rewards/cosine_scaled_reward": -0.13695543073117733, "rewards/format_reward": 0.2291666679084301, "step": 129 }, { "advantage_max": 0.7899751216173172, "advantage_mean": 1.0865430333240056e-08, "advantage_min": -0.48590800166130066, "advantage_std": 0.47238731384277344, "completion_length": 3010.666679382324, "epoch": 0.14857142857142858, "grad_norm": 0.08634471148252487, "kl": 0.006420135498046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.316216432703916e-07, "loss": 0.0314, "reward": -0.3267571162432432, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3267571162432432, "reward_after_std": 0.47238731011748314, "reward_before_mean": -0.22298824414610863, "reward_before_std": 0.4778023697435856, "reward_change_max": 0.0005714669823646545, "reward_change_mean": -0.10376887186430395, "reward_change_min": -0.20740794576704502, "reward_change_std": 0.08423587586730719, "reward_std": 0.47238731384277344, "rewards/cosine_scaled_reward": -0.24691079556941986, "rewards/format_reward": 0.27083333767950535, "step": 130 }, { "advantage_max": 1.1991299539804459, "advantage_mean": -1.490116185998147e-08, "advantage_min": -0.9366889521479607, "advantage_std": 0.7851066887378693, "completion_length": 2728.8750610351562, "epoch": 0.14971428571428572, "grad_norm": 0.18943481147289276, "kl": 0.0074214935302734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.299475664759068e-07, "loss": 0.0797, "reward": 0.4576035141944885, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4576035141944885, "reward_after_std": 0.7851067148149014, "reward_before_mean": 0.7081699203699827, "reward_before_std": 0.7892623916268349, "reward_change_max": 0.0005019381642341614, "reward_change_mean": -0.25056641083210707, "reward_change_min": -0.43722840771079063, "reward_change_std": 0.18447065260261297, "reward_std": 0.7851067595183849, "rewards/cosine_scaled_reward": 0.12491828389465809, "rewards/format_reward": 0.4583333395421505, "step": 131 }, { "advantage_max": 0.9315230771899223, "advantage_mean": 7.45058070794613e-09, "advantage_min": -0.8552741520106792, "advantage_std": 0.637313200160861, "completion_length": 2610.2083740234375, "epoch": 0.15085714285714286, "grad_norm": 0.10160277783870697, "kl": 0.0059833526611328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.282549715730579e-07, "loss": 0.0306, "reward": 0.23830265924334526, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23830265924334526, "reward_after_std": 0.6373131927102804, "reward_before_mean": 0.45372072607278824, "reward_before_std": 0.6567586157470942, "reward_change_max": 0.0003876909613609314, "reward_change_mean": -0.21541805751621723, "reward_change_min": -0.37372968532145023, "reward_change_std": 0.15087461285293102, "reward_std": 0.6373131982982159, "rewards/cosine_scaled_reward": 0.028943683952093124, "rewards/format_reward": 0.3958333395421505, "step": 132 }, { "advantage_max": 1.149038176983595, "advantage_mean": 8.07146260939362e-09, "advantage_min": -0.5314929522573948, "advantage_std": 0.6330554522573948, "completion_length": 3184.291702270508, "epoch": 0.152, "grad_norm": 0.1362348049879074, "kl": 0.009695053100585938, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.265439410565328e-07, "loss": 0.0222, "reward": -0.21828029211610556, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21828029211610556, "reward_after_std": 0.6330554746091366, "reward_before_mean": -0.1073903851211071, "reward_before_std": 0.6251240707933903, "reward_change_max": 0.0003124848008155823, "reward_change_mean": -0.11088989552808926, "reward_change_min": -0.215480525046587, "reward_change_std": 0.08133175503462553, "reward_std": 0.6330555006861687, "rewards/cosine_scaled_reward": -0.18911186209879816, "rewards/format_reward": 0.27083333395421505, "step": 133 }, { "advantage_max": 1.367484513670206, "advantage_mean": 1.1175871006408045e-08, "advantage_min": -0.7098233513534069, "advantage_std": 0.77448296174407, "completion_length": 2480.312568664551, "epoch": 0.15314285714285714, "grad_norm": 0.1691761612892151, "kl": 0.009679794311523438, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.248145583195447e-07, "loss": 0.0257, "reward": 0.31091918097808957, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.31091918097808957, "reward_after_std": 0.7744829468429089, "reward_before_mean": 0.5238788276910782, "reward_before_std": 0.7513154298067093, "reward_change_max": 0.0006301850080490112, "reward_change_mean": -0.2129596322774887, "reward_change_min": -0.3611560985445976, "reward_change_std": 0.14230560464784503, "reward_std": 0.774482972919941, "rewards/cosine_scaled_reward": -0.029727259650826454, "rewards/format_reward": 0.5833333395421505, "step": 134 }, { "advantage_max": 1.1842001006007195, "advantage_mean": 2.4835271617007493e-09, "advantage_min": -1.0657855421304703, "advantage_std": 0.839623736217618, "completion_length": 2050.625030517578, "epoch": 0.15428571428571428, "grad_norm": 0.12861235439777374, "kl": 0.00727081298828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.230669076497687e-07, "loss": 0.0173, "reward": 0.8601483590900898, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8601483590900898, "reward_after_std": 0.8396237269043922, "reward_before_mean": 1.1949367504566908, "reward_before_std": 0.8714061249047518, "reward_change_max": 0.00022689253091812134, "reward_change_mean": -0.33478840440511703, "reward_change_min": -0.5806877091526985, "reward_change_std": 0.23010531906038523, "reward_std": 0.8396237418055534, "rewards/cosine_scaled_reward": 0.2641350352205336, "rewards/format_reward": 0.6666666679084301, "step": 135 }, { "advantage_max": 1.2303773239254951, "advantage_mean": -1.1175870756607864e-08, "advantage_min": -0.9678524509072304, "advantage_std": 0.8509483598172665, "completion_length": 2895.1250610351562, "epoch": 0.15542857142857142, "grad_norm": 0.17453262209892273, "kl": 0.010608673095703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.213010742252327e-07, "loss": 0.0417, "reward": 0.2550869733095169, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2550869733095169, "reward_after_std": 0.8509483598172665, "reward_before_mean": 0.4606906305998564, "reward_before_std": 0.9013799019157887, "reward_change_max": 0.00351502001285553, "reward_change_mean": -0.20560368243604898, "reward_change_min": -0.4312790837138891, "reward_change_std": 0.18253358593210578, "reward_std": 0.8509483896195889, "rewards/cosine_scaled_reward": -0.009238028898835182, "rewards/format_reward": 0.4791666716337204, "step": 136 }, { "advantage_max": 1.0917166694998741, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.7868407480418682, "advantage_std": 0.6896526142954826, "completion_length": 2984.9584045410156, "epoch": 0.15657142857142858, "grad_norm": 0.1343890279531479, "kl": 0.010105133056640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.195171441101668e-07, "loss": 0.049, "reward": -0.131092662923038, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.131092662923038, "reward_after_std": 0.6896526180207729, "reward_before_mean": -3.441236913204193e-05, "reward_before_std": 0.7179115898907185, "reward_change_max": 8.6270272731781e-05, "reward_change_mean": -0.13105822540819645, "reward_change_min": -0.2660218197852373, "reward_change_std": 0.11597825866192579, "reward_std": 0.6896526291966438, "rewards/cosine_scaled_reward": -0.20835054852068424, "rewards/format_reward": 0.41666668094694614, "step": 137 }, { "advantage_max": 1.0515642911195755, "advantage_mean": -6.8296989730320945e-09, "advantage_min": -0.8598476573824883, "advantage_std": 0.7168281599879265, "completion_length": 2687.6250610351562, "epoch": 0.15771428571428572, "grad_norm": 0.16192981600761414, "kl": 0.00862884521484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.177152042508077e-07, "loss": 0.0639, "reward": 0.20580698736011982, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.20580698736011982, "reward_after_std": 0.7168281711637974, "reward_before_mean": 0.40934595093131065, "reward_before_std": 0.7489210702478886, "reward_change_max": 0.0, "reward_change_mean": -0.2035389570519328, "reward_change_min": -0.40829353407025337, "reward_change_std": 0.1601135954260826, "reward_std": 0.7168281897902489, "rewards/cosine_scaled_reward": -0.07657703198492527, "rewards/format_reward": 0.562500013038516, "step": 138 }, { "advantage_max": 1.2990404963493347, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.9247448444366455, "advantage_std": 0.8958325535058975, "completion_length": 3186.9375610351562, "epoch": 0.15885714285714286, "grad_norm": 0.16291646659374237, "kl": 0.010364532470703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.158953424711624e-07, "loss": 0.0531, "reward": 0.18419764749705791, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.18419764749705791, "reward_after_std": 0.8958325386047363, "reward_before_mean": 0.37362875789403915, "reward_before_std": 0.9554058127105236, "reward_change_max": 0.0, "reward_change_mean": -0.18943111412227154, "reward_change_min": -0.4572391249239445, "reward_change_std": 0.18466449715197086, "reward_std": 0.8958325833082199, "rewards/cosine_scaled_reward": -0.06318561546504498, "rewards/format_reward": 0.5000000111758709, "step": 139 }, { "advantage_max": 1.1372979544103146, "advantage_mean": 1.1796752963366686e-08, "advantage_min": -0.5056699030101299, "advantage_std": 0.625762702897191, "completion_length": 3152.000045776367, "epoch": 0.16, "grad_norm": 0.18616674840450287, "kl": 0.012786865234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.140576474687263e-07, "loss": 0.0573, "reward": 0.016207601875066757, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.016207601875066757, "reward_after_std": 0.6257627103477716, "reward_before_mean": 0.17444992996752262, "reward_before_std": 0.6075356099754572, "reward_change_max": 0.0001961737871170044, "reward_change_mean": -0.1582423378713429, "reward_change_min": -0.28335670568048954, "reward_change_std": 0.10457282629795372, "reward_std": 0.6257627215236425, "rewards/cosine_scaled_reward": -0.06902503967285156, "rewards/format_reward": 0.31250000931322575, "step": 140 }, { "advantage_max": 1.1669623665511608, "advantage_mean": -2.6697914601303552e-08, "advantage_min": -1.139278769493103, "advantage_std": 0.886767391115427, "completion_length": 2624.916702270508, "epoch": 0.16114285714285714, "grad_norm": 0.18260450661182404, "kl": 0.013813018798828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.122022088101613e-07, "loss": 0.0687, "reward": 0.37871203385293484, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.37871203385293484, "reward_after_std": 0.8867673929780722, "reward_before_mean": 0.6132042966783047, "reward_before_std": 0.95911661721766, "reward_change_max": 0.00043144822120666504, "reward_change_mean": -0.2344922535121441, "reward_change_min": -0.48418484814465046, "reward_change_std": 0.20474183838814497, "reward_std": 0.8867674265056849, "rewards/cosine_scaled_reward": 0.004518782487139106, "rewards/format_reward": 0.6041666772216558, "step": 141 }, { "advantage_max": 1.15086317807436, "advantage_mean": -8.69234451084111e-09, "advantage_min": -0.7766324654221535, "advantage_std": 0.7354513872414827, "completion_length": 2867.5209197998047, "epoch": 0.16228571428571428, "grad_norm": 0.13374853134155273, "kl": 0.011152267456054688, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.103291169269299e-07, "loss": 0.0346, "reward": 0.18106802669353783, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18106802669353783, "reward_after_std": 0.7354514040052891, "reward_before_mean": 0.375831738114357, "reward_before_std": 0.7478249240666628, "reward_change_max": 0.0001162216067314148, "reward_change_mean": -0.19476369954645634, "reward_change_min": -0.3674887605011463, "reward_change_std": 0.15106896962970495, "reward_std": 0.7354514189064503, "rewards/cosine_scaled_reward": -0.093334149569273, "rewards/format_reward": 0.5625000037252903, "step": 142 }, { "advantage_max": 1.1779975406825542, "advantage_mean": 1.8316011068941762e-08, "advantage_min": -0.7890665233135223, "advantage_std": 0.7509134970605373, "completion_length": 2685.1041870117188, "epoch": 0.16342857142857142, "grad_norm": 0.2118491530418396, "kl": 0.01227569580078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.084384631108882e-07, "loss": 0.0779, "reward": -0.044473139103502035, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.044473139103502035, "reward_after_std": 0.7509134896099567, "reward_before_mean": 0.10008539631962776, "reward_before_std": 0.7791899517178535, "reward_change_max": 0.0006205588579177856, "reward_change_mean": -0.1445585135370493, "reward_change_min": -0.31382193975150585, "reward_change_std": 0.13543755933642387, "reward_std": 0.7509135026484728, "rewards/cosine_scaled_reward": -0.18954064138233662, "rewards/format_reward": 0.47916667349636555, "step": 143 }, { "advantage_max": 1.6947645135223866, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.7990465760231018, "advantage_std": 0.9641420841217041, "completion_length": 3006.0833740234375, "epoch": 0.16457142857142856, "grad_norm": 0.16425053775310516, "kl": 0.01209259033203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.065303395098358e-07, "loss": 0.0509, "reward": 0.005044015124440193, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.005044015124440193, "reward_after_std": 0.9641420878469944, "reward_before_mean": 0.14007875975221395, "reward_before_std": 0.9769864324480295, "reward_change_max": 0.0006867051124572754, "reward_change_mean": -0.1350347544066608, "reward_change_min": -0.2908038944005966, "reward_change_std": 0.12375469121616334, "reward_std": 0.9641421064734459, "rewards/cosine_scaled_reward": -0.09662727918475866, "rewards/format_reward": 0.3333333358168602, "step": 144 }, { "advantage_max": 1.1794416904449463, "advantage_mean": -3.4769377377230626e-08, "advantage_min": -0.8797503411769867, "advantage_std": 0.7782802954316139, "completion_length": 2043.2708702087402, "epoch": 0.1657142857142857, "grad_norm": 0.2015761435031891, "kl": 0.009426116943359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.046048391230247e-07, "loss": 0.0459, "reward": 0.5311450399458408, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5311450399458408, "reward_after_std": 0.7782802917063236, "reward_before_mean": 0.7967919856309891, "reward_before_std": 0.7834953926503658, "reward_change_max": 0.0005800426006317139, "reward_change_mean": -0.2656469848006964, "reward_change_min": -0.47148168832063675, "reward_change_std": 0.18986122868955135, "reward_std": 0.7782803103327751, "rewards/cosine_scaled_reward": 0.06506266118958592, "rewards/format_reward": 0.6666666734963655, "step": 145 }, { "advantage_max": 1.172963209450245, "advantage_mean": -9.313225912688239e-09, "advantage_min": -0.8490905277431011, "advantage_std": 0.7582556456327438, "completion_length": 2512.8334197998047, "epoch": 0.16685714285714287, "grad_norm": 0.17811964452266693, "kl": 0.00946807861328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.026620557966279e-07, "loss": 0.101, "reward": 0.08462646137923002, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08462646137923002, "reward_after_std": 0.7582556679844856, "reward_before_mean": 0.25733761489391327, "reward_before_std": 0.7809236831963062, "reward_change_max": 0.0012138262391090393, "reward_change_mean": -0.1727111730724573, "reward_change_min": -0.3685157373547554, "reward_change_std": 0.1483112219721079, "reward_std": 0.7582557089626789, "rewards/cosine_scaled_reward": -0.15258119627833366, "rewards/format_reward": 0.5625000093132257, "step": 146 }, { "advantage_max": 1.589494600892067, "advantage_mean": 7.450580596923828e-09, "advantage_min": -0.804422177374363, "advantage_std": 0.8985835202038288, "completion_length": 2880.8541946411133, "epoch": 0.168, "grad_norm": 0.1284470409154892, "kl": 0.016021728515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 9.007020842191634e-07, "loss": -0.0127, "reward": 0.02172628662083298, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.02172628662083298, "reward_after_std": 0.8985835202038288, "reward_before_mean": 0.16462149005383253, "reward_before_std": 0.9031895622611046, "reward_change_max": 0.0, "reward_change_mean": -0.14289520680904388, "reward_change_min": -0.299978893250227, "reward_change_std": 0.12021067598834634, "reward_std": 0.8985835537314415, "rewards/cosine_scaled_reward": -0.09477258916012943, "rewards/format_reward": 0.3541666753590107, "step": 147 }, { "advantage_max": 0.8648317083716393, "advantage_mean": -1.4901161582425715e-08, "advantage_min": -0.6253716349601746, "advantage_std": 0.5511173605918884, "completion_length": 2506.979202270508, "epoch": 0.16914285714285715, "grad_norm": 0.09233488887548447, "kl": 0.01415252685546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.987250199168808e-07, "loss": 0.0221, "reward": 0.08282577991485596, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08282577991485596, "reward_after_std": 0.5511173643171787, "reward_before_mean": 0.2693531382828951, "reward_before_std": 0.5502285584807396, "reward_change_max": 0.00019822269678115845, "reward_change_mean": -0.18652736581861973, "reward_change_min": -0.3528789635747671, "reward_change_std": 0.13402172224596143, "reward_std": 0.5511173717677593, "rewards/cosine_scaled_reward": -0.12574010342359543, "rewards/format_reward": 0.5208333395421505, "step": 148 }, { "advantage_max": 1.5730044171214104, "advantage_mean": -4.9670543234014986e-09, "advantage_min": -0.9332806505262852, "advantage_std": 0.9551067128777504, "completion_length": 2842.354232788086, "epoch": 0.1702857142857143, "grad_norm": 0.18471087515354156, "kl": 0.011203765869140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.967309592491052e-07, "loss": 0.0665, "reward": 0.16992619819939137, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16992619819939137, "reward_after_std": 0.9551067017018795, "reward_before_mean": 0.3444363549351692, "reward_before_std": 0.9797604568302631, "reward_change_max": 0.0006968006491661072, "reward_change_mean": -0.17451016139239073, "reward_change_min": -0.3366739135235548, "reward_change_std": 0.14999074442312121, "reward_std": 0.955106757581234, "rewards/cosine_scaled_reward": -0.08819849230349064, "rewards/format_reward": 0.5208333432674408, "step": 149 }, { "advantage_max": 1.3253266364336014, "advantage_mean": 1.862645371275562e-09, "advantage_min": -0.8598656915128231, "advantage_std": 0.8570183347910643, "completion_length": 3051.354217529297, "epoch": 0.17142857142857143, "grad_norm": 0.16119371354579926, "kl": 0.01721954345703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.9471999940354e-07, "loss": 0.0283, "reward": 0.06531700119376183, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06531700119376183, "reward_after_std": 0.8570183347910643, "reward_before_mean": 0.2267507165670395, "reward_before_std": 0.8986724838614464, "reward_change_max": 0.00046025216579437256, "reward_change_mean": -0.16143368370831013, "reward_change_min": -0.39323098957538605, "reward_change_std": 0.1558735342696309, "reward_std": 0.857018357142806, "rewards/cosine_scaled_reward": -0.0324579905718565, "rewards/format_reward": 0.29166667349636555, "step": 150 }, { "advantage_max": 1.2163867093622684, "advantage_mean": 3.725290964595729e-09, "advantage_min": -0.8823812305927277, "advantage_std": 0.8030950203537941, "completion_length": 2785.750030517578, "epoch": 0.17257142857142857, "grad_norm": 0.3681621253490448, "kl": 0.01708221435546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.926922383915315e-07, "loss": 0.105, "reward": 0.31836192682385445, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.31836192682385445, "reward_after_std": 0.803095031529665, "reward_before_mean": 0.538421243429184, "reward_before_std": 0.8262094706296921, "reward_change_max": 0.0, "reward_change_mean": -0.22005930682644248, "reward_change_min": -0.4244182799011469, "reward_change_std": 0.1743629314005375, "reward_std": 0.8030950464308262, "rewards/cosine_scaled_reward": 0.019210622180253267, "rewards/format_reward": 0.5000000074505806, "step": 151 }, { "advantage_max": 0.8419327363371849, "advantage_mean": -1.5522040874849097e-09, "advantage_min": -0.4797561429440975, "advantage_std": 0.5182995554059744, "completion_length": 2828.6875381469727, "epoch": 0.1737142857142857, "grad_norm": 0.1370847374200821, "kl": 0.019405364990234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.906477750432903e-07, "loss": 0.0417, "reward": -0.2341057602316141, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2341057602316141, "reward_after_std": 0.5182995591312647, "reward_before_mean": -0.11436109989881516, "reward_before_std": 0.5233677849173546, "reward_change_max": 0.0006811320781707764, "reward_change_mean": -0.1197446659207344, "reward_change_min": -0.268768809735775, "reward_change_std": 0.10332783777266741, "reward_std": 0.5182995777577162, "rewards/cosine_scaled_reward": -0.23426389275118709, "rewards/format_reward": 0.35416666977107525, "step": 152 }, { "advantage_max": 0.7851564697921276, "advantage_mean": -1.2728075426959862e-08, "advantage_min": -0.46787630394101143, "advantage_std": 0.47235927172005177, "completion_length": 2967.645866394043, "epoch": 0.17485714285714285, "grad_norm": 0.08595111966133118, "kl": 0.02392578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.88586709003076e-07, "loss": 0.0332, "reward": -0.20785479061305523, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.20785479061305523, "reward_after_std": 0.4723592661321163, "reward_before_mean": -0.07975793443620205, "reward_before_std": 0.46797456219792366, "reward_change_max": 0.0, "reward_change_mean": -0.1280968664214015, "reward_change_min": -0.24486039392650127, "reward_change_std": 0.09459134982898831, "reward_std": 0.4723592773079872, "rewards/cosine_scaled_reward": -0.17529564630240202, "rewards/format_reward": 0.2708333395421505, "step": 153 }, { "advantage_max": 1.1741655804216862, "advantage_mean": -2.2351741901793787e-08, "advantage_min": -1.1094277240335941, "advantage_std": 0.826783861964941, "completion_length": 3278.750030517578, "epoch": 0.176, "grad_norm": 0.14467167854309082, "kl": 0.0132904052734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.865091407243394e-07, "loss": 0.0375, "reward": 0.3852272480726242, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3852272480726242, "reward_after_std": 0.8267838656902313, "reward_before_mean": 0.622010350227356, "reward_before_std": 0.8740377686917782, "reward_change_max": 0.000317499041557312, "reward_change_mean": -0.23678309470415115, "reward_change_min": -0.453342555090785, "reward_change_std": 0.1874784966930747, "reward_std": 0.8267839029431343, "rewards/cosine_scaled_reward": 0.10267183370888233, "rewards/format_reward": 0.416666679084301, "step": 154 }, { "advantage_max": 1.361569032073021, "advantage_mean": -7.761021270091817e-09, "advantage_min": -0.9261196851730347, "advantage_std": 0.8577107917517424, "completion_length": 2665.6458435058594, "epoch": 0.17714285714285713, "grad_norm": 0.1313173472881317, "kl": 0.01583099365234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.844151714648274e-07, "loss": 0.0112, "reward": 0.3461736086755991, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3461736086755991, "reward_after_std": 0.8577107768505812, "reward_before_mean": 0.5659867618232965, "reward_before_std": 0.8785454705357552, "reward_change_max": 0.0, "reward_change_mean": -0.21981315221637487, "reward_change_min": -0.45075549371540546, "reward_change_std": 0.17330361530184746, "reward_std": 0.857710812240839, "rewards/cosine_scaled_reward": 0.06424337532371283, "rewards/format_reward": 0.43750000558793545, "step": 155 }, { "advantage_max": 1.3500538542866707, "advantage_mean": -1.8626449271863521e-09, "advantage_min": -0.7571912333369255, "advantage_std": 0.7873305641114712, "completion_length": 3174.4583587646484, "epoch": 0.1782857142857143, "grad_norm": 0.1772507131099701, "kl": 0.015796661376953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.823049032816478e-07, "loss": 0.0748, "reward": -0.021374725736677647, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.021374725736677647, "reward_after_std": 0.7873305678367615, "reward_before_mean": 0.12219942547380924, "reward_before_std": 0.7977100722491741, "reward_change_max": 0.0009806975722312927, "reward_change_mean": -0.1435741283930838, "reward_change_min": -0.3106989674270153, "reward_change_std": 0.12052066763862967, "reward_std": 0.7873306162655354, "rewards/cosine_scaled_reward": -0.07431696751154959, "rewards/format_reward": 0.27083334140479565, "step": 156 }, { "advantage_max": 1.0542680732905865, "advantage_mean": -4.656612984099695e-09, "advantage_min": -0.7321070358157158, "advantage_std": 0.6456223912537098, "completion_length": 3157.729217529297, "epoch": 0.17942857142857144, "grad_norm": 0.13495060801506042, "kl": 0.0186614990234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.801784390262943e-07, "loss": 0.0185, "reward": 0.0226963022723794, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0226963022723794, "reward_after_std": 0.6456223763525486, "reward_before_mean": 0.18799853324890137, "reward_before_std": 0.6520291641354561, "reward_change_max": 2.9131770133972168e-05, "reward_change_mean": -0.16530223470181227, "reward_change_min": -0.2978616785258055, "reward_change_std": 0.1206301860511303, "reward_std": 0.6456223838031292, "rewards/cosine_scaled_reward": -0.12475074303802103, "rewards/format_reward": 0.4375000111758709, "step": 157 }, { "advantage_max": 1.296987421810627, "advantage_mean": -1.800557003495129e-08, "advantage_min": -0.8344993442296982, "advantage_std": 0.8205406442284584, "completion_length": 3240.8333740234375, "epoch": 0.18057142857142858, "grad_norm": 0.17755259573459625, "kl": 0.018035888671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.780358823396352e-07, "loss": 0.055, "reward": 0.36690380051732063, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.36690380051732063, "reward_after_std": 0.8205406330525875, "reward_before_mean": 0.5930448030121624, "reward_before_std": 0.8394192270934582, "reward_change_max": 0.000453360378742218, "reward_change_mean": -0.22614100854843855, "reward_change_min": -0.41666679829359055, "reward_change_std": 0.16973178228363395, "reward_std": 0.8205406330525875, "rewards/cosine_scaled_reward": 0.08818905614316463, "rewards/format_reward": 0.41666667722165585, "step": 158 }, { "advantage_max": 0.9995781295001507, "advantage_mean": 2.2972624302841638e-08, "advantage_min": -0.6199997216463089, "advantage_std": 0.6150403209030628, "completion_length": 3311.500030517578, "epoch": 0.18171428571428572, "grad_norm": 0.11936888843774796, "kl": 0.02362060546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.758773376468604e-07, "loss": 0.0342, "reward": -0.33629678376019, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.33629678376019, "reward_after_std": 0.6150403171777725, "reward_before_mean": -0.24361892230808735, "reward_before_std": 0.6399921141564846, "reward_change_max": 0.001079276204109192, "reward_change_mean": -0.09267783933319151, "reward_change_min": -0.24926885031163692, "reward_change_std": 0.10010573221370578, "reward_std": 0.6150403209030628, "rewards/cosine_scaled_reward": -0.22597613092511892, "rewards/format_reward": 0.20833333767950535, "step": 159 }, { "advantage_max": 1.270157516002655, "advantage_mean": -1.1796753074388988e-08, "advantage_min": -0.9037523716688156, "advantage_std": 0.832497738301754, "completion_length": 2993.0625228881836, "epoch": 0.18285714285714286, "grad_norm": 0.15524564683437347, "kl": 0.02504730224609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.737029101523929e-07, "loss": 0.0148, "reward": 0.12219192460179329, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12219192460179329, "reward_after_std": 0.8324977234005928, "reward_before_mean": 0.29852936416864395, "reward_before_std": 0.8750162459909916, "reward_change_max": 0.00012797117233276367, "reward_change_mean": -0.1763374567963183, "reward_change_min": -0.36676223762333393, "reward_change_std": 0.1543191159144044, "reward_std": 0.8324977271258831, "rewards/cosine_scaled_reward": -0.006985316518694162, "rewards/format_reward": 0.31250000558793545, "step": 160 }, { "advantage_max": 1.1460592560470104, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.7549284622073174, "advantage_std": 0.7234622985124588, "completion_length": 2890.104232788086, "epoch": 0.184, "grad_norm": 0.14694982767105103, "kl": 0.0247650146484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.715127058347614e-07, "loss": 0.0257, "reward": 0.1656032521277666, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1656032521277666, "reward_after_std": 0.72346231341362, "reward_before_mean": 0.3562948019243777, "reward_before_std": 0.733609389513731, "reward_change_max": 0.0, "reward_change_mean": -0.19069154560565948, "reward_change_min": -0.4029949326068163, "reward_change_std": 0.15449916571378708, "reward_std": 0.7234623432159424, "rewards/cosine_scaled_reward": -0.0406026104465127, "rewards/format_reward": 0.4375000111758709, "step": 161 }, { "advantage_max": 1.2185542285442352, "advantage_mean": 1.8626452047421083e-08, "advantage_min": -0.7333599925041199, "advantage_std": 0.7422689385712147, "completion_length": 3422.4166870117188, "epoch": 0.18514285714285714, "grad_norm": 0.14781790971755981, "kl": 0.028289794921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.693068314414344e-07, "loss": 0.0316, "reward": -0.17666981369256973, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.17666981369256973, "reward_after_std": 0.7422689348459244, "reward_before_mean": -0.05947569012641907, "reward_before_std": 0.7710359916090965, "reward_change_max": 0.0006634965538978577, "reward_change_mean": -0.11719411658123136, "reward_change_min": -0.29207202047109604, "reward_change_std": 0.11601532436907291, "reward_std": 0.742268942296505, "rewards/cosine_scaled_reward": -0.14432118041440845, "rewards/format_reward": 0.2291666753590107, "step": 162 }, { "advantage_max": 0.8001459985971451, "advantage_mean": -2.2662182574961776e-08, "advantage_min": -0.8173674866557121, "advantage_std": 0.5539420563727617, "completion_length": 2860.375030517578, "epoch": 0.18628571428571428, "grad_norm": 0.12430832535028458, "kl": 0.02669525146484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.670853944836176e-07, "loss": 0.0204, "reward": 0.42752911522984505, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.42752911522984505, "reward_after_std": 0.5539420824497938, "reward_before_mean": 0.6881049210205674, "reward_before_std": 0.5508717447519302, "reward_change_max": 0.0014692693948745728, "reward_change_mean": -0.2605757915880531, "reward_change_min": -0.4086344186216593, "reward_change_std": 0.16740787355229259, "reward_std": 0.5539420917630196, "rewards/cosine_scaled_reward": 0.08363579027354717, "rewards/format_reward": 0.5208333358168602, "step": 163 }, { "advantage_max": 0.8323970772325993, "advantage_mean": 1.428027945893362e-08, "advantage_min": -0.6382769756019115, "advantage_std": 0.5268633058294654, "completion_length": 2665.3125610351562, "epoch": 0.18742857142857142, "grad_norm": 0.1152673214673996, "kl": 0.02524566650390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.648485032310144e-07, "loss": -0.0005, "reward": 0.1746742830146104, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1746742830146104, "reward_after_std": 0.5268633244559169, "reward_before_mean": 0.37982947938144207, "reward_before_std": 0.509920141659677, "reward_change_max": 0.0007155910134315491, "reward_change_mean": -0.20515516120940447, "reward_change_min": -0.32288478687405586, "reward_change_std": 0.13307965802960098, "reward_std": 0.5268633281812072, "rewards/cosine_scaled_reward": -0.04966861708089709, "rewards/format_reward": 0.47916666977107525, "step": 164 }, { "advantage_max": 1.1803306639194489, "advantage_mean": 3.725290742551124e-09, "advantage_min": -0.9403169602155685, "advantage_std": 0.7902816981077194, "completion_length": 3261.1458740234375, "epoch": 0.18857142857142858, "grad_norm": 0.19152827560901642, "kl": 0.03424072265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.625962667065487e-07, "loss": 0.0267, "reward": 0.21183019503951073, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21183019503951073, "reward_after_std": 0.7902816887944937, "reward_before_mean": 0.41182703617960215, "reward_before_std": 0.823692075908184, "reward_change_max": 0.00022941827774047852, "reward_change_mean": -0.1999968495219946, "reward_change_min": -0.38202267698943615, "reward_change_std": 0.1634599519893527, "reward_std": 0.7902817092835903, "rewards/cosine_scaled_reward": -0.0024198126047849655, "rewards/format_reward": 0.4166666753590107, "step": 165 }, { "advantage_max": 1.3109552264213562, "advantage_mean": 1.8626452602532595e-09, "advantage_min": -1.1460014283657074, "advantage_std": 0.8773243688046932, "completion_length": 3285.479217529297, "epoch": 0.18971428571428572, "grad_norm": 0.164150208234787, "kl": 0.0212554931640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.603287946810513e-07, "loss": 0.0341, "reward": 0.1956256345147267, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1956256345147267, "reward_after_std": 0.8773243799805641, "reward_before_mean": 0.38636994548141956, "reward_before_std": 0.9264193251729012, "reward_change_max": 0.001349225640296936, "reward_change_mean": -0.1907443031668663, "reward_change_min": -0.39306593127548695, "reward_change_std": 0.16858274303376675, "reward_std": 0.877324391156435, "rewards/cosine_scaled_reward": -0.015148364007472992, "rewards/format_reward": 0.416666679084301, "step": 166 }, { "advantage_max": 1.3294166699051857, "advantage_mean": 9.313225746154785e-10, "advantage_min": -0.8419655784964561, "advantage_std": 0.840934332460165, "completion_length": 2786.0833435058594, "epoch": 0.19085714285714286, "grad_norm": 0.16040551662445068, "kl": 0.0241241455078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.580461976679099e-07, "loss": 0.0199, "reward": 0.32725309021770954, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.32725309021770954, "reward_after_std": 0.8409343659877777, "reward_before_mean": 0.5443729944527149, "reward_before_std": 0.8524107746779919, "reward_change_max": 0.0, "reward_change_mean": -0.21711986884474754, "reward_change_min": -0.4231265429407358, "reward_change_std": 0.16874686954542994, "reward_std": 0.8409343846142292, "rewards/cosine_scaled_reward": -0.04031352582387626, "rewards/format_reward": 0.6250000074505806, "step": 167 }, { "advantage_max": 1.0663059242069721, "advantage_mean": -1.4280280014045132e-08, "advantage_min": -1.0032785832881927, "advantage_std": 0.763172097504139, "completion_length": 3021.375030517578, "epoch": 0.192, "grad_norm": 0.19575634598731995, "kl": 0.0220794677734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.557485869176825e-07, "loss": 0.0122, "reward": 0.5800081314519048, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5800081314519048, "reward_after_std": 0.7631721086800098, "reward_before_mean": 0.8609075583517551, "reward_before_std": 0.791071143001318, "reward_change_max": 0.0, "reward_change_mean": -0.2808994185179472, "reward_change_min": -0.4914733041077852, "reward_change_std": 0.20270747411996126, "reward_std": 0.763172123581171, "rewards/cosine_scaled_reward": 0.12837043032050133, "rewards/format_reward": 0.604166679084301, "step": 168 }, { "advantage_max": 1.312363252043724, "advantage_mean": -2.6077032866389516e-08, "advantage_min": -0.669765330851078, "advantage_std": 0.7563849091529846, "completion_length": 2573.1666870117188, "epoch": 0.19314285714285714, "grad_norm": 0.13373582065105438, "kl": 0.0294952392578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.534360744126753e-07, "loss": 0.0235, "reward": 0.799824794754386, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.799824794754386, "reward_after_std": 0.7563849203288555, "reward_before_mean": 1.116827798075974, "reward_before_std": 0.697244044393301, "reward_change_max": 0.0002922937273979187, "reward_change_mean": -0.3170030089095235, "reward_change_min": -0.4931119568645954, "reward_change_std": 0.1916311988607049, "reward_std": 0.7563849315047264, "rewards/cosine_scaled_reward": 0.2563305450603366, "rewards/format_reward": 0.6041666679084301, "step": 169 }, { "advantage_max": 1.0451821275055408, "advantage_mean": -2.9181440652781276e-08, "advantage_min": -0.8890182189643383, "advantage_std": 0.7311634887009859, "completion_length": 2607.2083587646484, "epoch": 0.19428571428571428, "grad_norm": 0.14642181992530823, "kl": 0.02374267578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.511087728614862e-07, "loss": 0.0873, "reward": 0.36113011091947556, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.36113011091947556, "reward_after_std": 0.7311634700745344, "reward_before_mean": 0.5959527250379324, "reward_before_std": 0.7579219676554203, "reward_change_max": 0.00111331045627594, "reward_change_mean": -0.23482263693585992, "reward_change_min": -0.40725141018629074, "reward_change_std": 0.17611850704997778, "reward_std": 0.7311634942889214, "rewards/cosine_scaled_reward": 0.06880968064069748, "rewards/format_reward": 0.4583333358168602, "step": 170 }, { "advantage_max": 0.9982545115053654, "advantage_mean": 1.3659397890553038e-08, "advantage_min": -0.5660811513662338, "advantage_std": 0.5901012774556875, "completion_length": 2793.4375610351562, "epoch": 0.19542857142857142, "grad_norm": 0.11787493526935577, "kl": 0.022216796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.487667956935087e-07, "loss": 0.0222, "reward": 0.3895031474530697, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3895031474530697, "reward_after_std": 0.5901012625545263, "reward_before_mean": 0.6341063280124217, "reward_before_std": 0.5479781590402126, "reward_change_max": 0.0, "reward_change_mean": -0.24460315611213446, "reward_change_min": -0.4002964645624161, "reward_change_std": 0.15657793753780425, "reward_std": 0.5901012737303972, "rewards/cosine_scaled_reward": 0.08788650901988149, "rewards/format_reward": 0.4583333395421505, "step": 171 }, { "advantage_max": 1.3995696865022182, "advantage_mean": -8.071463275527435e-09, "advantage_min": -0.6793707236647606, "advantage_std": 0.7801363468170166, "completion_length": 3012.583396911621, "epoch": 0.19657142857142856, "grad_norm": 0.16663214564323425, "kl": 0.0301666259765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.464102570534061e-07, "loss": 0.0252, "reward": 0.225158647634089, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.225158647634089, "reward_after_std": 0.7801363244652748, "reward_before_mean": 0.4188475925475359, "reward_before_std": 0.751838456839323, "reward_change_max": 0.00022362172603607178, "reward_change_mean": -0.1936889262869954, "reward_change_min": -0.3318144492805004, "reward_change_std": 0.128281413577497, "reward_std": 0.7801363281905651, "rewards/cosine_scaled_reward": 0.03234043810516596, "rewards/format_reward": 0.35416666977107525, "step": 172 }, { "advantage_max": 1.511083673685789, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.8546476252377033, "advantage_std": 0.8926947899162769, "completion_length": 2415.3334045410156, "epoch": 0.1977142857142857, "grad_norm": 0.28262859582901, "kl": 0.0305938720703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.440392717955475e-07, "loss": 0.1065, "reward": 0.16139882639981806, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16139882639981806, "reward_after_std": 0.8926948197185993, "reward_before_mean": 0.335021841339767, "reward_before_std": 0.9010801576077938, "reward_change_max": 0.00015825778245925903, "reward_change_mean": -0.17362301982939243, "reward_change_min": -0.353594034910202, "reward_change_std": 0.14097668789327145, "reward_std": 0.8926948495209217, "rewards/cosine_scaled_reward": -0.10332241887226701, "rewards/format_reward": 0.5416666734963655, "step": 173 }, { "advantage_max": 1.3023353144526482, "advantage_mean": -6.2088170160734535e-09, "advantage_min": -0.7510321214795113, "advantage_std": 0.7634684294462204, "completion_length": 2740.0208892822266, "epoch": 0.19885714285714284, "grad_norm": 0.17239248752593994, "kl": 0.0318450927734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.416539554784089e-07, "loss": 0.0457, "reward": 0.29083897173404694, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.29083897173404694, "reward_after_std": 0.763468399643898, "reward_before_mean": 0.5023742839694023, "reward_before_std": 0.7486766390502453, "reward_change_max": 0.0004704892635345459, "reward_change_mean": -0.21153535088524222, "reward_change_min": -0.40437733568251133, "reward_change_std": 0.1530080554075539, "reward_std": 0.7634684219956398, "rewards/cosine_scaled_reward": 0.0011871512979269028, "rewards/format_reward": 0.5000000093132257, "step": 174 }, { "advantage_max": 0.9298867955803871, "advantage_mean": -2.1730860444435507e-09, "advantage_min": -0.8272735141217709, "advantage_std": 0.655903734266758, "completion_length": 2807.604217529297, "epoch": 0.2, "grad_norm": 0.10371315479278564, "kl": 0.025909423828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.392544243589427e-07, "loss": 0.0063, "reward": 0.28335341438651085, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.28335341438651085, "reward_after_std": 0.6559037305414677, "reward_before_mean": 0.5088309608399868, "reward_before_std": 0.682648167014122, "reward_change_max": 0.0001935139298439026, "reward_change_mean": -0.22547755111008883, "reward_change_min": -0.3959792386740446, "reward_change_std": 0.16470134910196066, "reward_std": 0.6559037454426289, "rewards/cosine_scaled_reward": 0.0252488125115633, "rewards/format_reward": 0.4583333358168602, "step": 175 }, { "advantage_max": 1.262831836938858, "advantage_mean": 1.8626457043424693e-09, "advantage_min": -0.7514547593891621, "advantage_std": 0.7699050232768059, "completion_length": 2787.4792251586914, "epoch": 0.20114285714285715, "grad_norm": 0.15177121758460999, "kl": 0.0307464599609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.368407953869103e-07, "loss": 0.0253, "reward": 0.07663825154304504, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07663825154304504, "reward_after_std": 0.7699050419032574, "reward_before_mean": 0.24338941648602486, "reward_before_std": 0.7808955702930689, "reward_change_max": 0.0009336844086647034, "reward_change_mean": -0.1667511600535363, "reward_change_min": -0.3371252492070198, "reward_change_std": 0.13754934258759022, "reward_std": 0.7699050642549992, "rewards/cosine_scaled_reward": -0.09705529967322946, "rewards/format_reward": 0.43750000931322575, "step": 176 }, { "advantage_max": 1.066458322107792, "advantage_mean": -9.62366644707835e-09, "advantage_min": -0.5885743796825409, "advantage_std": 0.6244613640010357, "completion_length": 2937.9583892822266, "epoch": 0.2022857142857143, "grad_norm": 0.19181039929389954, "kl": 0.03475189208984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.344131861991828e-07, "loss": 0.0348, "reward": 0.15816564857959747, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.15816564857959747, "reward_after_std": 0.6244613789021969, "reward_before_mean": 0.35169607895659283, "reward_before_std": 0.6057868972420692, "reward_change_max": 0.0003667473793029785, "reward_change_mean": -0.19353043381124735, "reward_change_min": -0.34786077216267586, "reward_change_std": 0.1301519968546927, "reward_std": 0.6244613826274872, "rewards/cosine_scaled_reward": -0.09498530067503452, "rewards/format_reward": 0.5416666772216558, "step": 177 }, { "advantage_max": 1.469532623887062, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.9144716300070286, "advantage_std": 0.9177613109350204, "completion_length": 2900.750045776367, "epoch": 0.20342857142857143, "grad_norm": 0.19331806898117065, "kl": 0.0428466796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.319717151140072e-07, "loss": 0.0269, "reward": 0.17894789204001427, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17894789204001427, "reward_after_std": 0.9177612960338593, "reward_before_mean": 0.3592214472591877, "reward_before_std": 0.949314933270216, "reward_change_max": 0.0, "reward_change_mean": -0.18027354963123798, "reward_change_min": -0.3899546228349209, "reward_change_std": 0.15737597597762942, "reward_std": 0.9177613146603107, "rewards/cosine_scaled_reward": -0.02872262359596789, "rewards/format_reward": 0.4166666716337204, "step": 178 }, { "advantage_max": 0.8261377401649952, "advantage_mean": 4.6566128730773926e-09, "advantage_min": -0.5832706466317177, "advantage_std": 0.5149836093187332, "completion_length": 3003.1666870117188, "epoch": 0.20457142857142857, "grad_norm": 0.11742495745420456, "kl": 0.03636932373046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.295165011252396e-07, "loss": 0.0145, "reward": -0.13531387597322464, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.13531387597322464, "reward_after_std": 0.5149836242198944, "reward_before_mean": 0.007526550441980362, "reward_before_std": 0.5182395502924919, "reward_change_max": 0.00024544447660446167, "reward_change_mean": -0.14284042920917273, "reward_change_min": -0.27917319908738136, "reward_change_std": 0.10761795938014984, "reward_std": 0.5149836428463459, "rewards/cosine_scaled_reward": -0.17332006711512804, "rewards/format_reward": 0.35416666977107525, "step": 179 }, { "advantage_max": 1.2475721836090088, "advantage_mean": -4.34617203337595e-08, "advantage_min": -0.7720734775066376, "advantage_std": 0.7830089889466763, "completion_length": 2410.4166870117188, "epoch": 0.2057142857142857, "grad_norm": 0.13700564205646515, "kl": 0.0419769287109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.270476638965461e-07, "loss": 0.0174, "reward": 0.5047831274569035, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5047831274569035, "reward_after_std": 0.7830089889466763, "reward_before_mean": 0.7621986456215382, "reward_before_std": 0.776782713830471, "reward_change_max": 0.00013273954391479492, "reward_change_mean": -0.25741552095860243, "reward_change_min": -0.4544454291462898, "reward_change_std": 0.1749586989171803, "reward_std": 0.7830089963972569, "rewards/cosine_scaled_reward": 0.0894326251000166, "rewards/format_reward": 0.583333333954215, "step": 180 }, { "advantage_max": 1.4035573303699493, "advantage_mean": 2.8560559584001055e-08, "advantage_min": -0.8126705698668957, "advantage_std": 0.815248891711235, "completion_length": 3139.3750610351562, "epoch": 0.20685714285714285, "grad_norm": 0.19324177503585815, "kl": 0.04010009765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.245653237555705e-07, "loss": 0.0419, "reward": -0.03209049755241722, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.03209049755241722, "reward_after_std": 0.8152489028871059, "reward_before_mean": 0.10773651394993067, "reward_before_std": 0.8276308216154575, "reward_change_max": 0.001385033130645752, "reward_change_mean": -0.13982698926702142, "reward_change_min": -0.2665670830756426, "reward_change_std": 0.1115828175097704, "reward_std": 0.8152489364147186, "rewards/cosine_scaled_reward": -0.11279841396026313, "rewards/format_reward": 0.3333333395421505, "step": 181 }, { "advantage_max": 1.6242863051593304, "advantage_mean": -2.173086155465853e-09, "advantage_min": -1.124721072614193, "advantage_std": 1.0126835405826569, "completion_length": 2848.0208892822266, "epoch": 0.208, "grad_norm": 0.1714603304862976, "kl": 0.0349273681640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.220696016880687e-07, "loss": -0.001, "reward": 0.29942611791193485, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.29942611791193485, "reward_after_std": 1.0126835703849792, "reward_before_mean": 0.4980975305661559, "reward_before_std": 1.0505497753620148, "reward_change_max": 0.0021339207887649536, "reward_change_mean": -0.19867143034934998, "reward_change_min": -0.41653104312717915, "reward_change_std": 0.17115018144249916, "reward_std": 1.0126835741102695, "rewards/cosine_scaled_reward": 0.009465432725846767, "rewards/format_reward": 0.479166679084301, "step": 182 }, { "advantage_max": 1.3906030505895615, "advantage_mean": -1.0554989438027462e-08, "advantage_min": -0.9447256699204445, "advantage_std": 0.9372784420847893, "completion_length": 2947.3750762939453, "epoch": 0.20914285714285713, "grad_norm": 0.22748148441314697, "kl": 0.0528564453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.195606193320136e-07, "loss": 0.0722, "reward": 0.18109873123466969, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18109873123466969, "reward_after_std": 0.9372784420847893, "reward_before_mean": 0.36509622633457184, "reward_before_std": 1.000947393476963, "reward_change_max": 0.0003439560532569885, "reward_change_mean": -0.18399752443656325, "reward_change_min": -0.46078629791736603, "reward_change_std": 0.1898609809577465, "reward_std": 0.9372784793376923, "rewards/cosine_scaled_reward": -0.05703521566465497, "rewards/format_reward": 0.479166679084301, "step": 183 }, { "advantage_max": 0.8160999454557896, "advantage_mean": 1.6763806731656672e-08, "advantage_min": -0.5095634013414383, "advantage_std": 0.5207002814859152, "completion_length": 3019.2916946411133, "epoch": 0.2102857142857143, "grad_norm": 0.10211692750453949, "kl": 0.046142578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.170384989716657e-07, "loss": 0.0104, "reward": -0.21611788868904114, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21611788868904114, "reward_after_std": 0.5207002703100443, "reward_before_mean": -0.09095852356404066, "reward_before_std": 0.535466443747282, "reward_change_max": 0.002794913947582245, "reward_change_mean": -0.1251593711785972, "reward_change_min": -0.25952029787003994, "reward_change_std": 0.10204625176265836, "reward_std": 0.5207002777606249, "rewards/cosine_scaled_reward": -0.20172925852239132, "rewards/format_reward": 0.31250000186264515, "step": 184 }, { "advantage_max": 0.9886697083711624, "advantage_mean": 1.7384688466570708e-08, "advantage_min": -0.6492449138313532, "advantage_std": 0.6458341721445322, "completion_length": 2825.562545776367, "epoch": 0.21142857142857144, "grad_norm": 0.18683382868766785, "kl": 0.04754638671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.145033635316128e-07, "loss": 0.0519, "reward": -0.20608131540939212, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.20608131540939212, "reward_after_std": 0.645834144204855, "reward_before_mean": -0.08709462452679873, "reward_before_std": 0.678638924844563, "reward_change_max": 0.0005731135606765747, "reward_change_mean": -0.11898667085915804, "reward_change_min": -0.2649123091250658, "reward_change_std": 0.11509254341945052, "reward_std": 0.6458341591060162, "rewards/cosine_scaled_reward": -0.21021398529410362, "rewards/format_reward": 0.3333333395421505, "step": 185 }, { "advantage_max": 0.8099225126206875, "advantage_mean": 6.208817904251873e-10, "advantage_min": -0.7140725702047348, "advantage_std": 0.5348150469362736, "completion_length": 2959.2083587646484, "epoch": 0.21257142857142858, "grad_norm": 0.20729362964630127, "kl": 0.0452880859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.119553365707802e-07, "loss": -0.0377, "reward": 0.32842851243913174, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.32842851243913174, "reward_after_std": 0.5348150450736284, "reward_before_mean": 0.5684572607278824, "reward_before_std": 0.5207806155085564, "reward_change_max": 0.0, "reward_change_mean": -0.24002870451658964, "reward_change_min": -0.3893031235784292, "reward_change_std": 0.15248640812933445, "reward_std": 0.5348150543868542, "rewards/cosine_scaled_reward": 0.0654786080121994, "rewards/format_reward": 0.4375, "step": 186 }, { "advantage_max": 1.2817695625126362, "advantage_mean": 5.58793539218172e-09, "advantage_min": -0.5976484194397926, "advantage_std": 0.7407633736729622, "completion_length": 3073.5209045410156, "epoch": 0.21371428571428572, "grad_norm": 0.1879832148551941, "kl": 0.0614013671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.093945422764069e-07, "loss": 0.0238, "reward": 0.008961756248027086, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.008961756248027086, "reward_after_std": 0.7407633736729622, "reward_before_mean": 0.1627176369074732, "reward_before_std": 0.7348326202481985, "reward_change_max": 0.00039581209421157837, "reward_change_mean": -0.15375590091571212, "reward_change_min": -0.3458544798195362, "reward_change_std": 0.13292747549712658, "reward_std": 0.7407634034752846, "rewards/cosine_scaled_reward": -0.09572450537234545, "rewards/format_reward": 0.35416666977107525, "step": 187 }, { "advantage_max": 0.8965404964983463, "advantage_mean": 3.7252906315288215e-09, "advantage_min": -0.5984246246516705, "advantage_std": 0.5441897921264172, "completion_length": 3417.5833740234375, "epoch": 0.21485714285714286, "grad_norm": 0.14670825004577637, "kl": 0.0543212890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.068211054579943e-07, "loss": 0.0293, "reward": -0.2816386744379997, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2816386744379997, "reward_after_std": 0.5441897921264172, "reward_before_mean": -0.17270222771912813, "reward_before_std": 0.5559433400630951, "reward_change_max": 0.0, "reward_change_mean": -0.10893645929172635, "reward_change_min": -0.22303851880133152, "reward_change_std": 0.09435471799224615, "reward_std": 0.5441898107528687, "rewards/cosine_scaled_reward": -0.14885111525654793, "rewards/format_reward": 0.12500000186264515, "step": 188 }, { "advantage_max": 0.9566076211631298, "advantage_mean": -8.692343844707295e-09, "advantage_min": -0.8865089789032936, "advantage_std": 0.658950611948967, "completion_length": 2893.2083587646484, "epoch": 0.216, "grad_norm": 0.2866089642047882, "kl": 0.055389404296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.04235151541222e-07, "loss": 0.0578, "reward": 0.28129902854561806, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.28129902854561806, "reward_after_std": 0.6589506343007088, "reward_before_mean": 0.5050256848335266, "reward_before_std": 0.6821137368679047, "reward_change_max": 0.00030149519443511963, "reward_change_mean": -0.22372663486748934, "reward_change_min": -0.3970515187829733, "reward_change_std": 0.15964000718668103, "reward_std": 0.6589506380259991, "rewards/cosine_scaled_reward": 0.0025128244888037443, "rewards/format_reward": 0.5000000074505806, "step": 189 }, { "advantage_max": 0.9095200449228287, "advantage_mean": 4.967053990334591e-09, "advantage_min": -0.642152450978756, "advantage_std": 0.5963238589465618, "completion_length": 2749.854202270508, "epoch": 0.21714285714285714, "grad_norm": 0.15039433538913727, "kl": 0.05670166015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 8.01636806561836e-07, "loss": 0.0434, "reward": 0.15606092661619186, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15606092661619186, "reward_after_std": 0.5963238626718521, "reward_before_mean": 0.3552255127578974, "reward_before_std": 0.6015043295919895, "reward_change_max": 0.0006122663617134094, "reward_change_mean": -0.1991645717062056, "reward_change_min": -0.3571854718029499, "reward_change_std": 0.14616639399901032, "reward_std": 0.5963238701224327, "rewards/cosine_scaled_reward": -0.02030389942228794, "rewards/format_reward": 0.3958333358168602, "step": 190 }, { "advantage_max": 0.8247727155685425, "advantage_mean": -2.1109979653211042e-08, "advantage_min": -0.7755392119288445, "advantage_std": 0.5822130590677261, "completion_length": 2660.5416870117188, "epoch": 0.21828571428571428, "grad_norm": 0.12784256041049957, "kl": 0.063934326171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.990261971595048e-07, "loss": 0.0309, "reward": -0.00962606945540756, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.00962606945540756, "reward_after_std": 0.5822130553424358, "reward_before_mean": 0.15804596740053967, "reward_before_std": 0.6130220293998718, "reward_change_max": 0.00014460831880569458, "reward_change_mean": -0.16767206811346114, "reward_change_min": -0.33540536276996136, "reward_change_std": 0.13217430608347058, "reward_std": 0.582213070243597, "rewards/cosine_scaled_reward": -0.13972701877355576, "rewards/format_reward": 0.4375000149011612, "step": 191 }, { "advantage_max": 1.2215107157826424, "advantage_mean": -6.20881729362921e-09, "advantage_min": -0.6452849954366684, "advantage_std": 0.7480017244815826, "completion_length": 3340.6250610351562, "epoch": 0.21942857142857142, "grad_norm": 0.3335050046443939, "kl": 0.062347412109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.964034505716476e-07, "loss": 0.0813, "reward": -0.30221050325781107, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.30221050325781107, "reward_after_std": 0.7480017244815826, "reward_before_mean": -0.2114100642502308, "reward_before_std": 0.7854423932731152, "reward_change_max": 0.0012423396110534668, "reward_change_mean": -0.09080045018345118, "reward_change_min": -0.2738470807671547, "reward_change_std": 0.11589764233212918, "reward_std": 0.7480017356574535, "rewards/cosine_scaled_reward": -0.2098717025364749, "rewards/format_reward": 0.2083333358168602, "step": 192 }, { "advantage_max": 1.0752473250031471, "advantage_mean": 3.414849569782774e-09, "advantage_min": -0.6735592968761921, "advantage_std": 0.6682540886104107, "completion_length": 3199.5416870117188, "epoch": 0.22057142857142858, "grad_norm": 0.23511099815368652, "kl": 0.05914306640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.93768694627233e-07, "loss": 0.0457, "reward": -0.20098212780430913, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.20098212780430913, "reward_after_std": 0.6682540848851204, "reward_before_mean": -0.08410844672471285, "reward_before_std": 0.6942097879946232, "reward_change_max": 0.00023821741342544556, "reward_change_mean": -0.11687368247658014, "reward_change_min": -0.2730963882058859, "reward_change_std": 0.11029393505305052, "reward_std": 0.6682540886104107, "rewards/cosine_scaled_reward": -0.16705422988161445, "rewards/format_reward": 0.2500000074505806, "step": 193 }, { "advantage_max": 1.537854041904211, "advantage_mean": -1.676380617654516e-08, "advantage_min": -1.0600149035453796, "advantage_std": 1.016943659633398, "completion_length": 2712.604217529297, "epoch": 0.22171428571428572, "grad_norm": 0.4346405565738678, "kl": 0.051605224609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.911220577405484e-07, "loss": 0.0621, "reward": 0.5943349450826645, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5943349450826645, "reward_after_std": 1.0169436447322369, "reward_before_mean": 0.8588595557957888, "reward_before_std": 1.0535069219768047, "reward_change_max": 0.0, "reward_change_mean": -0.2645246204920113, "reward_change_min": -0.5160359237343073, "reward_change_std": 0.21638690726831555, "reward_std": 1.0169436782598495, "rewards/cosine_scaled_reward": 0.2106797732412815, "rewards/format_reward": 0.43750000558793545, "step": 194 }, { "advantage_max": 1.6020993739366531, "advantage_mean": 1.241762692671955e-09, "advantage_min": -0.8898907229304314, "advantage_std": 0.9601521492004395, "completion_length": 2961.791717529297, "epoch": 0.22285714285714286, "grad_norm": 0.45676088333129883, "kl": 0.06585693359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.884636689049422e-07, "loss": 0.0466, "reward": 0.2086570542305708, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2086570542305708, "reward_after_std": 0.9601521715521812, "reward_before_mean": 0.3893347233533859, "reward_before_std": 0.9845021851360798, "reward_change_max": 0.0007052496075630188, "reward_change_mean": -0.18067765980958939, "reward_change_min": -0.42285651341080666, "reward_change_std": 0.16245433315634727, "reward_std": 0.9601521901786327, "rewards/cosine_scaled_reward": -0.013665982987731695, "rewards/format_reward": 0.41666667722165585, "step": 195 }, { "advantage_max": 0.8580480068922043, "advantage_mean": 2.0489097085629737e-08, "advantage_min": -0.8607277423143387, "advantage_std": 0.6565283834934235, "completion_length": 3221.437530517578, "epoch": 0.224, "grad_norm": 0.23737263679504395, "kl": 0.081298828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.857936576865356e-07, "loss": 0.037, "reward": -0.0247703914064914, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0247703914064914, "reward_after_std": 0.6565283834934235, "reward_before_mean": 0.139747841283679, "reward_before_std": 0.7164971679449081, "reward_change_max": 0.000530630350112915, "reward_change_mean": -0.16451821709051728, "reward_change_min": -0.33033014088869095, "reward_change_std": 0.14811254106462002, "reward_std": 0.656528402119875, "rewards/cosine_scaled_reward": -0.07595941610634327, "rewards/format_reward": 0.29166667722165585, "step": 196 }, { "advantage_max": 1.764219969511032, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -1.1842006593942642, "advantage_std": 1.1066013649106026, "completion_length": 2460.9375610351562, "epoch": 0.22514285714285714, "grad_norm": 0.38473206758499146, "kl": 0.09130859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.831121542179086e-07, "loss": 0.0553, "reward": 0.4490340007469058, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4490340007469058, "reward_after_std": 1.1066013276576996, "reward_before_mean": 0.6725151874125004, "reward_before_std": 1.1451293528079987, "reward_change_max": 0.0007219910621643066, "reward_change_mean": -0.22348116897046566, "reward_change_min": -0.47838449850678444, "reward_change_std": 0.19713882636278868, "reward_std": 1.1066013760864735, "rewards/cosine_scaled_reward": 0.12792425928637385, "rewards/format_reward": 0.41666668094694614, "step": 197 }, { "advantage_max": 1.3422429114580154, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.7631508782505989, "advantage_std": 0.7719403244554996, "completion_length": 2660.9583892822266, "epoch": 0.22628571428571428, "grad_norm": 0.1971741020679474, "kl": 0.093994140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.804192891917571e-07, "loss": 0.0259, "reward": -0.002327847760170698, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.002327847760170698, "reward_after_std": 0.7719403244554996, "reward_before_mean": 0.14502623851876706, "reward_before_std": 0.7764476202428341, "reward_change_max": 0.002315118908882141, "reward_change_mean": -0.14735408034175634, "reward_change_min": -0.2832381669431925, "reward_change_std": 0.11318755429238081, "reward_std": 0.7719403579831123, "rewards/cosine_scaled_reward": -0.06290354859083891, "rewards/format_reward": 0.2708333358168602, "step": 198 }, { "advantage_max": 0.9793026782572269, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.6699462532997131, "advantage_std": 0.6215485241264105, "completion_length": 2692.6250610351562, "epoch": 0.22742857142857142, "grad_norm": 0.30110254883766174, "kl": 0.10107421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.777151938545235e-07, "loss": -0.0384, "reward": -0.162913522683084, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.162913522683084, "reward_after_std": 0.6215485315769911, "reward_before_mean": -0.03506151307374239, "reward_before_std": 0.6429448090493679, "reward_change_max": 0.0008990466594696045, "reward_change_mean": -0.1278520030900836, "reward_change_min": -0.27448153495788574, "reward_change_std": 0.11330182058736682, "reward_std": 0.6215485371649265, "rewards/cosine_scaled_reward": -0.14253076119348407, "rewards/format_reward": 0.2500000074505806, "step": 199 }, { "advantage_max": 1.2570538967847824, "advantage_mean": -1.8626453046621805e-08, "advantage_min": -0.8688595369458199, "advantage_std": 0.8239178322255611, "completion_length": 1922.500015258789, "epoch": 0.22857142857142856, "grad_norm": 0.2225765585899353, "kl": 0.074127197265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.75e-07, "loss": -0.0031, "reward": 0.5477036349475384, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5477036349475384, "reward_after_std": 0.8239178247749805, "reward_before_mean": 0.8122347816824913, "reward_before_std": 0.8378441445529461, "reward_change_max": 0.0011024102568626404, "reward_change_mean": -0.2645311541855335, "reward_change_min": -0.5093743037432432, "reward_change_std": 0.19762188661843538, "reward_std": 0.8239178508520126, "rewards/cosine_scaled_reward": 0.1769507210701704, "rewards/format_reward": 0.45833334140479565, "step": 200 }, { "advantage_max": 1.2604795657098293, "advantage_mean": -7.450581929191458e-09, "advantage_min": -0.689516369253397, "advantage_std": 0.7160300742834806, "completion_length": 1837.520896911621, "epoch": 0.2297142857142857, "grad_norm": 0.22883553802967072, "kl": 0.083221435546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.72273839962904e-07, "loss": -0.0249, "reward": 0.7685513235628605, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7685513235628605, "reward_after_std": 0.7160300686955452, "reward_before_mean": 1.0796052142977715, "reward_before_std": 0.6542752701789141, "reward_change_max": 0.0012431815266609192, "reward_change_mean": -0.31105391355231404, "reward_change_min": -0.4878144096583128, "reward_change_std": 0.1846129735931754, "reward_std": 0.7160300984978676, "rewards/cosine_scaled_reward": 0.2898026071488857, "rewards/format_reward": 0.5000000018626451, "step": 201 }, { "advantage_max": 0.9671787768602371, "advantage_mean": 9.934107814135729e-09, "advantage_min": -0.8516362234950066, "advantage_std": 0.6367514543235302, "completion_length": 2345.3333587646484, "epoch": 0.23085714285714284, "grad_norm": 0.32464098930358887, "kl": 0.11572265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.695368466124296e-07, "loss": 0.0502, "reward": 0.650133672170341, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.650133672170341, "reward_after_std": 0.6367514468729496, "reward_before_mean": 0.9495372585952282, "reward_before_std": 0.6133211851119995, "reward_change_max": 0.0006561949849128723, "reward_change_mean": -0.2994035603478551, "reward_change_min": -0.46214500814676285, "reward_change_std": 0.18847965169698, "reward_std": 0.6367514543235302, "rewards/cosine_scaled_reward": 0.2143519576638937, "rewards/format_reward": 0.5208333488553762, "step": 202 }, { "advantage_max": 1.233177587389946, "advantage_mean": 0.0, "advantage_min": -0.7137967683374882, "advantage_std": 0.732858944684267, "completion_length": 2891.666748046875, "epoch": 0.232, "grad_norm": 0.29693952202796936, "kl": 0.133636474609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.667891533457718e-07, "loss": 0.0438, "reward": 0.07162884995341301, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07162884995341301, "reward_after_std": 0.7328589595854282, "reward_before_mean": 0.2396958488970995, "reward_before_std": 0.7335174567997456, "reward_change_max": 0.0003054216504096985, "reward_change_mean": -0.1680669877678156, "reward_change_min": -0.33523451536893845, "reward_change_std": 0.13505257479846478, "reward_std": 0.7328589782118797, "rewards/cosine_scaled_reward": -0.046818746253848076, "rewards/format_reward": 0.3333333358168602, "step": 203 }, { "advantage_max": 1.1867965795099735, "advantage_mean": -2.2351742678949904e-08, "advantage_min": -0.9047318547964096, "advantage_std": 0.7859960682690144, "completion_length": 2404.7291870117188, "epoch": 0.23314285714285715, "grad_norm": 0.2781248986721039, "kl": 0.1199951171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.640308940816239e-07, "loss": 0.0034, "reward": 0.24108587577939034, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24108587577939034, "reward_after_std": 0.785996101796627, "reward_before_mean": 0.44456504518166184, "reward_before_std": 0.8118856996297836, "reward_change_max": 0.001353539526462555, "reward_change_mean": -0.20347917266190052, "reward_change_min": -0.4074098691344261, "reward_change_std": 0.16620528232306242, "reward_std": 0.7859961241483688, "rewards/cosine_scaled_reward": -0.06938415579497814, "rewards/format_reward": 0.5833333395421505, "step": 204 }, { "advantage_max": 1.0401730574667454, "advantage_mean": -1.862645648831318e-09, "advantage_min": -1.1077677197754383, "advantage_std": 0.754533164203167, "completion_length": 2429.041702270508, "epoch": 0.2342857142857143, "grad_norm": 0.2373577058315277, "kl": 0.109527587890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.612622032536507e-07, "loss": -0.0185, "reward": 0.614743210375309, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.614743210375309, "reward_after_std": 0.7545331679284573, "reward_before_mean": 0.9034174915868789, "reward_before_std": 0.7777533903717995, "reward_change_max": 0.00044680386781692505, "reward_change_mean": -0.2886743023991585, "reward_change_min": -0.466442683711648, "reward_change_std": 0.2007216578349471, "reward_std": 0.7545331940054893, "rewards/cosine_scaled_reward": 0.2642087498679757, "rewards/format_reward": 0.37500000558793545, "step": 205 }, { "advantage_max": 1.5882083512842655, "advantage_mean": 5.27749466350258e-09, "advantage_min": -0.9246660247445107, "advantage_std": 0.9547283947467804, "completion_length": 3065.0208740234375, "epoch": 0.23542857142857143, "grad_norm": 0.4029029905796051, "kl": 0.130126953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.584832158039378e-07, "loss": 0.0277, "reward": 0.12429846322629601, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12429846322629601, "reward_after_std": 0.9547284096479416, "reward_before_mean": 0.28654644452035427, "reward_before_std": 0.9803737215697765, "reward_change_max": 0.0002094656229019165, "reward_change_mean": -0.16224796697497368, "reward_change_min": -0.38078949600458145, "reward_change_std": 0.14915704168379307, "reward_std": 0.954728439450264, "rewards/cosine_scaled_reward": -0.023393452633172274, "rewards/format_reward": 0.33333334513008595, "step": 206 }, { "advantage_max": 1.2513058297336102, "advantage_mean": 6.829699084054397e-09, "advantage_min": -0.8400289416313171, "advantage_std": 0.7795267142355442, "completion_length": 3104.687545776367, "epoch": 0.23657142857142857, "grad_norm": 0.34264588356018066, "kl": 0.15802001953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.556940671764124e-07, "loss": 0.0258, "reward": -0.11594610224710777, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11594610224710777, "reward_after_std": 0.779526736587286, "reward_before_mean": 0.01044432818889618, "reward_before_std": 0.81078165397048, "reward_change_max": 0.0007428675889968872, "reward_change_mean": -0.1263904278166592, "reward_change_min": -0.2810728922486305, "reward_change_std": 0.12395834270864725, "reward_std": 0.7795267626643181, "rewards/cosine_scaled_reward": -0.171861179638654, "rewards/format_reward": 0.35416667349636555, "step": 207 }, { "advantage_max": 0.9631505236029625, "advantage_mean": -2.4835268563894175e-08, "advantage_min": -0.5785660371184349, "advantage_std": 0.5737410057336092, "completion_length": 2533.8958435058594, "epoch": 0.2377142857142857, "grad_norm": 0.25822460651397705, "kl": 0.1262054443359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.528948933102438e-07, "loss": 0.0129, "reward": 0.28071308752987534, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.28071308752987534, "reward_after_std": 0.573741003870964, "reward_before_mean": 0.5029651252552867, "reward_before_std": 0.5481108240783215, "reward_change_max": 0.000742785632610321, "reward_change_mean": -0.22225204668939114, "reward_change_min": -0.3636954799294472, "reward_change_std": 0.14049053937196732, "reward_std": 0.5737410224974155, "rewards/cosine_scaled_reward": 0.02231588400900364, "rewards/format_reward": 0.4583333395421505, "step": 208 }, { "advantage_max": 1.3076799623668194, "advantage_mean": -8.071462553882469e-09, "advantage_min": -0.9464341886341572, "advantage_std": 0.8475919738411903, "completion_length": 2527.6250610351562, "epoch": 0.23885714285714285, "grad_norm": 0.3208506405353546, "kl": 0.147216796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.500858306332172e-07, "loss": 0.0315, "reward": 0.17765184119343758, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17765184119343758, "reward_after_std": 0.8475919738411903, "reward_before_mean": 0.3619390018284321, "reward_before_std": 0.8807680755853653, "reward_change_max": 0.0023267194628715515, "reward_change_mean": -0.18428719555959105, "reward_change_min": -0.3779211286455393, "reward_change_std": 0.15464282128959894, "reward_std": 0.8475919961929321, "rewards/cosine_scaled_reward": -0.03778048907406628, "rewards/format_reward": 0.43750000931322575, "step": 209 }, { "advantage_max": 0.9974051788449287, "advantage_mean": 1.6763806787167823e-08, "advantage_min": -0.6103468053042889, "advantage_std": 0.5925644375383854, "completion_length": 2656.9583740234375, "epoch": 0.24, "grad_norm": 0.264523446559906, "kl": 0.1265869140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.472670160550848e-07, "loss": 0.0331, "reward": 0.015497580636292696, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.015497580636292696, "reward_after_std": 0.5925644524395466, "reward_before_mean": 0.180371415510308, "reward_before_std": 0.5836010649800301, "reward_change_max": 0.0019322484731674194, "reward_change_mean": -0.1648738132789731, "reward_change_min": -0.31565588526427746, "reward_change_std": 0.1200838522054255, "reward_std": 0.5925644710659981, "rewards/cosine_scaled_reward": -0.06606429233215749, "rewards/format_reward": 0.31250000558793545, "step": 210 }, { "advantage_max": 0.9585663378238678, "advantage_mean": 1.8626439834967812e-09, "advantage_min": -0.583470031619072, "advantage_std": 0.5566437281668186, "completion_length": 2041.8958587646484, "epoch": 0.24114285714285713, "grad_norm": 0.2638063132762909, "kl": 0.119232177734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.444385869608921e-07, "loss": 0.0161, "reward": 0.17763055628165603, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17763055628165603, "reward_after_std": 0.5566437244415283, "reward_before_mean": 0.3766655183862895, "reward_before_std": 0.5256081186234951, "reward_change_max": 0.0007008612155914307, "reward_change_mean": -0.19903494883328676, "reward_change_min": -0.32790557853877544, "reward_change_std": 0.12746887607499957, "reward_std": 0.5566437393426895, "rewards/cosine_scaled_reward": 0.011249412782490253, "rewards/format_reward": 0.35416667349636555, "step": 211 }, { "advantage_max": 0.9081764183938503, "advantage_mean": -2.2041301450670403e-08, "advantage_min": -0.8862413987517357, "advantage_std": 0.6810885891318321, "completion_length": 2510.625030517578, "epoch": 0.2422857142857143, "grad_norm": 0.6100391149520874, "kl": 0.16290283203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.416006812042827e-07, "loss": 0.0911, "reward": 0.16298072412610054, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16298072412610054, "reward_after_std": 0.6810886077582836, "reward_before_mean": 0.3630702644586563, "reward_before_std": 0.7299932427704334, "reward_change_max": 0.0006130561232566833, "reward_change_mean": -0.20008953846991062, "reward_change_min": -0.3803048748522997, "reward_change_std": 0.1654650866985321, "reward_std": 0.6810886263847351, "rewards/cosine_scaled_reward": -0.04763155058026314, "rewards/format_reward": 0.4583333432674408, "step": 212 }, { "advantage_max": 1.3156131207942963, "advantage_mean": 1.428027990302283e-08, "advantage_min": -0.6308629438281059, "advantage_std": 0.723580788820982, "completion_length": 2582.4167404174805, "epoch": 0.24342857142857144, "grad_norm": 0.2914482057094574, "kl": 0.18359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.387534371007797e-07, "loss": 0.0045, "reward": 0.299114229157567, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.299114229157567, "reward_after_std": 0.7235807813704014, "reward_before_mean": 0.5082722287625074, "reward_before_std": 0.6857242975383997, "reward_change_max": 0.0005275234580039978, "reward_change_mean": -0.20915796281769872, "reward_change_min": -0.34778287820518017, "reward_change_std": 0.1361712720245123, "reward_std": 0.7235807925462723, "rewards/cosine_scaled_reward": 0.014552779495716095, "rewards/format_reward": 0.4791666753590107, "step": 213 }, { "advantage_max": 1.4342169389128685, "advantage_mean": -3.539025950072272e-08, "advantage_min": -0.9107595607638359, "advantage_std": 0.9286076985299587, "completion_length": 2521.729232788086, "epoch": 0.24457142857142858, "grad_norm": 0.46148914098739624, "kl": 0.16448974609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.358969934210438e-07, "loss": 0.0652, "reward": 0.4896641212981194, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4896641212981194, "reward_after_std": 0.928607702255249, "reward_before_mean": 0.7356860414147377, "reward_before_std": 0.940785601735115, "reward_change_max": 6.170570850372314e-05, "reward_change_mean": -0.2460219245404005, "reward_change_min": -0.5083503555506468, "reward_change_std": 0.20198071282356977, "reward_std": 0.9286077208817005, "rewards/cosine_scaled_reward": 0.05534301046282053, "rewards/format_reward": 0.6250000111758709, "step": 214 }, { "advantage_max": 0.7866075076162815, "advantage_mean": 4.346172255420555e-09, "advantage_min": -0.6270756237208843, "advantage_std": 0.5026369281113148, "completion_length": 2542.8958892822266, "epoch": 0.24571428571428572, "grad_norm": 0.17121727764606476, "kl": 0.17706298828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.330314893841101e-07, "loss": 0.0284, "reward": 0.18317086435854435, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18317086435854435, "reward_after_std": 0.5026369243860245, "reward_before_mean": 0.3898000605404377, "reward_before_std": 0.4934913255274296, "reward_change_max": 0.0, "reward_change_mean": -0.2066291905939579, "reward_change_min": -0.33523484133183956, "reward_change_std": 0.1286360565572977, "reward_std": 0.5026369467377663, "rewards/cosine_scaled_reward": -0.08634997345507145, "rewards/format_reward": 0.5625000055879354, "step": 215 }, { "advantage_max": 1.3609366118907928, "advantage_mean": -1.7695129722605785e-08, "advantage_min": -0.8636116236448288, "advantage_std": 0.825423177331686, "completion_length": 2282.5000534057617, "epoch": 0.24685714285714286, "grad_norm": 0.4734250605106354, "kl": 0.1982421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.301570646506027e-07, "loss": 0.0523, "reward": 0.5696587795391679, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5696587795391679, "reward_after_std": 0.8254231549799442, "reward_before_mean": 0.8348532042000443, "reward_before_std": 0.8018013909459114, "reward_change_max": 0.0007250010967254639, "reward_change_mean": -0.265194421634078, "reward_change_min": -0.46227551624178886, "reward_change_std": 0.18133168667554855, "reward_std": 0.8254231549799442, "rewards/cosine_scaled_reward": 0.09450992941856384, "rewards/format_reward": 0.6458333432674408, "step": 216 }, { "advantage_max": 1.6428455114364624, "advantage_mean": 3.1044087300813317e-09, "advantage_min": -1.0729522295296192, "advantage_std": 1.0483962520956993, "completion_length": 2703.0625610351562, "epoch": 0.248, "grad_norm": 0.850352942943573, "kl": 0.2318115234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.27273859315928e-07, "loss": 0.0886, "reward": 0.4065994236152619, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4065994236152619, "reward_after_std": 1.0483962520956993, "reward_before_mean": 0.6274208661634475, "reward_before_std": 1.0900351107120514, "reward_change_max": 0.0, "reward_change_mean": -0.22082144394516945, "reward_change_min": -0.4693372007459402, "reward_change_std": 0.1962579949758947, "reward_std": 1.0483962818980217, "rewards/cosine_scaled_reward": 0.06371042202226818, "rewards/format_reward": 0.5000000093132257, "step": 217 }, { "advantage_max": 1.488426972180605, "advantage_mean": 8.071462442860167e-09, "advantage_min": -0.6265953332185745, "advantage_std": 0.8160282298922539, "completion_length": 2634.979232788086, "epoch": 0.24914285714285714, "grad_norm": 0.5825859308242798, "kl": 0.2198486328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.243820139034464e-07, "loss": -0.0169, "reward": -0.12219872511923313, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12219872511923313, "reward_after_std": 0.8160282485187054, "reward_before_mean": -0.007892501074820757, "reward_before_std": 0.8139440566301346, "reward_change_max": 0.000957004725933075, "reward_change_mean": -0.11430622171610594, "reward_change_min": -0.25531933829188347, "reward_change_std": 0.09790960466489196, "reward_std": 0.8160282634198666, "rewards/cosine_scaled_reward": -0.1810295870527625, "rewards/format_reward": 0.35416666977107525, "step": 218 }, { "advantage_max": 1.321848526597023, "advantage_mean": -1.8626451825376478e-08, "advantage_min": -0.8081910684704781, "advantage_std": 0.8162901289761066, "completion_length": 2413.791717529297, "epoch": 0.2502857142857143, "grad_norm": 1.0675256252288818, "kl": 0.218902587890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.214816693576234e-07, "loss": 0.0992, "reward": 0.42713918164372444, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.42713918164372444, "reward_after_std": 0.816290158778429, "reward_before_mean": 0.6623573517426848, "reward_before_std": 0.8132289610803127, "reward_change_max": 0.00037786364555358887, "reward_change_mean": -0.23521819338202477, "reward_change_min": -0.44588254764676094, "reward_change_std": 0.17330356873571873, "reward_std": 0.8162902072072029, "rewards/cosine_scaled_reward": 0.029095328878611326, "rewards/format_reward": 0.6041666753590107, "step": 219 }, { "advantage_max": 0.8039387799799442, "advantage_mean": 5.587935947293232e-09, "advantage_min": -0.5766046904027462, "advantage_std": 0.5212136022746563, "completion_length": 2948.187545776367, "epoch": 0.25142857142857145, "grad_norm": 0.30458855628967285, "kl": 0.2762451171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.185729670371604e-07, "loss": 0.0206, "reward": -0.2869377384777181, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2869377384777181, "reward_after_std": 0.5212136246263981, "reward_before_mean": -0.1783251942397328, "reward_before_std": 0.5438382588326931, "reward_change_max": 0.001516386866569519, "reward_change_mean": -0.10861254576593637, "reward_change_min": -0.24544375576078892, "reward_change_std": 0.09796424768865108, "reward_std": 0.521213635802269, "rewards/cosine_scaled_reward": -0.2558292653411627, "rewards/format_reward": 0.33333334513008595, "step": 220 }, { "advantage_max": 1.2263987511396408, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.9199577495455742, "advantage_std": 0.8365131057798862, "completion_length": 2126.812530517578, "epoch": 0.25257142857142856, "grad_norm": 0.266436904668808, "kl": 0.2398681640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.156560487081051e-07, "loss": 0.0388, "reward": 0.3955154661089182, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3955154661089182, "reward_after_std": 0.8365130759775639, "reward_before_mean": 0.6298103723675013, "reward_before_std": 0.8753055445849895, "reward_change_max": 0.0, "reward_change_mean": -0.23429487133398652, "reward_change_min": -0.47523627430200577, "reward_change_std": 0.1862175134010613, "reward_std": 0.8365131095051765, "rewards/cosine_scaled_reward": 0.04407182789873332, "rewards/format_reward": 0.541666679084301, "step": 221 }, { "advantage_max": 1.0564912483096123, "advantage_mean": 1.645336702993383e-08, "advantage_min": -0.6158704683184624, "advantage_std": 0.624480914324522, "completion_length": 2522.625030517578, "epoch": 0.2537142857142857, "grad_norm": 0.3862358033657074, "kl": 0.3228759765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.127310565369415e-07, "loss": 0.0364, "reward": 0.36778752878308296, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.36778752878308296, "reward_after_std": 0.6244809105992317, "reward_before_mean": 0.6052828226238489, "reward_before_std": 0.5913751572370529, "reward_change_max": 7.501989603042603e-05, "reward_change_mean": -0.23749527800828218, "reward_change_min": -0.39534972235560417, "reward_change_std": 0.14965202566236258, "reward_std": 0.6244809329509735, "rewards/cosine_scaled_reward": 0.010974757373332977, "rewards/format_reward": 0.5833333395421505, "step": 222 }, { "advantage_max": 1.020553357899189, "advantage_mean": -2.4835269840650653e-08, "advantage_min": -0.8227595277130604, "advantage_std": 0.6716804038733244, "completion_length": 2412.1458740234375, "epoch": 0.25485714285714284, "grad_norm": 0.5055152177810669, "kl": 0.3028564453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.097981330836616e-07, "loss": 0.0041, "reward": 0.3942564968019724, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3942564968019724, "reward_after_std": 0.6716804187744856, "reward_before_mean": 0.6388956569135189, "reward_before_std": 0.6755735836923122, "reward_change_max": 0.0006217509508132935, "reward_change_mean": -0.24463913589715958, "reward_change_min": -0.4473006948828697, "reward_change_std": 0.17215165868401527, "reward_std": 0.6716804262250662, "rewards/cosine_scaled_reward": 0.05903113167732954, "rewards/format_reward": 0.5208333395421505, "step": 223 }, { "advantage_max": 1.4408755004405975, "advantage_mean": -2.42143873285805e-08, "advantage_min": -1.0008728131651878, "advantage_std": 0.954335268586874, "completion_length": 3054.8125762939453, "epoch": 0.256, "grad_norm": 0.8468606472015381, "kl": 0.3365478515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.068574212948169e-07, "loss": 0.0635, "reward": 0.23304745368659496, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23304745368659496, "reward_after_std": 0.9543352723121643, "reward_before_mean": 0.42555781453847885, "reward_before_std": 1.0104116946458817, "reward_change_max": 0.0008555203676223755, "reward_change_mean": -0.19251040183007717, "reward_change_min": -0.44176225923001766, "reward_change_std": 0.18253928748890758, "reward_std": 0.9543353170156479, "rewards/cosine_scaled_reward": 0.04611224588006735, "rewards/format_reward": 0.33333333395421505, "step": 224 }, { "advantage_max": 1.1947003155946732, "advantage_mean": -6.829698695476338e-09, "advantage_min": -0.8866597339510918, "advantage_std": 0.7424304038286209, "completion_length": 2678.333366394043, "epoch": 0.2571428571428571, "grad_norm": 0.5643479228019714, "kl": 0.28375244140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.039090644965509e-07, "loss": 0.0683, "reward": -0.017941302387043834, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.017941302387043834, "reward_after_std": 0.7424304075539112, "reward_before_mean": 0.1313320784829557, "reward_before_std": 0.7683812938630581, "reward_change_max": 0.0003555566072463989, "reward_change_mean": -0.14927338855341077, "reward_change_min": -0.2961339224129915, "reward_change_std": 0.12563783582299948, "reward_std": 0.7424304150044918, "rewards/cosine_scaled_reward": -0.15308398008346558, "rewards/format_reward": 0.43750001676380634, "step": 225 }, { "advantage_max": 1.0854194797575474, "advantage_mean": -4.718701163142214e-08, "advantage_min": -0.8209239356219769, "advantage_std": 0.7022056467831135, "completion_length": 2388.500045776367, "epoch": 0.2582857142857143, "grad_norm": 0.7495775818824768, "kl": 0.28033447265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 7.009532063876148e-07, "loss": -0.0004, "reward": 0.48588972771540284, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.48588972771540284, "reward_after_std": 0.7022056560963392, "reward_before_mean": 0.7460526805371046, "reward_before_std": 0.7004814520478249, "reward_change_max": 0.0, "reward_change_mean": -0.2601629653945565, "reward_change_min": -0.46394457295536995, "reward_change_std": 0.17792795319110155, "reward_std": 0.7022056691348553, "rewards/cosine_scaled_reward": 0.12302631698548794, "rewards/format_reward": 0.5000000093132257, "step": 226 }, { "advantage_max": 1.0397536233067513, "advantage_mean": -3.352761412944716e-08, "advantage_min": -0.8955881148576736, "advantage_std": 0.7049042098224163, "completion_length": 2367.229217529297, "epoch": 0.25942857142857145, "grad_norm": 0.4058659076690674, "kl": 0.32708740234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.979899910323624e-07, "loss": 0.0389, "reward": 0.1919878153130412, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1919878153130412, "reward_after_std": 0.7049042358994484, "reward_before_mean": 0.3909576232545078, "reward_before_std": 0.7350170500576496, "reward_change_max": 0.0009368136525154114, "reward_change_mean": -0.19896983355283737, "reward_change_min": -0.3957909233868122, "reward_change_std": 0.15640341211110353, "reward_std": 0.7049042619764805, "rewards/cosine_scaled_reward": -0.10660453047603369, "rewards/format_reward": 0.6041666865348816, "step": 227 }, { "advantage_max": 0.9636344462633133, "advantage_mean": -1.2417632477834672e-09, "advantage_min": -0.7520710751414299, "advantage_std": 0.649684626609087, "completion_length": 2477.4375228881836, "epoch": 0.26057142857142856, "grad_norm": 0.3583449721336365, "kl": 0.308837890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.950195628537299e-07, "loss": 0.0322, "reward": 0.3669282849878073, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3669282849878073, "reward_after_std": 0.6496846117079258, "reward_before_mean": 0.6083303336054087, "reward_before_std": 0.6587526202201843, "reward_change_max": 0.0, "reward_change_mean": -0.24140203651040792, "reward_change_min": -0.43194261379539967, "reward_change_std": 0.16951258573681116, "reward_std": 0.6496846191585064, "rewards/cosine_scaled_reward": 0.06458183377981186, "rewards/format_reward": 0.4791666679084301, "step": 228 }, { "advantage_max": 0.9497727677226067, "advantage_mean": -1.1796752907855534e-08, "advantage_min": -0.768424890935421, "advantage_std": 0.6308351047337055, "completion_length": 2770.250045776367, "epoch": 0.26171428571428573, "grad_norm": 0.5783478021621704, "kl": 0.39697265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.920420666261961e-07, "loss": 0.0139, "reward": 0.3392603825777769, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3392603825777769, "reward_after_std": 0.6308351121842861, "reward_before_mean": 0.573946432210505, "reward_before_std": 0.6364546306431293, "reward_change_max": 0.0, "reward_change_mean": -0.23468605522066355, "reward_change_min": -0.41741955280303955, "reward_change_std": 0.1597256800159812, "reward_std": 0.6308351270854473, "rewards/cosine_scaled_reward": 0.02655654028058052, "rewards/format_reward": 0.5208333432674408, "step": 229 }, { "advantage_max": 1.4491103775799274, "advantage_mean": 8.692344621863413e-09, "advantage_min": -0.6946474388241768, "advantage_std": 0.8073096983134747, "completion_length": 3137.229248046875, "epoch": 0.26285714285714284, "grad_norm": 0.787334680557251, "kl": 0.40472412109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.890576474687263e-07, "loss": 0.0393, "reward": -0.1388988010585308, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1388988010585308, "reward_after_std": 0.807309702038765, "reward_before_mean": -0.02355245070066303, "reward_before_std": 0.8076146394014359, "reward_change_max": 0.0008121654391288757, "reward_change_mean": -0.11534635070711374, "reward_change_min": -0.2209514919668436, "reward_change_std": 0.0941726085729897, "reward_std": 0.8073097392916679, "rewards/cosine_scaled_reward": -0.1576095640193671, "rewards/format_reward": 0.2916666716337204, "step": 230 }, { "advantage_max": 1.1754492335021496, "advantage_mean": 8.381903143783731e-09, "advantage_min": -0.5930712819099426, "advantage_std": 0.6712705716490746, "completion_length": 2939.854202270508, "epoch": 0.264, "grad_norm": 0.44406428933143616, "kl": 0.36376953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.860664508377001e-07, "loss": 0.017, "reward": 0.05635275738313794, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.05635275738313794, "reward_after_std": 0.6712705604732037, "reward_before_mean": 0.22388296108692884, "reward_before_std": 0.6522968411445618, "reward_change_max": 0.0, "reward_change_mean": -0.16753020323812962, "reward_change_min": -0.327903950586915, "reward_change_std": 0.1249087923206389, "reward_std": 0.671270590275526, "rewards/cosine_scaled_reward": -0.03389184735715389, "rewards/format_reward": 0.29166666977107525, "step": 231 }, { "advantage_max": 0.771553497761488, "advantage_mean": 1.7074247515846963e-08, "advantage_min": -0.4739590361714363, "advantage_std": 0.48398585990071297, "completion_length": 2995.8958740234375, "epoch": 0.2651428571428571, "grad_norm": 0.7374857068061829, "kl": 0.3756103515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.83068622519821e-07, "loss": 0.0013, "reward": -0.3441154833417386, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3441154833417386, "reward_after_std": 0.48398585245013237, "reward_before_mean": -0.2438699882477522, "reward_before_std": 0.49863067269325256, "reward_change_max": 0.000510483980178833, "reward_change_mean": -0.10024550277739763, "reward_change_min": -0.2315848134458065, "reward_change_std": 0.0897470386698842, "reward_std": 0.48398587107658386, "rewards/cosine_scaled_reward": -0.2573516573756933, "rewards/format_reward": 0.27083333767950535, "step": 232 }, { "advantage_max": 1.0884613990783691, "advantage_mean": -1.738468857759301e-08, "advantage_min": -0.7914083432406187, "advantage_std": 0.6945225261151791, "completion_length": 2533.541732788086, "epoch": 0.2662857142857143, "grad_norm": 0.40767791867256165, "kl": 0.27996826171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.800643086250121e-07, "loss": 0.0328, "reward": 0.20873102080076933, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20873102080076933, "reward_after_std": 0.6945225186645985, "reward_before_mean": 0.4113708536606282, "reward_before_std": 0.6971903201192617, "reward_change_max": 0.0007480159401893616, "reward_change_mean": -0.20263982936739922, "reward_change_min": -0.36456097662448883, "reward_change_std": 0.1446404680609703, "reward_std": 0.6945225335657597, "rewards/cosine_scaled_reward": -0.12764792516827583, "rewards/format_reward": 0.6666666753590107, "step": 233 }, { "advantage_max": 1.2315506115555763, "advantage_mean": 3.1044086745701804e-09, "advantage_min": -0.6851048804819584, "advantage_std": 0.7436628825962543, "completion_length": 2608.208351135254, "epoch": 0.2674285714285714, "grad_norm": 0.4812045693397522, "kl": 0.23345947265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.770536555792944e-07, "loss": 0.0449, "reward": -0.0038607940077781677, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0038607940077781677, "reward_after_std": 0.743662878870964, "reward_before_mean": 0.14880174584686756, "reward_before_std": 0.7588416188955307, "reward_change_max": 0.0006005018949508667, "reward_change_mean": -0.1526625594124198, "reward_change_min": -0.3456360902637243, "reward_change_std": 0.1304946723394096, "reward_std": 0.743662878870964, "rewards/cosine_scaled_reward": -0.12351579079404473, "rewards/format_reward": 0.3958333358168602, "step": 234 }, { "advantage_max": 1.8652660697698593, "advantage_mean": -8.692344177774203e-09, "advantage_min": -0.9754194021224976, "advantage_std": 1.0703791454434395, "completion_length": 2342.958396911621, "epoch": 0.26857142857142857, "grad_norm": 0.655600368976593, "kl": 0.22100830078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.740368101176495e-07, "loss": 0.0349, "reward": 0.5246966313570738, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5246966313570738, "reward_after_std": 1.0703791752457619, "reward_before_mean": 0.7612851969897747, "reward_before_std": 1.0648993849754333, "reward_change_max": 0.0, "reward_change_mean": -0.2365885442122817, "reward_change_min": -0.4519693311303854, "reward_change_std": 0.17281616665422916, "reward_std": 1.0703792348504066, "rewards/cosine_scaled_reward": 0.07855925138574094, "rewards/format_reward": 0.6041666734963655, "step": 235 }, { "advantage_max": 1.4130152836441994, "advantage_mean": -1.0554989493538613e-08, "advantage_min": -1.119653008878231, "advantage_std": 0.948615700006485, "completion_length": 2975.541717529297, "epoch": 0.26971428571428574, "grad_norm": 0.8267731070518494, "kl": 0.21380615234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.710139192768694e-07, "loss": 0.049, "reward": 0.46480933332350105, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.46480933332350105, "reward_after_std": 0.9486157409846783, "reward_before_mean": 0.7081692619249225, "reward_before_std": 0.9944500625133514, "reward_change_max": 0.00048495084047317505, "reward_change_mean": -0.24335988890379667, "reward_change_min": -0.46494733169674873, "reward_change_std": 0.19382535014301538, "reward_std": 0.9486157894134521, "rewards/cosine_scaled_reward": 0.1040846067480743, "rewards/format_reward": 0.5000000111758709, "step": 236 }, { "advantage_max": 0.9904080554842949, "advantage_mean": 2.7318796114172983e-08, "advantage_min": -0.897821705788374, "advantage_std": 0.6906596831977367, "completion_length": 2672.312545776367, "epoch": 0.27085714285714285, "grad_norm": 0.3355805575847626, "kl": 0.21453857421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.679851303883891e-07, "loss": 0.0263, "reward": 0.26410793140530586, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26410793140530586, "reward_after_std": 0.6906596347689629, "reward_before_mean": 0.48194941505789757, "reward_before_std": 0.7241276651620865, "reward_change_max": 0.0002012401819229126, "reward_change_mean": -0.2178414580412209, "reward_change_min": -0.4010373577475548, "reward_change_std": 0.1635338426567614, "reward_std": 0.690659660845995, "rewards/cosine_scaled_reward": 0.03264136612415314, "rewards/format_reward": 0.41666667722165585, "step": 237 }, { "advantage_max": 1.2825810834765434, "advantage_mean": 4.346172255420555e-09, "advantage_min": -1.0097664818167686, "advantage_std": 0.8716474920511246, "completion_length": 2741.2083740234375, "epoch": 0.272, "grad_norm": 1.5649930238723755, "kl": 0.2686767578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.649505910711058e-07, "loss": 0.1054, "reward": 0.2588945508468896, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2588945508468896, "reward_after_std": 0.8716474920511246, "reward_before_mean": 0.4643253441900015, "reward_before_std": 0.9199245795607567, "reward_change_max": 0.00028133392333984375, "reward_change_mean": -0.20543078426271677, "reward_change_min": -0.4312341585755348, "reward_change_std": 0.1798837324604392, "reward_std": 0.8716475069522858, "rewards/cosine_scaled_reward": -0.038670673966407776, "rewards/format_reward": 0.5416666846722364, "step": 238 }, { "advantage_max": 0.9123195372521877, "advantage_mean": -1.9868215517249155e-08, "advantage_min": -0.4897281564772129, "advantage_std": 0.5429513230919838, "completion_length": 2292.6458892822266, "epoch": 0.27314285714285713, "grad_norm": 0.20218530297279358, "kl": 0.212615966796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.619104492241847e-07, "loss": 0.0146, "reward": 0.5889002867043018, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5889002867043018, "reward_after_std": 0.5429513193666935, "reward_before_mean": 0.877927228808403, "reward_before_std": 0.4769926071166992, "reward_change_max": 0.0, "reward_change_mean": -0.28902695421129465, "reward_change_min": -0.46014387905597687, "reward_change_std": 0.1717733102850616, "reward_std": 0.542951337993145, "rewards/cosine_scaled_reward": 0.14729693718254566, "rewards/format_reward": 0.5833333395421505, "step": 239 }, { "advantage_max": 0.9248532876372337, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.5908396989107132, "advantage_std": 0.545099601149559, "completion_length": 2806.604217529297, "epoch": 0.2742857142857143, "grad_norm": 0.4481170177459717, "kl": 0.3870849609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.588648530198504e-07, "loss": 0.0359, "reward": -0.07827135361731052, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.07827135361731052, "reward_after_std": 0.5450996086001396, "reward_before_mean": 0.07239316776394844, "reward_before_std": 0.5371088832616806, "reward_change_max": 0.0, "reward_change_mean": -0.15066451393067837, "reward_change_min": -0.27413977682590485, "reward_change_std": 0.1044732634909451, "reward_std": 0.5450996272265911, "rewards/cosine_scaled_reward": -0.24505342729389668, "rewards/format_reward": 0.5625000093132257, "step": 240 }, { "advantage_max": 1.1114703305065632, "advantage_mean": -5.898376620461221e-09, "advantage_min": -0.6510047800838947, "advantage_std": 0.6448456346988678, "completion_length": 3002.166717529297, "epoch": 0.2754285714285714, "grad_norm": 0.34608423709869385, "kl": 0.33349609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.558139508961654e-07, "loss": 0.0418, "reward": -0.07989289052784443, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07989289052784443, "reward_after_std": 0.6448456235229969, "reward_before_mean": 0.06071397475898266, "reward_before_std": 0.6436697188764811, "reward_change_max": 0.0034544840455055237, "reward_change_mean": -0.14060684852302074, "reward_change_min": -0.2806492391973734, "reward_change_std": 0.11329286079853773, "reward_std": 0.6448456458747387, "rewards/cosine_scaled_reward": -0.20922635588794947, "rewards/format_reward": 0.47916667349636555, "step": 241 }, { "advantage_max": 1.1428449526429176, "advantage_mean": -3.104410062348961e-10, "advantage_min": -0.6170692853629589, "advantage_std": 0.6422752179205418, "completion_length": 2408.7500610351562, "epoch": 0.2765714285714286, "grad_norm": 0.32288601994514465, "kl": 0.3619384765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.527578915497951e-07, "loss": 0.0402, "reward": 0.2951185256242752, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2951185256242752, "reward_after_std": 0.6422752365469933, "reward_before_mean": 0.5139434542506933, "reward_before_std": 0.6053459383547306, "reward_change_max": 0.0, "reward_change_mean": -0.21882489789277315, "reward_change_min": -0.33864388801157475, "reward_change_std": 0.1315562268719077, "reward_std": 0.6422752439975739, "rewards/cosine_scaled_reward": -0.1180282924324274, "rewards/format_reward": 0.7500000149011612, "step": 242 }, { "advantage_max": 1.3674786761403084, "advantage_mean": -2.359350548264416e-08, "advantage_min": -0.7977042905986309, "advantage_std": 0.8004893995821476, "completion_length": 2858.166748046875, "epoch": 0.2777142857142857, "grad_norm": 0.4662991464138031, "kl": 0.38592529296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.496968239287603e-07, "loss": 0.0429, "reward": 0.4232206456363201, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4232206456363201, "reward_after_std": 0.8004894070327282, "reward_before_mean": 0.6595686723594554, "reward_before_std": 0.7805854938924313, "reward_change_max": 0.0008308589458465576, "reward_change_mean": -0.2363480133935809, "reward_change_min": -0.43160959146916866, "reward_change_std": 0.16202430799603462, "reward_std": 0.8004894331097603, "rewards/cosine_scaled_reward": 0.02770098950713873, "rewards/format_reward": 0.6041666809469461, "step": 243 }, { "advantage_max": 1.3175344914197922, "advantage_mean": -3.4148489036489593e-09, "advantage_min": -0.9045217782258987, "advantage_std": 0.7941471636295319, "completion_length": 2850.687545776367, "epoch": 0.27885714285714286, "grad_norm": 0.7452641725540161, "kl": 0.378173828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.466308972251785e-07, "loss": 0.0024, "reward": 0.2798166205175221, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2798166205175221, "reward_after_std": 0.7941471934318542, "reward_before_mean": 0.48845770955085754, "reward_before_std": 0.8019105568528175, "reward_change_max": 0.00035975128412246704, "reward_change_mean": -0.20864112069830298, "reward_change_min": -0.3843183424323797, "reward_change_std": 0.1547684259712696, "reward_std": 0.7941471934318542, "rewards/cosine_scaled_reward": 0.004645538050681353, "rewards/format_reward": 0.47916667722165585, "step": 244 }, { "advantage_max": 1.0654522515833378, "advantage_mean": 9.313226023710541e-10, "advantage_min": -0.6830060258507729, "advantage_std": 0.6581321209669113, "completion_length": 3216.2709045410156, "epoch": 0.28, "grad_norm": 0.4428803026676178, "kl": 0.5015869140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.435602608679916e-07, "loss": 0.0617, "reward": -0.09936166135594249, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09936166135594249, "reward_after_std": 0.6581321209669113, "reward_before_mean": 0.04015257302671671, "reward_before_std": 0.6739694178104401, "reward_change_max": 0.0002557337284088135, "reward_change_mean": -0.13951422786340117, "reward_change_min": -0.30711502954363823, "reward_change_std": 0.12101027090102434, "reward_std": 0.658132154494524, "rewards/cosine_scaled_reward": -0.14659039117395878, "rewards/format_reward": 0.33333333767950535, "step": 245 }, { "advantage_max": 1.4215035066008568, "advantage_mean": -4.346172144398253e-09, "advantage_min": -0.9086656384170055, "advantage_std": 0.8594731651246548, "completion_length": 2984.854217529297, "epoch": 0.28114285714285714, "grad_norm": 1.1912707090377808, "kl": 0.4974365234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.404850645156841e-07, "loss": 0.1081, "reward": 0.27343960758298635, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.27343960758298635, "reward_after_std": 0.8594731576740742, "reward_before_mean": 0.47582032438367605, "reward_before_std": 0.8642517514526844, "reward_change_max": 0.00031065940856933594, "reward_change_mean": -0.2023807168006897, "reward_change_min": -0.3816883508116007, "reward_change_std": 0.15361210703849792, "reward_std": 0.8594731912016869, "rewards/cosine_scaled_reward": -0.012089846655726433, "rewards/format_reward": 0.5000000074505806, "step": 246 }, { "advantage_max": 1.1784251481294632, "advantage_mean": -7.450580596923828e-09, "advantage_min": -0.7006541565060616, "advantage_std": 0.7098598964512348, "completion_length": 3132.3334045410156, "epoch": 0.2822857142857143, "grad_norm": 0.5006141066551208, "kl": 0.468994140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.374054580489873e-07, "loss": 0.0462, "reward": -0.08464969601482153, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08464969601482153, "reward_after_std": 0.7098598703742027, "reward_before_mean": 0.05323531664907932, "reward_before_std": 0.7246571853756905, "reward_change_max": 0.0006433352828025818, "reward_change_mean": -0.1378850331529975, "reward_change_min": -0.3256298862397671, "reward_change_std": 0.12156745418906212, "reward_std": 0.7098599150776863, "rewards/cosine_scaled_reward": -0.1817156784236431, "rewards/format_reward": 0.41666668094694614, "step": 247 }, { "advantage_max": 0.9952891990542412, "advantage_mean": -1.0554990104161277e-08, "advantage_min": -0.8652937412261963, "advantage_std": 0.6502857394516468, "completion_length": 2878.4583892822266, "epoch": 0.2834285714285714, "grad_norm": 0.6091745495796204, "kl": 0.4501953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.343215915635761e-07, "loss": 0.0183, "reward": 0.3772900849580765, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3772900849580765, "reward_after_std": 0.6502857394516468, "reward_before_mean": 0.6193997673690319, "reward_before_std": 0.6534117143601179, "reward_change_max": 0.0009594112634658813, "reward_change_mean": -0.24210964026860893, "reward_change_min": -0.3878302574157715, "reward_change_std": 0.1620875052176416, "reward_std": 0.6502857431769371, "rewards/cosine_scaled_reward": 0.05969984829425812, "rewards/format_reward": 0.5000000018626451, "step": 248 }, { "advantage_max": 1.2759136632084846, "advantage_mean": 1.3348957939030015e-08, "advantage_min": -0.8566323257982731, "advantage_std": 0.7767435237765312, "completion_length": 2242.2709197998047, "epoch": 0.2845714285714286, "grad_norm": 0.8677136301994324, "kl": 0.2835693359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.31233615362752e-07, "loss": 0.0774, "reward": 0.7364492556080222, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7364492556080222, "reward_after_std": 0.7767435275018215, "reward_before_mean": 1.0415291688404977, "reward_before_std": 0.7477922700345516, "reward_change_max": 0.0, "reward_change_mean": -0.30507988668978214, "reward_change_min": -0.4802358113229275, "reward_change_std": 0.18868014216423035, "reward_std": 0.776743546128273, "rewards/cosine_scaled_reward": 0.16659791581332684, "rewards/format_reward": 0.7083333395421505, "step": 249 }, { "advantage_max": 1.3246539905667305, "advantage_mean": -1.8626447051417472e-09, "advantage_min": -0.7587595582008362, "advantage_std": 0.7704361565411091, "completion_length": 2559.354232788086, "epoch": 0.2857142857142857, "grad_norm": 0.7462479472160339, "kl": 0.500732421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.281416799501187e-07, "loss": 0.0368, "reward": 0.35601929062977433, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.35601929062977433, "reward_after_std": 0.7704361379146576, "reward_before_mean": 0.5795669555664062, "reward_before_std": 0.7518536355346441, "reward_change_max": 8.866935968399048e-05, "reward_change_mean": -0.22354764956980944, "reward_change_min": -0.4075309745967388, "reward_change_std": 0.15171909425407648, "reward_std": 0.7704361565411091, "rewards/cosine_scaled_reward": -0.07479987479746342, "rewards/format_reward": 0.7291666753590107, "step": 250 }, { "advantage_max": 1.4129582643508911, "advantage_mean": 2.7939682512023722e-09, "advantage_min": -0.8298484236001968, "advantage_std": 0.8299252316355705, "completion_length": 1986.1875610351562, "epoch": 0.28685714285714287, "grad_norm": 0.38087886571884155, "kl": 0.32958984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.25045936022246e-07, "loss": 0.009, "reward": 0.5232213628478348, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5232213628478348, "reward_after_std": 0.8299252390861511, "reward_before_mean": 0.7778614014387131, "reward_before_std": 0.8030033111572266, "reward_change_max": 0.0008496716618537903, "reward_change_mean": -0.2546399999409914, "reward_change_min": -0.4694129191339016, "reward_change_std": 0.1713426373898983, "reward_std": 0.8299252465367317, "rewards/cosine_scaled_reward": -0.01731932070106268, "rewards/format_reward": 0.812500013038516, "step": 251 }, { "advantage_max": 0.9795961454510689, "advantage_mean": -1.8005570812107408e-08, "advantage_min": -0.6381446719169617, "advantage_std": 0.6092146001756191, "completion_length": 2771.8541946411133, "epoch": 0.288, "grad_norm": 0.8943673968315125, "kl": 0.4296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.219465344613258e-07, "loss": 0.0024, "reward": 0.22168607264757156, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22168607264757156, "reward_after_std": 0.609214598312974, "reward_before_mean": 0.4316192679107189, "reward_before_std": 0.5957893319427967, "reward_change_max": 0.0005483701825141907, "reward_change_mean": -0.20993319479748607, "reward_change_min": -0.3657133523374796, "reward_change_std": 0.14107152435462922, "reward_std": 0.6092146243900061, "rewards/cosine_scaled_reward": -0.05502370838075876, "rewards/format_reward": 0.541666679084301, "step": 252 }, { "advantage_max": 1.1931979358196259, "advantage_mean": -1.1102230246251565e-16, "advantage_min": -0.9128836393356323, "advantage_std": 0.7648354470729828, "completion_length": 2504.479263305664, "epoch": 0.28914285714285715, "grad_norm": 0.48425501585006714, "kl": 0.3848876953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.188436263278172e-07, "loss": 0.035, "reward": 0.6141545444261283, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6141545444261283, "reward_after_std": 0.7648354396224022, "reward_before_mean": 0.8976643066853285, "reward_before_std": 0.7630242742598057, "reward_change_max": 0.00016620010137557983, "reward_change_mean": -0.2835097722709179, "reward_change_min": -0.4889257773756981, "reward_change_std": 0.1873344276100397, "reward_std": 0.7648354545235634, "rewards/cosine_scaled_reward": 0.06341548450291157, "rewards/format_reward": 0.7708333469927311, "step": 253 }, { "advantage_max": 1.1527259647846222, "advantage_mean": 1.5522042984272844e-08, "advantage_min": -0.7479028627276421, "advantage_std": 0.6986750327050686, "completion_length": 3060.500045776367, "epoch": 0.29028571428571426, "grad_norm": 0.5801138281822205, "kl": 0.551025390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.157373628530852e-07, "loss": 0.075, "reward": 0.08959226682782173, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08959226682782173, "reward_after_std": 0.6986750550568104, "reward_before_mean": 0.26555439457297325, "reward_before_std": 0.7030624710023403, "reward_change_max": 0.0006458908319473267, "reward_change_mean": -0.17596213333308697, "reward_change_min": -0.30556071549654007, "reward_change_std": 0.12727333791553974, "reward_std": 0.698675062507391, "rewards/cosine_scaled_reward": -0.04430613946169615, "rewards/format_reward": 0.35416667349636555, "step": 254 }, { "advantage_max": 1.4135619774460793, "advantage_mean": 1.9868215628271457e-08, "advantage_min": -0.906343974173069, "advantage_std": 0.9186242371797562, "completion_length": 2971.2500610351562, "epoch": 0.2914285714285714, "grad_norm": 0.513594388961792, "kl": 0.49725341796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.126278954320294e-07, "loss": 0.0272, "reward": 0.09923725947737694, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09923725947737694, "reward_after_std": 0.9186242371797562, "reward_before_mean": 0.2659471333026886, "reward_before_std": 0.9716958254575729, "reward_change_max": 0.0008112862706184387, "reward_change_mean": -0.1667098170146346, "reward_change_min": -0.4284713827073574, "reward_change_std": 0.17021398525685072, "reward_std": 0.9186242707073689, "rewards/cosine_scaled_reward": -0.09619312267750502, "rewards/format_reward": 0.45833334513008595, "step": 255 }, { "advantage_max": 1.2006757967174053, "advantage_mean": 1.5522042984272844e-08, "advantage_min": -0.7429698333144188, "advantage_std": 0.7124652713537216, "completion_length": 2864.562545776367, "epoch": 0.2925714285714286, "grad_norm": 0.705951988697052, "kl": 0.4068603515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.095153756157051e-07, "loss": 0.0465, "reward": -0.010957542806863785, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.010957542806863785, "reward_after_std": 0.7124652564525604, "reward_before_mean": 0.14110034401528537, "reward_before_std": 0.72434863448143, "reward_change_max": 0.0006079450249671936, "reward_change_mean": -0.1520578470081091, "reward_change_min": -0.2988077960908413, "reward_change_std": 0.12131620571017265, "reward_std": 0.7124652713537216, "rewards/cosine_scaled_reward": -0.14819983951747417, "rewards/format_reward": 0.43750001676380634, "step": 256 }, { "advantage_max": 1.307850755751133, "advantage_mean": -2.8560559584001055e-08, "advantage_min": -1.1963625326752663, "advantage_std": 0.9193693548440933, "completion_length": 3058.229248046875, "epoch": 0.2937142857142857, "grad_norm": 0.8376688957214355, "kl": 0.414306640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.06399955103937e-07, "loss": 0.0659, "reward": 0.4931573048233986, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4931573048233986, "reward_after_std": 0.9193693995475769, "reward_before_mean": 0.746170642785728, "reward_before_std": 0.975673820823431, "reward_change_max": 0.000555872917175293, "reward_change_mean": -0.25301333982497454, "reward_change_min": -0.4786340221762657, "reward_change_std": 0.20542677212506533, "reward_std": 0.9193694181740284, "rewards/cosine_scaled_reward": 0.10225197300314903, "rewards/format_reward": 0.5416666809469461, "step": 257 }, { "advantage_max": 1.2001386135816574, "advantage_mean": -8.6923440667519e-09, "advantage_min": -1.15541011095047, "advantage_std": 0.8693474680185318, "completion_length": 3050.666717529297, "epoch": 0.2948571428571429, "grad_norm": 1.055458426475525, "kl": 0.4342041015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.032817857379256e-07, "loss": 0.0921, "reward": 0.3292580866254866, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3292580866254866, "reward_after_std": 0.8693474680185318, "reward_before_mean": 0.5526071041822433, "reward_before_std": 0.9303718060255051, "reward_change_max": 0.001217663288116455, "reward_change_mean": -0.22334902989678085, "reward_change_min": -0.4339667744934559, "reward_change_std": 0.19128504721447825, "reward_std": 0.8693475164473057, "rewards/cosine_scaled_reward": -0.03619644418358803, "rewards/format_reward": 0.625000013038516, "step": 258 }, { "advantage_max": 1.120631866157055, "advantage_mean": -2.1109978376454563e-08, "advantage_min": -0.8829772584140301, "advantage_std": 0.7115774154663086, "completion_length": 2525.187530517578, "epoch": 0.296, "grad_norm": 0.6028103828430176, "kl": 0.396240234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 6.001610194928464e-07, "loss": 0.0221, "reward": 0.2749463734216988, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2749463734216988, "reward_after_std": 0.7115774005651474, "reward_before_mean": 0.4895500782877207, "reward_before_std": 0.7222363092005253, "reward_change_max": 0.0002304166555404663, "reward_change_mean": -0.2146037225611508, "reward_change_min": -0.3983222171664238, "reward_change_std": 0.15764279011636972, "reward_std": 0.7115774191915989, "rewards/cosine_scaled_reward": -0.057308297604322433, "rewards/format_reward": 0.6041666753590107, "step": 259 }, { "advantage_max": 1.420989066362381, "advantage_mean": -3.228584977144067e-08, "advantage_min": -1.085547935217619, "advantage_std": 0.9159971624612808, "completion_length": 2311.1458740234375, "epoch": 0.29714285714285715, "grad_norm": 0.4120553135871887, "kl": 0.37274169921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.97037808470444e-07, "loss": 0.0303, "reward": 0.5744108855724335, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5744108855724335, "reward_after_std": 0.9159971848130226, "reward_before_mean": 0.8391903787851334, "reward_before_std": 0.9414675608277321, "reward_change_max": 0.0, "reward_change_mean": -0.2647794894874096, "reward_change_min": -0.4815485719591379, "reward_change_std": 0.19730475591495633, "reward_std": 0.9159972108900547, "rewards/cosine_scaled_reward": 0.11751184146851301, "rewards/format_reward": 0.6041666734963655, "step": 260 }, { "advantage_max": 1.1634193137288094, "advantage_mean": 1.8626455378090157e-09, "advantage_min": -0.6875870302319527, "advantage_std": 0.6962924711406231, "completion_length": 3096.1250610351562, "epoch": 0.29828571428571427, "grad_norm": 1.0137593746185303, "kl": 0.452880859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.939123048916173e-07, "loss": 0.0791, "reward": -0.030452772043645382, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.030452772043645382, "reward_after_std": 0.6962924636900425, "reward_before_mean": 0.11967078410089016, "reward_before_std": 0.7041243650019169, "reward_change_max": 0.0002564564347267151, "reward_change_mean": -0.1501235319301486, "reward_change_min": -0.33282414823770523, "reward_change_std": 0.12437624577432871, "reward_std": 0.6962924897670746, "rewards/cosine_scaled_reward": -0.19016463123261929, "rewards/format_reward": 0.5000000111758709, "step": 261 }, { "advantage_max": 0.932201087474823, "advantage_mean": -1.0865430222217753e-08, "advantage_min": -0.6653118953108788, "advantage_std": 0.5815522968769073, "completion_length": 2836.8959350585938, "epoch": 0.29942857142857143, "grad_norm": 0.4320916533470154, "kl": 0.439208984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.907846610890011e-07, "loss": 0.0444, "reward": 0.04765827255323529, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04765827255323529, "reward_after_std": 0.581552293151617, "reward_before_mean": 0.22312193596735597, "reward_before_std": 0.5816116891801357, "reward_change_max": 0.00012744218111038208, "reward_change_mean": -0.17546369042247534, "reward_change_min": -0.33822631277143955, "reward_change_std": 0.1261238707229495, "reward_std": 0.581552304327488, "rewards/cosine_scaled_reward": -0.21135569922626019, "rewards/format_reward": 0.645833345130086, "step": 262 }, { "advantage_max": 0.8561205826699734, "advantage_mean": 2.79396769609086e-09, "advantage_min": -0.6569207943975925, "advantage_std": 0.5411554276943207, "completion_length": 2723.8542404174805, "epoch": 0.30057142857142854, "grad_norm": 0.3876248300075531, "kl": 0.4888916015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.87655029499542e-07, "loss": 0.0433, "reward": 0.10675198398530483, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10675198398530483, "reward_after_std": 0.5411554463207722, "reward_before_mean": 0.2983070253394544, "reward_before_std": 0.538859274238348, "reward_change_max": 0.0, "reward_change_mean": -0.1915550483390689, "reward_change_min": -0.3442938830703497, "reward_change_std": 0.1286425469443202, "reward_std": 0.5411554835736752, "rewards/cosine_scaled_reward": -0.16334648989140987, "rewards/format_reward": 0.6250000093132257, "step": 263 }, { "advantage_max": 1.1704243831336498, "advantage_mean": -6.984919309616089e-09, "advantage_min": -0.860997948795557, "advantage_std": 0.7329257298260927, "completion_length": 2875.604278564453, "epoch": 0.3017142857142857, "grad_norm": 0.6105872392654419, "kl": 0.516357421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.845235626570683e-07, "loss": 0.0637, "reward": 0.32769849291071296, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32769849291071296, "reward_after_std": 0.7329257223755121, "reward_before_mean": 0.551668926840648, "reward_before_std": 0.7303816229104996, "reward_change_max": 0.0002583116292953491, "reward_change_mean": -0.223970428109169, "reward_change_min": -0.39334858767688274, "reward_change_std": 0.15903489384800196, "reward_std": 0.7329257316887379, "rewards/cosine_scaled_reward": -0.036665546242147684, "rewards/format_reward": 0.6250000055879354, "step": 264 }, { "advantage_max": 1.2543099895119667, "advantage_mean": -2.8560560472179475e-08, "advantage_min": -0.8995270654559135, "advantage_std": 0.7639308758080006, "completion_length": 2455.6250610351562, "epoch": 0.3028571428571429, "grad_norm": 0.5805307626724243, "kl": 0.43011474609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.813904131848564e-07, "loss": 0.0123, "reward": 0.6702125794254243, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6702125794254243, "reward_after_std": 0.7639308832585812, "reward_before_mean": 0.9619456650689244, "reward_before_std": 0.7384607084095478, "reward_change_max": 2.650916576385498e-05, "reward_change_mean": -0.2917331103235483, "reward_change_min": -0.4562958050519228, "reward_change_std": 0.1784222200512886, "reward_std": 0.7639308944344521, "rewards/cosine_scaled_reward": 0.06430615484714508, "rewards/format_reward": 0.8333333507180214, "step": 265 }, { "advantage_max": 1.134152166545391, "advantage_mean": -1.0554989493538613e-08, "advantage_min": -0.6526764146983624, "advantage_std": 0.6940972171723843, "completion_length": 2838.7500610351562, "epoch": 0.304, "grad_norm": 0.8360715508460999, "kl": 0.50238037109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.78255733788191e-07, "loss": 0.0221, "reward": 0.07992671895772219, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07992671895772219, "reward_after_std": 0.6940972171723843, "reward_before_mean": 0.2539265248924494, "reward_before_std": 0.6986450217664242, "reward_change_max": 0.00019249320030212402, "reward_change_mean": -0.17399981152266264, "reward_change_min": -0.3619354199618101, "reward_change_std": 0.13899551844224334, "reward_std": 0.6940972171723843, "rewards/cosine_scaled_reward": -0.18553675338625908, "rewards/format_reward": 0.6250000037252903, "step": 266 }, { "advantage_max": 1.0816873833537102, "advantage_mean": -6.208815683805824e-10, "advantage_min": -0.8192684203386307, "advantage_std": 0.7078514359891415, "completion_length": 3280.750030517578, "epoch": 0.30514285714285716, "grad_norm": 0.803869366645813, "kl": 0.64794921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.751196772469237e-07, "loss": 0.0459, "reward": 0.04755986901000142, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04755986901000142, "reward_after_std": 0.7078514508903027, "reward_before_mean": 0.21775454888120294, "reward_before_std": 0.7377180494368076, "reward_change_max": 9.12100076675415e-05, "reward_change_mean": -0.1701946770772338, "reward_change_min": -0.3484746851027012, "reward_change_std": 0.1423499807715416, "reward_std": 0.707851480692625, "rewards/cosine_scaled_reward": -0.15153939928859472, "rewards/format_reward": 0.5208333488553762, "step": 267 }, { "advantage_max": 1.2278237342834473, "advantage_mean": -4.9670536017565325e-09, "advantage_min": -0.7800223752856255, "advantage_std": 0.7465534321963787, "completion_length": 2668.291717529297, "epoch": 0.3062857142857143, "grad_norm": 0.8479388356208801, "kl": 0.4451904296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.71982396408026e-07, "loss": 0.0637, "reward": 0.19763406133279204, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19763406133279204, "reward_after_std": 0.7465534135699272, "reward_before_mean": 0.3912855861708522, "reward_before_std": 0.7482039630413055, "reward_change_max": 4.7944486141204834e-05, "reward_change_mean": -0.1936515048146248, "reward_change_min": -0.3798055090010166, "reward_change_std": 0.14345189882442355, "reward_std": 0.7465534433722496, "rewards/cosine_scaled_reward": -0.08560722845140845, "rewards/format_reward": 0.5625000167638063, "step": 268 }, { "advantage_max": 1.0387192778289318, "advantage_mean": -8.071462831438225e-09, "advantage_min": -0.6435651443898678, "advantage_std": 0.6317089274525642, "completion_length": 2722.104232788086, "epoch": 0.30742857142857144, "grad_norm": 0.8026515245437622, "kl": 0.4918212890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.688440441781398e-07, "loss": 0.0134, "reward": 0.12071692384779453, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12071692384779453, "reward_after_std": 0.6317089200019836, "reward_before_mean": 0.3073125630617142, "reward_before_std": 0.6241258643567562, "reward_change_max": 0.0020045414566993713, "reward_change_mean": -0.18659565411508083, "reward_change_min": -0.357524149119854, "reward_change_std": 0.13216951489448547, "reward_std": 0.6317089349031448, "rewards/cosine_scaled_reward": -0.15884371474385262, "rewards/format_reward": 0.6250000149011612, "step": 269 }, { "advantage_max": 1.619797796010971, "advantage_mean": -4.346172144398253e-09, "advantage_min": -1.0483565255999565, "advantage_std": 0.9902137815952301, "completion_length": 2775.4584350585938, "epoch": 0.30857142857142855, "grad_norm": 1.6801332235336304, "kl": 0.435302734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.657047735161255e-07, "loss": 0.0923, "reward": 0.3881141534075141, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3881141534075141, "reward_after_std": 0.9902137517929077, "reward_before_mean": 0.6052586287260056, "reward_before_std": 1.0116727985441685, "reward_change_max": 0.0014529228210449219, "reward_change_mean": -0.21714445110410452, "reward_change_min": -0.4292067475616932, "reward_change_std": 0.17398629896342754, "reward_std": 0.9902137778699398, "rewards/cosine_scaled_reward": 0.0005459561944007874, "rewards/format_reward": 0.6041666753590107, "step": 270 }, { "advantage_max": 1.1621160581707954, "advantage_mean": -2.1109978431965715e-08, "advantage_min": -0.78480114787817, "advantage_std": 0.7370627373456955, "completion_length": 2557.9584350585938, "epoch": 0.3097142857142857, "grad_norm": 0.9561665654182434, "kl": 0.3992919921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.625647374256061e-07, "loss": 0.0602, "reward": 0.7530359327793121, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7530359327793121, "reward_after_std": 0.7370627522468567, "reward_before_mean": 1.0662704948335886, "reward_before_std": 0.7159419469535351, "reward_change_max": 0.0, "reward_change_mean": -0.3132345573976636, "reward_change_min": -0.5165784861892462, "reward_change_std": 0.20659221708774567, "reward_std": 0.7370627820491791, "rewards/cosine_scaled_reward": 0.18938525021076202, "rewards/format_reward": 0.687500013038516, "step": 271 }, { "advantage_max": 1.1364346630871296, "advantage_mean": -1.0554989438027462e-08, "advantage_min": -0.9597682431340218, "advantage_std": 0.757855137810111, "completion_length": 3181.166717529297, "epoch": 0.31085714285714283, "grad_norm": 0.7340137958526611, "kl": 0.583740234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.594240889475106e-07, "loss": 0.0624, "reward": 0.27623227424919605, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.27623227424919605, "reward_after_std": 0.7578551527112722, "reward_before_mean": 0.4916242975741625, "reward_before_std": 0.7839965410530567, "reward_change_max": 0.00044462084770202637, "reward_change_mean": -0.21539202518761158, "reward_change_min": -0.41614267230033875, "reward_change_std": 0.16441212827339768, "reward_std": 0.7578551601618528, "rewards/cosine_scaled_reward": -0.02502118982374668, "rewards/format_reward": 0.5416666846722364, "step": 272 }, { "advantage_max": 1.160659622400999, "advantage_mean": -8.692343955729598e-09, "advantage_min": -0.7852242887020111, "advantage_std": 0.7171707265079021, "completion_length": 2839.791717529297, "epoch": 0.312, "grad_norm": 0.6136277318000793, "kl": 0.52923583984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.562829811526154e-07, "loss": 0.0605, "reward": 0.32113747112452984, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32113747112452984, "reward_after_std": 0.7171707190573215, "reward_before_mean": 0.544464853592217, "reward_before_std": 0.7175061888992786, "reward_change_max": 0.00020026415586471558, "reward_change_mean": -0.22332737175747752, "reward_change_min": -0.40839548222720623, "reward_change_std": 0.15226724604144692, "reward_std": 0.7171707265079021, "rewards/cosine_scaled_reward": -0.019434254616498947, "rewards/format_reward": 0.5833333414047956, "step": 273 }, { "advantage_max": 1.3966002464294434, "advantage_mean": -1.4280279958533981e-08, "advantage_min": -0.931741576641798, "advantage_std": 0.8832690231502056, "completion_length": 1894.0625228881836, "epoch": 0.31314285714285717, "grad_norm": 0.6674050688743591, "kl": 0.3251953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.531415671340826e-07, "loss": 0.0401, "reward": 0.6757131940685213, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6757131940685213, "reward_after_std": 0.8832690194249153, "reward_before_mean": 0.9622493050992489, "reward_before_std": 0.8776302300393581, "reward_change_max": 0.0, "reward_change_mean": -0.2865361422300339, "reward_change_min": -0.5044957622885704, "reward_change_std": 0.20020386390388012, "reward_std": 0.8832690566778183, "rewards/cosine_scaled_reward": 0.09570799674838781, "rewards/format_reward": 0.7708333469927311, "step": 274 }, { "advantage_max": 1.2382594272494316, "advantage_mean": -4.9360098897111016e-08, "advantage_min": -0.9473253339529037, "advantage_std": 0.790430873632431, "completion_length": 2379.7500610351562, "epoch": 0.3142857142857143, "grad_norm": 0.36985093355178833, "kl": 0.4437255859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.5e-07, "loss": 0.0343, "reward": 0.6091940072365105, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6091940072365105, "reward_after_std": 0.7904308661818504, "reward_before_mean": 0.8897625058889389, "reward_before_std": 0.7824394963681698, "reward_change_max": 0.00024212151765823364, "reward_change_mean": -0.28056854754686356, "reward_change_min": -0.4758047293871641, "reward_change_std": 0.18870245106518269, "reward_std": 0.790430873632431, "rewards/cosine_scaled_reward": 0.05946458503603935, "rewards/format_reward": 0.7708333469927311, "step": 275 }, { "advantage_max": 1.417747512459755, "advantage_mean": -1.1175871117430347e-08, "advantage_min": -0.9425608813762665, "advantage_std": 0.8622519038617611, "completion_length": 2382.729248046875, "epoch": 0.31542857142857145, "grad_norm": 0.8259385228157043, "kl": 0.54046630859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.468584328659172e-07, "loss": 0.046, "reward": 0.7064427239820361, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7064427239820361, "reward_after_std": 0.8622519224882126, "reward_before_mean": 0.9980673622339964, "reward_before_std": 0.8322956692427397, "reward_change_max": 0.0002689957618713379, "reward_change_mean": -0.2916246075183153, "reward_change_min": -0.5032038614153862, "reward_change_std": 0.19721571169793606, "reward_std": 0.8622519448399544, "rewards/cosine_scaled_reward": 0.10320031549781561, "rewards/format_reward": 0.791666692122817, "step": 276 }, { "advantage_max": 1.1373561695218086, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -0.6842050366103649, "advantage_std": 0.6619889885187149, "completion_length": 2434.854232788086, "epoch": 0.31657142857142856, "grad_norm": 0.961823582649231, "kl": 0.55804443359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.437170188473847e-07, "loss": 0.0257, "reward": 0.37808058643713593, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.37808058643713593, "reward_after_std": 0.6619890034198761, "reward_before_mean": 0.6139687532559037, "reward_before_std": 0.6310353614389896, "reward_change_max": 0.0, "reward_change_mean": -0.2358881626278162, "reward_change_min": -0.3869531359523535, "reward_change_std": 0.14327027555555105, "reward_std": 0.6619890369474888, "rewards/cosine_scaled_reward": -0.08884896896779537, "rewards/format_reward": 0.7916666716337204, "step": 277 }, { "advantage_max": 1.0359967350959778, "advantage_mean": -3.1044081749698194e-09, "advantage_min": -0.547761145979166, "advantage_std": 0.577915258705616, "completion_length": 2198.916748046875, "epoch": 0.3177142857142857, "grad_norm": 0.6416994333267212, "kl": 0.5369873046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.405759110524894e-07, "loss": 0.0278, "reward": 0.6939112609252334, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6939112609252334, "reward_after_std": 0.5779152661561966, "reward_before_mean": 0.9984060376882553, "reward_before_std": 0.4832126758992672, "reward_change_max": 0.0, "reward_change_mean": -0.3044947562739253, "reward_change_min": -0.447370208799839, "reward_change_std": 0.1697270618751645, "reward_std": 0.5779152922332287, "rewards/cosine_scaled_reward": 0.08253634301945567, "rewards/format_reward": 0.8333333414047956, "step": 278 }, { "advantage_max": 1.4574612230062485, "advantage_mean": -2.1730860833013566e-08, "advantage_min": -0.8479081615805626, "advantage_std": 0.880218181759119, "completion_length": 2921.791748046875, "epoch": 0.31885714285714284, "grad_norm": 1.7954747676849365, "kl": 0.614990234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.37435262574394e-07, "loss": -0.0028, "reward": 0.32596746971830726, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.32596746971830726, "reward_after_std": 0.8802181892096996, "reward_before_mean": 0.5379584170877934, "reward_before_std": 0.8841192573308945, "reward_change_max": 0.0, "reward_change_mean": -0.2119909394532442, "reward_change_min": -0.44285766407847404, "reward_change_std": 0.16466339118778706, "reward_std": 0.8802182115614414, "rewards/cosine_scaled_reward": -0.10602080635726452, "rewards/format_reward": 0.7500000149011612, "step": 279 }, { "advantage_max": 1.2174679934978485, "advantage_mean": -1.8626452602532595e-08, "advantage_min": -0.9811263605952263, "advantage_std": 0.8303379565477371, "completion_length": 2376.500087738037, "epoch": 0.32, "grad_norm": 0.7019585967063904, "kl": 0.4969482421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.342952264838747e-07, "loss": 0.0505, "reward": 0.8277103584259748, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8277103584259748, "reward_after_std": 0.8303379639983177, "reward_before_mean": 1.1526767499744892, "reward_before_std": 0.8390437588095665, "reward_change_max": 0.0, "reward_change_mean": -0.32496638409793377, "reward_change_min": -0.5728341620415449, "reward_change_std": 0.21707758866250515, "reward_std": 0.8303379863500595, "rewards/cosine_scaled_reward": 0.17008836346212775, "rewards/format_reward": 0.8125000260770321, "step": 280 }, { "advantage_max": 1.117238812148571, "advantage_mean": -6.20881729362921e-09, "advantage_min": -0.6751887276768684, "advantage_std": 0.6338129118084908, "completion_length": 3367.354217529297, "epoch": 0.3211428571428571, "grad_norm": 0.8130143880844116, "kl": 0.66796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.311559558218603e-07, "loss": 0.0399, "reward": -0.0589434988796711, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0589434988796711, "reward_after_std": 0.633812915533781, "reward_before_mean": 0.0878406809642911, "reward_before_std": 0.6230056919157505, "reward_change_max": 0.0, "reward_change_mean": -0.14678418170660734, "reward_change_min": -0.26374930888414383, "reward_change_std": 0.10998814273625612, "reward_std": 0.6338129602372646, "rewards/cosine_scaled_reward": -0.17482966743409634, "rewards/format_reward": 0.4375000037252903, "step": 281 }, { "advantage_max": 1.2622554525732994, "advantage_mean": -2.7939678237665078e-08, "advantage_min": -0.9061468839645386, "advantage_std": 0.7616865001618862, "completion_length": 2606.020881652832, "epoch": 0.3222857142857143, "grad_norm": 0.5567194819450378, "kl": 0.475067138671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.28017603591974e-07, "loss": 0.0595, "reward": 0.6109224297106266, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6109224297106266, "reward_after_std": 0.7616865150630474, "reward_before_mean": 0.8905543517321348, "reward_before_std": 0.7397468909621239, "reward_change_max": 0.00021518021821975708, "reward_change_mean": -0.2796319294720888, "reward_change_min": -0.481433242559433, "reward_change_std": 0.17854167707264423, "reward_std": 0.7616865336894989, "rewards/cosine_scaled_reward": 0.007777164923027158, "rewards/format_reward": 0.8750000223517418, "step": 282 }, { "advantage_max": 1.7443509474396706, "advantage_mean": 2.1730860888524717e-08, "advantage_min": -1.017472319304943, "advantage_std": 1.005886286497116, "completion_length": 2991.0000610351562, "epoch": 0.32342857142857145, "grad_norm": 1.4104963541030884, "kl": 0.6126708984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.248803227530763e-07, "loss": 0.1037, "reward": 0.41072939755395055, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.41072939755395055, "reward_after_std": 1.0058862939476967, "reward_before_mean": 0.6282010450959206, "reward_before_std": 1.0024794191122055, "reward_change_max": 0.0007800683379173279, "reward_change_mean": -0.21747161448001862, "reward_change_min": -0.42786360532045364, "reward_change_std": 0.1684788903221488, "reward_std": 1.005886323750019, "rewards/cosine_scaled_reward": -0.008816150482743979, "rewards/format_reward": 0.6458333414047956, "step": 283 }, { "advantage_max": 1.3748716935515404, "advantage_mean": -3.166496803652663e-08, "advantage_min": -0.9754731394350529, "advantage_std": 0.9186149425804615, "completion_length": 2677.854232788086, "epoch": 0.32457142857142857, "grad_norm": 1.2485500574111938, "kl": 0.8084716796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.21744266211809e-07, "loss": 0.0764, "reward": 0.4375223647803068, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4375223647803068, "reward_after_std": 0.9186149425804615, "reward_before_mean": 0.6745116095989943, "reward_before_std": 0.9550625048577785, "reward_change_max": 0.00011983513832092285, "reward_change_mean": -0.2369892280548811, "reward_change_min": -0.5025754775851965, "reward_change_std": 0.19382414128631353, "reward_std": 0.9186149574816227, "rewards/cosine_scaled_reward": -0.02732754498720169, "rewards/format_reward": 0.7291666828095913, "step": 284 }, { "advantage_max": 1.153579793870449, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.6998581737279892, "advantage_std": 0.6692559383809566, "completion_length": 2201.9583892822266, "epoch": 0.32571428571428573, "grad_norm": 0.2996189594268799, "kl": 0.35406494140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.186095868151436e-07, "loss": 0.0323, "reward": 0.32276270259171724, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32276270259171724, "reward_after_std": 0.6692559532821178, "reward_before_mean": 0.5466967602260411, "reward_before_std": 0.6370350383222103, "reward_change_max": 0.00048685818910598755, "reward_change_mean": -0.22393404319882393, "reward_change_min": -0.3773474842309952, "reward_change_std": 0.14570614136755466, "reward_std": 0.669255968183279, "rewards/cosine_scaled_reward": -0.14331829687580466, "rewards/format_reward": 0.8333333395421505, "step": 285 }, { "advantage_max": 1.3171608075499535, "advantage_mean": -2.250696307104505e-08, "advantage_min": -0.9609937965869904, "advantage_std": 0.8358764089643955, "completion_length": 2561.8959350585938, "epoch": 0.32685714285714285, "grad_norm": 1.1083446741104126, "kl": 0.522705078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.154764373429315e-07, "loss": 0.0062, "reward": 0.526446558535099, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.526446558535099, "reward_after_std": 0.8358764089643955, "reward_before_mean": 0.7858736389316618, "reward_before_std": 0.8427512384951115, "reward_change_max": 0.0, "reward_change_mean": -0.2594271060079336, "reward_change_min": -0.4767572619020939, "reward_change_std": 0.1822401201352477, "reward_std": 0.835876427590847, "rewards/cosine_scaled_reward": -0.023729851469397545, "rewards/format_reward": 0.8333333507180214, "step": 286 }, { "advantage_max": 0.9248721599578857, "advantage_mean": 2.7318795670083773e-08, "advantage_min": -0.7077672183513641, "advantage_std": 0.5572267882525921, "completion_length": 2091.979232788086, "epoch": 0.328, "grad_norm": 0.4647475481033325, "kl": 0.3912353515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.123449705004581e-07, "loss": 0.0246, "reward": 0.5254754899069667, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5254754899069667, "reward_after_std": 0.5572267957031727, "reward_before_mean": 0.801264937967062, "reward_before_std": 0.5182732734829187, "reward_change_max": 0.0, "reward_change_mean": -0.27578943222761154, "reward_change_min": -0.4128921814262867, "reward_change_std": 0.1566688446328044, "reward_std": 0.5572268068790436, "rewards/cosine_scaled_reward": -0.005617540329694748, "rewards/format_reward": 0.8125000055879354, "step": 287 }, { "advantage_max": 1.504993923008442, "advantage_mean": -1.490116136038111e-08, "advantage_min": -0.8023764491081238, "advantage_std": 0.8677674159407616, "completion_length": 2702.8959197998047, "epoch": 0.3291428571428571, "grad_norm": 1.5587056875228882, "kl": 0.483612060546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.09215338910999e-07, "loss": 0.0009, "reward": 0.45762502774596214, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.45762502774596214, "reward_after_std": 0.867767408490181, "reward_before_mean": 0.6953183701261878, "reward_before_std": 0.842148095369339, "reward_change_max": 0.00029984116554260254, "reward_change_mean": -0.2376933479681611, "reward_change_min": -0.4342462159693241, "reward_change_std": 0.16298356838524342, "reward_std": 0.8677674308419228, "rewards/cosine_scaled_reward": -0.04817415587604046, "rewards/format_reward": 0.7916666828095913, "step": 288 }, { "advantage_max": 0.8705627843737602, "advantage_mean": -1.0554988938427101e-08, "advantage_min": -0.6423959396779537, "advantage_std": 0.5385672096163034, "completion_length": 2132.0208587646484, "epoch": 0.3302857142857143, "grad_norm": 0.8675118684768677, "kl": 0.375274658203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.060876951083828e-07, "loss": 0.0083, "reward": 0.41569878812879324, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.41569878812879324, "reward_after_std": 0.5385672207921743, "reward_before_mean": 0.6696806941181421, "reward_before_std": 0.5084246285259724, "reward_change_max": 0.000632651150226593, "reward_change_mean": -0.2539819087833166, "reward_change_min": -0.4022063910961151, "reward_change_std": 0.15744253154844046, "reward_std": 0.5385672282427549, "rewards/cosine_scaled_reward": -0.029742980375885963, "rewards/format_reward": 0.7291666753590107, "step": 289 }, { "advantage_max": 1.1274751722812653, "advantage_mean": -9.623666974434286e-09, "advantage_min": -0.6674411427229643, "advantage_std": 0.653532151132822, "completion_length": 2767.979248046875, "epoch": 0.3314285714285714, "grad_norm": 0.621363639831543, "kl": 0.64501953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 5.02962191529556e-07, "loss": 0.0539, "reward": 0.3863111804239452, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3863111804239452, "reward_after_std": 0.6535321436822414, "reward_before_mean": 0.6250412613153458, "reward_before_std": 0.6115994676947594, "reward_change_max": 0.0, "reward_change_mean": -0.2387300655245781, "reward_change_min": -0.39923243410885334, "reward_change_std": 0.14787110220640898, "reward_std": 0.6535321548581123, "rewards/cosine_scaled_reward": -0.1458127275109291, "rewards/format_reward": 0.916666679084301, "step": 290 }, { "advantage_max": 1.1834221184253693, "advantage_mean": -1.3348957050851595e-08, "advantage_min": -0.8950243405997753, "advantage_std": 0.7383266054093838, "completion_length": 2805.1459350585938, "epoch": 0.3325714285714286, "grad_norm": 0.7029440402984619, "kl": 0.495849609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.998389805071536e-07, "loss": 0.0462, "reward": 0.3216223921626806, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3216223921626806, "reward_after_std": 0.7383266091346741, "reward_before_mean": 0.5436824802309275, "reward_before_std": 0.7413762956857681, "reward_change_max": 0.0010522156953811646, "reward_change_mean": -0.22206009179353714, "reward_change_min": -0.4207623451948166, "reward_change_std": 0.16112508811056614, "reward_std": 0.7383266165852547, "rewards/cosine_scaled_reward": -0.11357542686164379, "rewards/format_reward": 0.7708333507180214, "step": 291 }, { "advantage_max": 1.0772344693541527, "advantage_mean": -4.346172283176131e-09, "advantage_min": -0.7955384105443954, "advantage_std": 0.6696780174970627, "completion_length": 3134.666748046875, "epoch": 0.33371428571428574, "grad_norm": 0.5410640239715576, "kl": 0.559814453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.967182142620745e-07, "loss": 0.0434, "reward": 0.21674447413533926, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21674447413533926, "reward_after_std": 0.6696780137717724, "reward_before_mean": 0.4219084605574608, "reward_before_std": 0.6723324432969093, "reward_change_max": 0.00036097317934036255, "reward_change_mean": -0.20516397897154093, "reward_change_min": -0.3651042580604553, "reward_change_std": 0.14545214269310236, "reward_std": 0.6696780510246754, "rewards/cosine_scaled_reward": -0.11196244833990932, "rewards/format_reward": 0.6458333376795053, "step": 292 }, { "advantage_max": 0.9164200350642204, "advantage_mean": 1.241763414316921e-09, "advantage_min": -0.9503684192895889, "advantage_std": 0.6595253460109234, "completion_length": 2227.041732788086, "epoch": 0.33485714285714285, "grad_norm": 0.30070415139198303, "kl": 0.222564697265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.93600044896063e-07, "loss": 0.0213, "reward": 0.5309373632189818, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5309373632189818, "reward_after_std": 0.6595253497362137, "reward_before_mean": 0.8075640294700861, "reward_before_std": 0.6770856119692326, "reward_change_max": 0.0, "reward_change_mean": -0.276626655831933, "reward_change_min": -0.46285124868154526, "reward_change_std": 0.18145327549427748, "reward_std": 0.6595253609120846, "rewards/cosine_scaled_reward": -0.04413466341793537, "rewards/format_reward": 0.8958333507180214, "step": 293 }, { "advantage_max": 1.524643950164318, "advantage_mean": -2.1730859334212482e-09, "advantage_min": -0.826868399977684, "advantage_std": 0.8730550743639469, "completion_length": 2852.7709045410156, "epoch": 0.336, "grad_norm": 0.5285487771034241, "kl": 0.40545654296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.904846243842949e-07, "loss": 0.0093, "reward": 0.07707068230956793, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07707068230956793, "reward_after_std": 0.8730550743639469, "reward_before_mean": 0.2330453209578991, "reward_before_std": 0.8745039738714695, "reward_change_max": 0.0013482049107551575, "reward_change_mean": -0.15597465354949236, "reward_change_min": -0.3428476359695196, "reward_change_std": 0.13191541656851768, "reward_std": 0.8730550929903984, "rewards/cosine_scaled_reward": -0.11264400463551283, "rewards/format_reward": 0.4583333432674408, "step": 294 }, { "advantage_max": 1.161498136818409, "advantage_mean": -1.241763691872677e-09, "advantage_min": -0.8748191595077515, "advantage_std": 0.7533379979431629, "completion_length": 3125.562530517578, "epoch": 0.33714285714285713, "grad_norm": 0.5754555463790894, "kl": 0.4957275390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.873721045679706e-07, "loss": 0.0354, "reward": 0.13185544777661562, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13185544777661562, "reward_after_std": 0.7533380128443241, "reward_before_mean": 0.3152529150247574, "reward_before_std": 0.7833084501326084, "reward_change_max": 0.0, "reward_change_mean": -0.18339747935533524, "reward_change_min": -0.3779276143759489, "reward_change_std": 0.15217753848992288, "reward_std": 0.753338024020195, "rewards/cosine_scaled_reward": -0.061123548075556755, "rewards/format_reward": 0.43750000931322575, "step": 295 }, { "advantage_max": 1.2943150848150253, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.880298025906086, "advantage_std": 0.7986906431615353, "completion_length": 3260.416748046875, "epoch": 0.3382857142857143, "grad_norm": 0.671694278717041, "kl": 0.439208984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.842626371469149e-07, "loss": 0.0402, "reward": 0.15541864559054375, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.15541864559054375, "reward_after_std": 0.7986906580626965, "reward_before_mean": 0.33835546113550663, "reward_before_std": 0.8092953190207481, "reward_change_max": 0.0004698038101196289, "reward_change_mean": -0.18293681275099516, "reward_change_min": -0.3585068881511688, "reward_change_std": 0.14620533026754856, "reward_std": 0.7986906915903091, "rewards/cosine_scaled_reward": -0.16415561363101006, "rewards/format_reward": 0.6666666753590107, "step": 296 }, { "advantage_max": 1.5055525228381157, "advantage_mean": 5.898376342905465e-09, "advantage_min": -0.9016059339046478, "advantage_std": 0.8884069547057152, "completion_length": 3415.7708740234375, "epoch": 0.3394285714285714, "grad_norm": 0.6050586104393005, "kl": 0.425537109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.811563736721829e-07, "loss": 0.0424, "reward": 0.21233353205025196, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21233353205025196, "reward_after_std": 0.8884069658815861, "reward_before_mean": 0.3986882194876671, "reward_before_std": 0.8947096578776836, "reward_change_max": 0.0012565329670906067, "reward_change_mean": -0.18635463900864124, "reward_change_min": -0.36084929667413235, "reward_change_std": 0.14199561532586813, "reward_std": 0.8884070031344891, "rewards/cosine_scaled_reward": -0.061072577722370625, "rewards/format_reward": 0.5208333469927311, "step": 297 }, { "advantage_max": 0.9044896215200424, "advantage_mean": 4.035731665519293e-09, "advantage_min": -0.681348480284214, "advantage_std": 0.5914136283099651, "completion_length": 2602.958366394043, "epoch": 0.3405714285714286, "grad_norm": 0.7789437174797058, "kl": 0.287017822265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.780534655386743e-07, "loss": -0.0101, "reward": 0.35978633305057883, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.35978633305057883, "reward_after_std": 0.5914136320352554, "reward_before_mean": 0.6013914185459726, "reward_before_std": 0.5782536268234253, "reward_change_max": 0.0011856183409690857, "reward_change_mean": -0.24160510301589966, "reward_change_min": -0.416112856939435, "reward_change_std": 0.16169053129851818, "reward_std": 0.5914136357605457, "rewards/cosine_scaled_reward": -0.02222095988690853, "rewards/format_reward": 0.6458333432674408, "step": 298 }, { "advantage_max": 1.3797761723399162, "advantage_mean": -4.066775383115839e-08, "advantage_min": -1.0554508119821548, "advantage_std": 0.8842962495982647, "completion_length": 3059.2709350585938, "epoch": 0.3417142857142857, "grad_norm": 0.4524586796760559, "kl": 0.31640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.749540639777539e-07, "loss": -0.0135, "reward": 0.4166666514938697, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4166666514938697, "reward_after_std": 0.8842962794005871, "reward_before_mean": 0.6501839645206928, "reward_before_std": 0.9087841995060444, "reward_change_max": 0.0001943930983543396, "reward_change_mean": -0.23351731058210135, "reward_change_min": -0.4361800532788038, "reward_change_std": 0.1814221441745758, "reward_std": 0.8842963092029095, "rewards/cosine_scaled_reward": 0.012591954320669174, "rewards/format_reward": 0.6250000167638063, "step": 299 }, { "advantage_max": 1.2542919218540192, "advantage_mean": 2.4835269951672956e-09, "advantage_min": -0.6836499199271202, "advantage_std": 0.75607680529356, "completion_length": 3302.354248046875, "epoch": 0.34285714285714286, "grad_norm": 0.5583862066268921, "kl": 0.353759765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.7185832004988133e-07, "loss": 0.0641, "reward": -0.07509482838213444, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.07509482838213444, "reward_after_std": 0.7560768201947212, "reward_before_mean": 0.060925360303372145, "reward_before_std": 0.7722468562424183, "reward_change_max": 0.0010308846831321716, "reward_change_mean": -0.1360201993957162, "reward_change_min": -0.33642248064279556, "reward_change_std": 0.12964419461786747, "reward_std": 0.7560768499970436, "rewards/cosine_scaled_reward": -0.16745399590581656, "rewards/format_reward": 0.39583334885537624, "step": 300 }, { "advantage_max": 1.1226837486028671, "advantage_mean": -1.0554989271494009e-08, "advantage_min": -0.7226800471544266, "advantage_std": 0.6880081184208393, "completion_length": 2656.979202270508, "epoch": 0.344, "grad_norm": 1.0143342018127441, "kl": 0.244873046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.68766384637248e-07, "loss": 0.0539, "reward": 0.18314046040177345, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.18314046040177345, "reward_after_std": 0.6880081221461296, "reward_before_mean": 0.37871653120964766, "reward_before_std": 0.6911210939288139, "reward_change_max": 0.0, "reward_change_mean": -0.1955761080607772, "reward_change_min": -0.3798739053308964, "reward_change_std": 0.1449863687157631, "reward_std": 0.6880081407725811, "rewards/cosine_scaled_reward": -0.10230840416625142, "rewards/format_reward": 0.5833333395421505, "step": 301 }, { "advantage_max": 1.2821213752031326, "advantage_mean": -3.973642981325298e-08, "advantage_min": -0.7915182262659073, "advantage_std": 0.7827672846615314, "completion_length": 2826.0000762939453, "epoch": 0.34514285714285714, "grad_norm": 0.5162174701690674, "kl": 0.3029327392578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.656784084364238e-07, "loss": 0.0426, "reward": 0.43309174850583076, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.43309174850583076, "reward_after_std": 0.7827672772109509, "reward_before_mean": 0.6745897931978106, "reward_before_std": 0.7768698744475842, "reward_change_max": 0.00018189847469329834, "reward_change_mean": -0.24149805679917336, "reward_change_min": -0.4414714686572552, "reward_change_std": 0.16948643419891596, "reward_std": 0.7827673144638538, "rewards/cosine_scaled_reward": 0.04562821961008012, "rewards/format_reward": 0.583333345130086, "step": 302 }, { "advantage_max": 1.203477919101715, "advantage_mean": -1.1796752963366686e-08, "advantage_min": -1.067990280687809, "advantage_std": 0.829220212996006, "completion_length": 2699.3125915527344, "epoch": 0.3462857142857143, "grad_norm": 1.1074665784835815, "kl": 0.2890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.6259454195101267e-07, "loss": -0.0382, "reward": 0.7349976152181625, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7349976152181625, "reward_after_std": 0.8292202204465866, "reward_before_mean": 1.0418037474155426, "reward_before_std": 0.8478254750370979, "reward_change_max": 0.0, "reward_change_mean": -0.3068061266094446, "reward_change_min": -0.532234251499176, "reward_change_std": 0.20924520306289196, "reward_std": 0.8292202204465866, "rewards/cosine_scaled_reward": 0.11465185508131981, "rewards/format_reward": 0.8125000223517418, "step": 303 }, { "advantage_max": 1.1113415397703648, "advantage_mean": -1.738468941026028e-08, "advantage_min": -0.7756071053445339, "advantage_std": 0.7286460623145103, "completion_length": 2642.104217529297, "epoch": 0.3474285714285714, "grad_norm": 0.42963743209838867, "kl": 0.28240966796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.59514935484316e-07, "loss": 0.0249, "reward": 0.21103033176041208, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21103033176041208, "reward_after_std": 0.7286460697650909, "reward_before_mean": 0.41298089548945427, "reward_before_std": 0.7496253587305546, "reward_change_max": 0.00042162835597991943, "reward_change_mean": -0.2019505836069584, "reward_change_min": -0.39072100818157196, "reward_change_std": 0.15458550211042166, "reward_std": 0.7286460697650909, "rewards/cosine_scaled_reward": -0.10600956110283732, "rewards/format_reward": 0.6250000074505806, "step": 304 }, { "advantage_max": 0.9595196545124054, "advantage_mean": -1.7384688633104162e-08, "advantage_min": -0.8634162247180939, "advantage_std": 0.6757207736372948, "completion_length": 3239.5834350585938, "epoch": 0.3485714285714286, "grad_norm": 0.33965903520584106, "kl": 0.37896728515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.5643973913200837e-07, "loss": 0.021, "reward": 0.051725201308727264, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.051725201308727264, "reward_after_std": 0.6757207810878754, "reward_before_mean": 0.22745683169341646, "reward_before_std": 0.719194870442152, "reward_change_max": 0.0002442896366119385, "reward_change_mean": -0.1757316291332245, "reward_change_min": -0.3522457033395767, "reward_change_std": 0.14757832884788513, "reward_std": 0.675720788538456, "rewards/cosine_scaled_reward": -0.13627159036695957, "rewards/format_reward": 0.500000013038516, "step": 305 }, { "advantage_max": 1.1726135164499283, "advantage_mean": -3.1664968647149294e-08, "advantage_min": -0.7642268985509872, "advantage_std": 0.7053759954869747, "completion_length": 2803.0833587646484, "epoch": 0.3497142857142857, "grad_norm": 0.3666328191757202, "kl": 0.2911376953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.5336910277482155e-07, "loss": 0.0219, "reward": 0.5246845060028136, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5246845060028136, "reward_after_std": 0.7053759880363941, "reward_before_mean": 0.7904737419448793, "reward_before_std": 0.6732211895287037, "reward_change_max": 0.00019888579845428467, "reward_change_mean": -0.26578925736248493, "reward_change_min": -0.4267906751483679, "reward_change_std": 0.16886506881564856, "reward_std": 0.705375999212265, "rewards/cosine_scaled_reward": 0.04107019305229187, "rewards/format_reward": 0.7083333414047956, "step": 306 }, { "advantage_max": 1.439152605831623, "advantage_mean": -1.1175871394986103e-08, "advantage_min": -1.1323330104351044, "advantage_std": 0.964265838265419, "completion_length": 2949.0416870117188, "epoch": 0.35085714285714287, "grad_norm": 1.266478180885315, "kl": 0.31048583984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.503031760712397e-07, "loss": 0.0817, "reward": 0.5640023471787572, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5640023471787572, "reward_after_std": 0.9642658233642578, "reward_before_mean": 0.8261616267263889, "reward_before_std": 0.9994736909866333, "reward_change_max": 0.00012583285570144653, "reward_change_mean": -0.2621592953801155, "reward_change_min": -0.5008783079683781, "reward_change_std": 0.20539161236956716, "reward_std": 0.9642658494412899, "rewards/cosine_scaled_reward": 0.06933080439921468, "rewards/format_reward": 0.6875000149011612, "step": 307 }, { "advantage_max": 1.1639970354735851, "advantage_mean": -9.468446804383746e-09, "advantage_min": -0.7429619617760181, "advantage_std": 0.7060525380074978, "completion_length": 3407.1250610351562, "epoch": 0.352, "grad_norm": 0.5398004651069641, "kl": 0.40625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.4724210845020494e-07, "loss": 0.0186, "reward": 0.14492211426841095, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14492211426841095, "reward_after_std": 0.7060525380074978, "reward_before_mean": 0.3317896923981607, "reward_before_std": 0.7155218906700611, "reward_change_max": 0.0006226077675819397, "reward_change_mean": -0.18686757795512676, "reward_change_min": -0.352012537419796, "reward_change_std": 0.13838431145995855, "reward_std": 0.7060525640845299, "rewards/cosine_scaled_reward": -0.10493848938494921, "rewards/format_reward": 0.5416666772216558, "step": 308 }, { "advantage_max": 1.3210503160953522, "advantage_mean": -3.1044103399047174e-10, "advantage_min": -0.9297028332948685, "advantage_std": 0.7949141003191471, "completion_length": 3121.416717529297, "epoch": 0.35314285714285715, "grad_norm": 1.1463578939437866, "kl": 0.287353515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.441860491038345e-07, "loss": 0.0524, "reward": 0.3992934077978134, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3992934077978134, "reward_after_std": 0.7949140854179859, "reward_before_mean": 0.6327683683484793, "reward_before_std": 0.7869492806494236, "reward_change_max": 0.00041719526052474976, "reward_change_mean": -0.2334749810397625, "reward_change_min": -0.3965462204068899, "reward_change_std": 0.15955772250890732, "reward_std": 0.7949140928685665, "rewards/cosine_scaled_reward": -0.03778248839080334, "rewards/format_reward": 0.7083333544433117, "step": 309 }, { "advantage_max": 1.3295684456825256, "advantage_mean": -6.208817182606907e-09, "advantage_min": -0.7546710520982742, "advantage_std": 0.7807791829109192, "completion_length": 2624.8334045410156, "epoch": 0.35428571428571426, "grad_norm": 0.3927314877510071, "kl": 0.293670654296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.4113514698014953e-07, "loss": 0.0145, "reward": 0.34071499202400446, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.34071499202400446, "reward_after_std": 0.780779205262661, "reward_before_mean": 0.5604322105646133, "reward_before_std": 0.7707412429153919, "reward_change_max": 6.631016731262207e-06, "reward_change_mean": -0.21971724182367325, "reward_change_min": -0.40014604292809963, "reward_change_std": 0.1491192802786827, "reward_std": 0.7807792238891125, "rewards/cosine_scaled_reward": -0.04270055377855897, "rewards/format_reward": 0.6458333395421505, "step": 310 }, { "advantage_max": 1.1729798540472984, "advantage_mean": -2.1730860583213385e-08, "advantage_min": -1.170724742114544, "advantage_std": 0.8196248821914196, "completion_length": 2766.5000534057617, "epoch": 0.3554285714285714, "grad_norm": 0.3156254291534424, "kl": 0.260223388671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.3808955077581546e-07, "loss": 0.0276, "reward": 0.6994708036072552, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6994708036072552, "reward_after_std": 0.8196248970925808, "reward_before_mean": 1.0011386279948056, "reward_before_std": 0.8453158028423786, "reward_change_max": 0.0006065443158149719, "reward_change_mean": -0.3016678225249052, "reward_change_min": -0.5197629183530807, "reward_change_std": 0.2100884895771742, "reward_std": 0.8196249157190323, "rewards/cosine_scaled_reward": 0.1776526216417551, "rewards/format_reward": 0.6458333414047956, "step": 311 }, { "advantage_max": 1.1915170662105083, "advantage_mean": -2.235174201281609e-08, "advantage_min": -0.8529561161994934, "advantage_std": 0.7519486397504807, "completion_length": 2315.6875381469727, "epoch": 0.3565714285714286, "grad_norm": 0.5464223623275757, "kl": 0.2484893798828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.350494089288943e-07, "loss": -0.0112, "reward": 0.680031955242157, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.680031955242157, "reward_after_std": 0.7519486621022224, "reward_before_mean": 0.9763741809874773, "reward_before_std": 0.73100402392447, "reward_change_max": 0.0004085749387741089, "reward_change_mean": -0.2963422201573849, "reward_change_min": -0.4903582762926817, "reward_change_std": 0.19297243934124708, "reward_std": 0.7519486956298351, "rewards/cosine_scaled_reward": 0.17568707559257746, "rewards/format_reward": 0.6250000111758709, "step": 312 }, { "advantage_max": 1.394209772348404, "advantage_mean": -2.7318796225195285e-08, "advantage_min": -0.5839328169822693, "advantage_std": 0.7712888494133949, "completion_length": 2901.0000534057617, "epoch": 0.3577142857142857, "grad_norm": 0.49047237634658813, "kl": 0.32720947265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.3201486961161093e-07, "loss": 0.045, "reward": 0.5697615533135831, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5697615533135831, "reward_after_std": 0.7712888531386852, "reward_before_mean": 0.8347354661673307, "reward_before_std": 0.7097177393734455, "reward_change_max": 0.0, "reward_change_mean": -0.2649738909676671, "reward_change_min": -0.4325453620404005, "reward_change_std": 0.16076719481498003, "reward_std": 0.7712888643145561, "rewards/cosine_scaled_reward": 0.0736176953651011, "rewards/format_reward": 0.6875000093132257, "step": 313 }, { "advantage_max": 1.1387183368206024, "advantage_mean": -3.0423204344653954e-08, "advantage_min": -0.8505013547837734, "advantage_std": 0.7319958060979843, "completion_length": 2589.8958892822266, "epoch": 0.3588571428571429, "grad_norm": 0.38102486729621887, "kl": 0.23065185546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.2898608072313045e-07, "loss": 0.0224, "reward": 0.6887513920664787, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6887513920664787, "reward_after_std": 0.7319958098232746, "reward_before_mean": 0.9901680648326874, "reward_before_std": 0.7208104282617569, "reward_change_max": 0.0, "reward_change_mean": -0.30141670163720846, "reward_change_min": -0.4891224876046181, "reward_change_std": 0.18802605429664254, "reward_std": 0.7319958359003067, "rewards/cosine_scaled_reward": 0.09925069566816092, "rewards/format_reward": 0.791666679084301, "step": 314 }, { "advantage_max": 1.2496268823742867, "advantage_mean": 1.241763691872677e-09, "advantage_min": -0.7799261212348938, "advantage_std": 0.7311495095491409, "completion_length": 3081.604217529297, "epoch": 0.36, "grad_norm": 0.6908902525901794, "kl": 0.367828369140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.2596318988235037e-07, "loss": 0.0545, "reward": 0.15070660039782524, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15070660039782524, "reward_after_std": 0.7311495132744312, "reward_before_mean": 0.3349191239103675, "reward_before_std": 0.7265293411910534, "reward_change_max": 0.0005373582243919373, "reward_change_mean": -0.1842125072143972, "reward_change_min": -0.3642015140503645, "reward_change_std": 0.13788303453475237, "reward_std": 0.7311495393514633, "rewards/cosine_scaled_reward": -0.10337379078555387, "rewards/format_reward": 0.5416666734963655, "step": 315 }, { "advantage_max": 1.270074326545, "advantage_mean": 1.8626452602532595e-09, "advantage_min": -0.8054039552807808, "advantage_std": 0.7438079752027988, "completion_length": 3420.916717529297, "epoch": 0.36114285714285715, "grad_norm": 0.5454291701316833, "kl": 0.45458984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.2294634442070553e-07, "loss": 0.0151, "reward": -0.16140791401267052, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.16140791401267052, "reward_after_std": 0.7438079826533794, "reward_before_mean": -0.043293423019349575, "reward_before_std": 0.7589707747101784, "reward_change_max": 0.0008183494210243225, "reward_change_mean": -0.11811448354274035, "reward_change_min": -0.27028753608465195, "reward_change_std": 0.11148541839793324, "reward_std": 0.7438079975545406, "rewards/cosine_scaled_reward": -0.24039671290665865, "rewards/format_reward": 0.43750001303851604, "step": 316 }, { "advantage_max": 1.0541828200221062, "advantage_mean": -6.829699306099002e-09, "advantage_min": -0.9681460931897163, "advantage_std": 0.7232431396842003, "completion_length": 3071.1459045410156, "epoch": 0.36228571428571427, "grad_norm": 0.4729823172092438, "kl": 0.359405517578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.1993569137498776e-07, "loss": 0.0297, "reward": 0.40531662479043007, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.40531662479043007, "reward_after_std": 0.7232431620359421, "reward_before_mean": 0.6505545452237129, "reward_before_std": 0.7462910003960133, "reward_change_max": 0.0, "reward_change_mean": -0.2452379334717989, "reward_change_min": -0.45026662200689316, "reward_change_std": 0.17695352341979742, "reward_std": 0.7232431694865227, "rewards/cosine_scaled_reward": -0.04972273297607899, "rewards/format_reward": 0.7500000223517418, "step": 317 }, { "advantage_max": 1.2781926468014717, "advantage_mean": -7.14014017355602e-09, "advantage_min": -0.9811546318233013, "advantage_std": 0.8713476918637753, "completion_length": 2271.041717529297, "epoch": 0.36342857142857143, "grad_norm": 1.1370270252227783, "kl": 0.233673095703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.1693137748017915e-07, "loss": 0.0559, "reward": 0.6063612047582865, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6063612047582865, "reward_after_std": 0.8713476918637753, "reward_before_mean": 0.8839264828711748, "reward_before_std": 0.9011897668242455, "reward_change_max": 0.0, "reward_change_mean": -0.27756526693701744, "reward_change_min": -0.5454409923404455, "reward_change_std": 0.2054140493273735, "reward_std": 0.8713477253913879, "rewards/cosine_scaled_reward": 0.025296560488641262, "rewards/format_reward": 0.8333333488553762, "step": 318 }, { "advantage_max": 1.2086059525609016, "advantage_mean": 1.428027990302283e-08, "advantage_min": -0.6550127752125263, "advantage_std": 0.7075029835104942, "completion_length": 3063.2500915527344, "epoch": 0.36457142857142855, "grad_norm": 0.4142054617404938, "kl": 0.408905029296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.1393354916230005e-07, "loss": 0.0425, "reward": 0.10372621472924948, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10372621472924948, "reward_after_std": 0.7075029797852039, "reward_before_mean": 0.2803697426279541, "reward_before_std": 0.699725516140461, "reward_change_max": 0.00030569732189178467, "reward_change_mean": -0.17664350988343358, "reward_change_min": -0.33379580453038216, "reward_change_std": 0.1261939788237214, "reward_std": 0.7075030021369457, "rewards/cosine_scaled_reward": -0.17231514491140842, "rewards/format_reward": 0.6250000055879354, "step": 319 }, { "advantage_max": 1.011231355369091, "advantage_mean": -2.793967784908702e-08, "advantage_min": -0.8405797928571701, "advantage_std": 0.6433273144066334, "completion_length": 2278.5209197998047, "epoch": 0.3657142857142857, "grad_norm": 1.0842550992965698, "kl": 0.266357421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.1094235253127374e-07, "loss": -0.0046, "reward": 0.5482218451797962, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5482218451797962, "reward_after_std": 0.6433273293077946, "reward_before_mean": 0.8245733263902366, "reward_before_std": 0.6286395341157913, "reward_change_max": 0.0, "reward_change_mean": -0.27635145746171474, "reward_change_min": -0.43134531006217003, "reward_change_std": 0.16763820592314005, "reward_std": 0.6433273367583752, "rewards/cosine_scaled_reward": -0.014796690084040165, "rewards/format_reward": 0.8541666753590107, "step": 320 }, { "advantage_max": 1.5686568692326546, "advantage_mean": -6.829699028543246e-09, "advantage_min": -0.9620345383882523, "advantage_std": 0.920565202832222, "completion_length": 2385.395881652832, "epoch": 0.3668571428571429, "grad_norm": 0.8514863848686218, "kl": 0.350616455078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.079579333738039e-07, "loss": 0.0402, "reward": 0.6498768096789718, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6498768096789718, "reward_after_std": 0.9205651953816414, "reward_before_mean": 0.9244288019835949, "reward_before_std": 0.8969539999961853, "reward_change_max": 0.0, "reward_change_mean": -0.2745520006865263, "reward_change_min": -0.47161246463656425, "reward_change_std": 0.18094922229647636, "reward_std": 0.9205652624368668, "rewards/cosine_scaled_reward": 0.05596439470537007, "rewards/format_reward": 0.8125000223517418, "step": 321 }, { "advantage_max": 1.32938901335001, "advantage_mean": -1.428028034711204e-08, "advantage_min": -0.935857068747282, "advantage_std": 0.8315207101404667, "completion_length": 3076.666717529297, "epoch": 0.368, "grad_norm": 0.6694331169128418, "kl": 0.4154052734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.0498043714627006e-07, "loss": 0.0284, "reward": 0.4391724751330912, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4391724751330912, "reward_after_std": 0.8315207213163376, "reward_before_mean": 0.6800791984423995, "reward_before_std": 0.8322959132492542, "reward_change_max": 0.00024937838315963745, "reward_change_mean": -0.24090675078332424, "reward_change_min": -0.43763658218085766, "reward_change_std": 0.16967775207012892, "reward_std": 0.8315207473933697, "rewards/cosine_scaled_reward": -0.07662705332040787, "rewards/format_reward": 0.8333333507180214, "step": 322 }, { "advantage_max": 0.8951463475823402, "advantage_mean": -1.1175870950896893e-08, "advantage_min": -0.5553086809813976, "advantage_std": 0.5410531852394342, "completion_length": 2928.5209197998047, "epoch": 0.36914285714285716, "grad_norm": 0.6581751108169556, "kl": 0.425262451171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 4.020100089676376e-07, "loss": 0.0168, "reward": 0.18120288103818893, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.18120288103818893, "reward_after_std": 0.5410531796514988, "reward_before_mean": 0.38624573312699795, "reward_before_std": 0.5210823472589254, "reward_change_max": 0.0, "reward_change_mean": -0.2050428856164217, "reward_change_min": -0.3625640105456114, "reward_change_std": 0.13345395447686315, "reward_std": 0.5410531908273697, "rewards/cosine_scaled_reward": -0.1089604664593935, "rewards/format_reward": 0.6041666716337204, "step": 323 }, { "advantage_max": 1.1832123175263405, "advantage_mean": 4.035731193674508e-09, "advantage_min": -0.802801787853241, "advantage_std": 0.7257098276168108, "completion_length": 3118.979248046875, "epoch": 0.3702857142857143, "grad_norm": 0.7724043130874634, "kl": 0.521728515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.9904679361238526e-07, "loss": 0.0424, "reward": 0.2152748111402616, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2152748111402616, "reward_after_std": 0.7257098257541656, "reward_before_mean": 0.415889807860367, "reward_before_std": 0.7304543051868677, "reward_change_max": 0.0, "reward_change_mean": -0.20061499159783125, "reward_change_min": -0.3738654777407646, "reward_change_std": 0.14454056369140744, "reward_std": 0.7257098518311977, "rewards/cosine_scaled_reward": -0.11497177183628082, "rewards/format_reward": 0.6458333488553762, "step": 324 }, { "advantage_max": 1.1905155405402184, "advantage_mean": -2.4835267731226907e-09, "advantage_min": -0.6922871842980385, "advantage_std": 0.6870955415070057, "completion_length": 3030.0417098999023, "epoch": 0.37142857142857144, "grad_norm": 0.6713106036186218, "kl": 0.4742431640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.9609093550344907e-07, "loss": 0.0254, "reward": 0.04953139740973711, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04953139740973711, "reward_after_std": 0.6870955415070057, "reward_before_mean": 0.21542606968432665, "reward_before_std": 0.6781132407486439, "reward_change_max": 0.0, "reward_change_mean": -0.16589466854929924, "reward_change_min": -0.31020473316311836, "reward_change_std": 0.11704500950872898, "reward_std": 0.6870955489575863, "rewards/cosine_scaled_reward": -0.1527036428451538, "rewards/format_reward": 0.5208333469927311, "step": 325 }, { "advantage_max": 0.9004357382655144, "advantage_mean": -2.1420419743511943e-08, "advantage_min": -0.6643481366336346, "advantage_std": 0.5915771201252937, "completion_length": 2586.916717529297, "epoch": 0.37257142857142855, "grad_norm": 0.47564664483070374, "kl": 0.35687255859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.931425787051832e-07, "loss": 0.0327, "reward": 0.3928201788367005, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3928201788367005, "reward_after_std": 0.591577123850584, "reward_before_mean": 0.6418143566697836, "reward_before_std": 0.5820163637399673, "reward_change_max": 0.0, "reward_change_mean": -0.24899419397115707, "reward_change_min": -0.4311428405344486, "reward_change_std": 0.15894273854792118, "reward_std": 0.5915771350264549, "rewards/cosine_scaled_reward": -0.09575949981808662, "rewards/format_reward": 0.8333333432674408, "step": 326 }, { "advantage_max": 1.1272512525320053, "advantage_mean": -1.73846881335038e-08, "advantage_min": -0.8326604291796684, "advantage_std": 0.7054290883243084, "completion_length": 2774.7709197998047, "epoch": 0.3737142857142857, "grad_norm": 0.654158353805542, "kl": 0.3751220703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.902018669163384e-07, "loss": 0.0321, "reward": 0.8073004670441151, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8073004670441151, "reward_after_std": 0.7054291032254696, "reward_before_mean": 1.1331024691462517, "reward_before_std": 0.6673686243593693, "reward_change_max": 0.0, "reward_change_mean": -0.3258019909262657, "reward_change_min": -0.5110926926136017, "reward_change_std": 0.20145599078387022, "reward_std": 0.7054291255772114, "rewards/cosine_scaled_reward": 0.1498845461755991, "rewards/format_reward": 0.8333333469927311, "step": 327 }, { "advantage_max": 0.863483127206564, "advantage_mean": 5.587935947293232e-09, "advantage_min": -0.5666074939072132, "advantage_std": 0.550336167216301, "completion_length": 3289.354217529297, "epoch": 0.37485714285714283, "grad_norm": 0.40639927983283997, "kl": 0.466796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.872689434630585e-07, "loss": 0.0342, "reward": -0.21721973177045584, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.21721973177045584, "reward_after_std": 0.550336167216301, "reward_before_mean": -0.09404684603214264, "reward_before_std": 0.5687927044928074, "reward_change_max": 0.00017432868480682373, "reward_change_mean": -0.1231728857383132, "reward_change_min": -0.2842960599809885, "reward_change_std": 0.10683112032711506, "reward_std": 0.5503361783921719, "rewards/cosine_scaled_reward": -0.22410675883293152, "rewards/format_reward": 0.354166679084301, "step": 328 }, { "advantage_max": 1.1593385338783264, "advantage_mean": -3.6632021249705105e-08, "advantage_min": -0.9056679531931877, "advantage_std": 0.7783106043934822, "completion_length": 2319.5833587646484, "epoch": 0.376, "grad_norm": 1.2089651823043823, "kl": 0.3541259765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.843439512918949e-07, "loss": 0.068, "reward": 0.6148182256147265, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6148182256147265, "reward_after_std": 0.7783105969429016, "reward_before_mean": 0.898950282484293, "reward_before_std": 0.7883465178310871, "reward_change_max": 0.0, "reward_change_mean": -0.2841320354491472, "reward_change_min": -0.5121825449168682, "reward_change_std": 0.19760629907250404, "reward_std": 0.7783106341958046, "rewards/cosine_scaled_reward": 0.1161417793482542, "rewards/format_reward": 0.6666666828095913, "step": 329 }, { "advantage_max": 1.3785473480820656, "advantage_mean": -1.241763458725842e-08, "advantage_min": -0.670930914580822, "advantage_std": 0.7409147545695305, "completion_length": 2674.9167098999023, "epoch": 0.37714285714285717, "grad_norm": 0.4288341999053955, "kl": 0.318695068359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.8142703296283953e-07, "loss": 0.0262, "reward": 0.20354478422086686, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20354478422086686, "reward_after_std": 0.7409147545695305, "reward_before_mean": 0.39301371574401855, "reward_before_std": 0.7014989331364632, "reward_change_max": 0.0001328885555267334, "reward_change_mean": -0.18946894630789757, "reward_change_min": -0.3107476755976677, "reward_change_std": 0.1183566078543663, "reward_std": 0.740914773195982, "rewards/cosine_scaled_reward": -0.157659818418324, "rewards/format_reward": 0.708333345130086, "step": 330 }, { "advantage_max": 0.9791850298643112, "advantage_mean": -1.5211601867015645e-08, "advantage_min": -0.6504289247095585, "advantage_std": 0.5825454797595739, "completion_length": 2798.354217529297, "epoch": 0.3782857142857143, "grad_norm": 0.4488306939601898, "kl": 0.367401123046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.785183306423767e-07, "loss": 0.0448, "reward": 0.09584091976284981, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09584091976284981, "reward_after_std": 0.5825454816222191, "reward_before_mean": 0.27970938943326473, "reward_before_std": 0.5661040190607309, "reward_change_max": 0.00010569393634796143, "reward_change_mean": -0.18386848643422127, "reward_change_min": -0.31409010104835033, "reward_change_std": 0.12024927698075771, "reward_std": 0.5825455076992512, "rewards/cosine_scaled_reward": -0.1205619778484106, "rewards/format_reward": 0.520833345130086, "step": 331 }, { "advantage_max": 1.2895566672086716, "advantage_mean": -2.0799537703286575e-08, "advantage_min": -0.7044409960508347, "advantage_std": 0.7655586674809456, "completion_length": 2789.4584045410156, "epoch": 0.37942857142857145, "grad_norm": 0.8093065619468689, "kl": 0.3294219970703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.7561798609655373e-07, "loss": 0.0521, "reward": 0.5844221506267786, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5844221506267786, "reward_after_std": 0.7655586712062359, "reward_before_mean": 0.8579580970108509, "reward_before_std": 0.7273687534034252, "reward_change_max": 0.0, "reward_change_mean": -0.27353594824671745, "reward_change_min": -0.49232082441449165, "reward_change_std": 0.17840590421110392, "reward_std": 0.7655586935579777, "rewards/cosine_scaled_reward": 0.012312370701692998, "rewards/format_reward": 0.8333333488553762, "step": 332 }, { "advantage_max": 1.3430612981319427, "advantage_mean": -3.2906731617377005e-08, "advantage_min": -0.8295051567256451, "advantage_std": 0.8054756931960583, "completion_length": 2414.3959045410156, "epoch": 0.38057142857142856, "grad_norm": 0.8955134153366089, "kl": 0.2362060546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.72726140684072e-07, "loss": 0.0748, "reward": 0.5596026126295328, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5596026126295328, "reward_after_std": 0.805475689470768, "reward_before_mean": 0.8250005207955837, "reward_before_std": 0.7849955596029758, "reward_change_max": 0.0, "reward_change_mean": -0.2653979305177927, "reward_change_min": -0.4618973508477211, "reward_change_std": 0.172137257643044, "reward_std": 0.8054757341742516, "rewards/cosine_scaled_reward": -0.05624975264072418, "rewards/format_reward": 0.9375000149011612, "step": 333 }, { "advantage_max": 1.144443653523922, "advantage_mean": -9.934107647602275e-09, "advantage_min": -0.9086218178272247, "advantage_std": 0.772247925400734, "completion_length": 3215.7708740234375, "epoch": 0.38171428571428573, "grad_norm": 0.4409116208553314, "kl": 0.5167236328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.6984293534939737e-07, "loss": 0.0442, "reward": -0.03127163369208574, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.03127163369208574, "reward_after_std": 0.7722479328513145, "reward_before_mean": 0.1197983508463949, "reward_before_std": 0.821826346218586, "reward_change_max": 0.0003597959876060486, "reward_change_mean": -0.15106998570263386, "reward_change_min": -0.3818219192326069, "reward_change_std": 0.1507408181205392, "reward_std": 0.7722479701042175, "rewards/cosine_scaled_reward": -0.20051748771220446, "rewards/format_reward": 0.520833345130086, "step": 334 }, { "advantage_max": 1.3306704387068748, "advantage_mean": 2.483526606589237e-09, "advantage_min": -0.7545684538781643, "advantage_std": 0.772412870079279, "completion_length": 2627.3333587646484, "epoch": 0.38285714285714284, "grad_norm": 0.5144142508506775, "kl": 0.2955322265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.6696851061588994e-07, "loss": 0.0237, "reward": 0.5178827093914151, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5178827093914151, "reward_after_std": 0.7724128775298595, "reward_before_mean": 0.775667869951576, "reward_before_std": 0.7353975214064121, "reward_change_max": 0.00017014145851135254, "reward_change_mean": -0.25778517220169306, "reward_change_min": -0.4666583947837353, "reward_change_std": 0.1713226563297212, "reward_std": 0.7724129036068916, "rewards/cosine_scaled_reward": 0.0024172652047127485, "rewards/format_reward": 0.7708333432674408, "step": 335 }, { "advantage_max": 1.566083051264286, "advantage_mean": -6.022552695439387e-08, "advantage_min": -1.0432822108268738, "advantage_std": 0.9853878915309906, "completion_length": 2952.6459350585938, "epoch": 0.384, "grad_norm": 1.2873221635818481, "kl": 0.30908203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.641030065789562e-07, "loss": 0.0561, "reward": 0.6863108254037797, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6863108254037797, "reward_after_std": 0.9853878952562809, "reward_before_mean": 0.9682151582092047, "reward_before_std": 0.9924081414937973, "reward_change_max": 0.0, "reward_change_mean": -0.2819043677300215, "reward_change_min": -0.541389087215066, "reward_change_std": 0.20822805631905794, "reward_std": 0.9853879250586033, "rewards/cosine_scaled_reward": 0.08827423304319382, "rewards/format_reward": 0.791666679084301, "step": 336 }, { "advantage_max": 1.2723659500479698, "advantage_mean": -2.23517424569053e-08, "advantage_min": -0.87659877166152, "advantage_std": 0.8086981289088726, "completion_length": 3060.1250610351562, "epoch": 0.3851428571428571, "grad_norm": 0.570880651473999, "kl": 0.50115966796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.612465628992203e-07, "loss": 0.0392, "reward": 0.30738167371600866, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.30738167371600866, "reward_after_std": 0.8086981289088726, "reward_before_mean": 0.5230182525701821, "reward_before_std": 0.8208495676517487, "reward_change_max": 0.0006974413990974426, "reward_change_mean": -0.21563658583909273, "reward_change_min": -0.4076352324336767, "reward_change_std": 0.16335770674049854, "reward_std": 0.8086981736123562, "rewards/cosine_scaled_reward": -0.08224089071154594, "rewards/format_reward": 0.6875000074505806, "step": 337 }, { "advantage_max": 1.328472100198269, "advantage_mean": 6.208817571184966e-09, "advantage_min": -0.732303611934185, "advantage_std": 0.799540875479579, "completion_length": 2439.5209197998047, "epoch": 0.3862857142857143, "grad_norm": 0.6737061142921448, "kl": 0.34918212890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.5839931879571725e-07, "loss": 0.0293, "reward": 0.5488169081509113, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5488169081509113, "reward_after_std": 0.7995408792048693, "reward_before_mean": 0.8132592551410198, "reward_before_std": 0.7790190260857344, "reward_change_max": 0.0, "reward_change_mean": -0.26444234885275364, "reward_change_min": -0.4892202354967594, "reward_change_std": 0.18026553001254797, "reward_std": 0.7995409090071917, "rewards/cosine_scaled_reward": 0.05246296781115234, "rewards/format_reward": 0.7083333395421505, "step": 338 }, { "advantage_max": 0.8232481442391872, "advantage_mean": 4.6566128730773926e-09, "advantage_min": -0.6889732703566551, "advantage_std": 0.5477366112172604, "completion_length": 3039.895866394043, "epoch": 0.38742857142857146, "grad_norm": 0.6333633065223694, "kl": 0.481689453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.555614130391079e-07, "loss": 0.0376, "reward": -0.011264028958976269, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.011264028958976269, "reward_after_std": 0.5477366037666798, "reward_before_mean": 0.1567919987719506, "reward_before_std": 0.5631052926182747, "reward_change_max": 0.0, "reward_change_mean": -0.16805602610111237, "reward_change_min": -0.3233498651534319, "reward_change_std": 0.12449631281197071, "reward_std": 0.5477366112172604, "rewards/cosine_scaled_reward": -0.17160400934517384, "rewards/format_reward": 0.500000013038516, "step": 339 }, { "advantage_max": 1.236702598631382, "advantage_mean": -1.6142924941231485e-08, "advantage_min": -0.6286331415176392, "advantage_std": 0.7274463996291161, "completion_length": 3001.3125915527344, "epoch": 0.38857142857142857, "grad_norm": 0.49938488006591797, "kl": 0.435302734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.5273298394491515e-07, "loss": 0.0465, "reward": 0.4386264495551586, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4386264495551586, "reward_after_std": 0.7274463996291161, "reward_before_mean": 0.6835720241069794, "reward_before_std": 0.7010614164173603, "reward_change_max": 0.0, "reward_change_mean": -0.244945558719337, "reward_change_min": -0.43890780955553055, "reward_change_std": 0.1548476442694664, "reward_std": 0.7274464033544064, "rewards/cosine_scaled_reward": -0.0644640102982521, "rewards/format_reward": 0.812500013038516, "step": 340 }, { "advantage_max": 1.156467616558075, "advantage_mean": -2.3903947127257297e-08, "advantage_min": -1.026138313114643, "advantage_std": 0.7706930190324783, "completion_length": 2587.541717529297, "epoch": 0.38971428571428574, "grad_norm": 0.9054402112960815, "kl": 0.297088623046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.4991416936678276e-07, "loss": 0.037, "reward": 0.7464197538793087, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7464197538793087, "reward_after_std": 0.7706930041313171, "reward_before_mean": 1.0588255478069186, "reward_before_std": 0.7797860540449619, "reward_change_max": 0.0002677813172340393, "reward_change_mean": -0.3124057734385133, "reward_change_min": -0.5072425659745932, "reward_change_std": 0.2082651173695922, "reward_std": 0.7706930078566074, "rewards/cosine_scaled_reward": 0.175246087834239, "rewards/format_reward": 0.7083333432674408, "step": 341 }, { "advantage_max": 1.2979220375418663, "advantage_mean": -5.898376453927767e-09, "advantage_min": -0.6838219463825226, "advantage_std": 0.7472971193492413, "completion_length": 2820.333396911621, "epoch": 0.39085714285714285, "grad_norm": 0.9386263489723206, "kl": 0.490142822265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.471051066897562e-07, "loss": 0.0713, "reward": 0.28842813009396195, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.28842813009396195, "reward_after_std": 0.7472970969974995, "reward_before_mean": 0.49761716183274984, "reward_before_std": 0.7262459360063076, "reward_change_max": 0.0, "reward_change_mean": -0.20918904338032007, "reward_change_min": -0.3865237608551979, "reward_change_std": 0.1397272665053606, "reward_std": 0.7472971044480801, "rewards/cosine_scaled_reward": -0.12619143165647984, "rewards/format_reward": 0.7500000186264515, "step": 342 }, { "advantage_max": 1.437583826482296, "advantage_mean": -1.024454859832602e-08, "advantage_min": -0.9720087945461273, "advantage_std": 0.8712345883250237, "completion_length": 3139.8959350585938, "epoch": 0.392, "grad_norm": 1.1243489980697632, "kl": 0.4140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.4430593282358777e-07, "loss": 0.0634, "reward": 0.632043020799756, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.632043020799756, "reward_after_std": 0.871234592050314, "reward_before_mean": 0.9088306780904531, "reward_before_std": 0.8549764901399612, "reward_change_max": 0.0, "reward_change_mean": -0.27678765542805195, "reward_change_min": -0.46818885393440723, "reward_change_std": 0.18712846003472805, "reward_std": 0.8712346069514751, "rewards/cosine_scaled_reward": 0.12108199740760028, "rewards/format_reward": 0.6666666772216558, "step": 343 }, { "advantage_max": 1.1282891780138016, "advantage_mean": -4.097819450432638e-08, "advantage_min": -0.6967478543519974, "advantage_std": 0.6700936332345009, "completion_length": 2371.0208892822266, "epoch": 0.3931428571428571, "grad_norm": 0.4421378970146179, "kl": 0.31829833984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.4151678419606233e-07, "loss": 0.0246, "reward": 0.9737739209085703, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.9737739209085703, "reward_after_std": 0.6700936630368233, "reward_before_mean": 1.3351922817528248, "reward_before_std": 0.6023357845842838, "reward_change_max": 0.0, "reward_change_mean": -0.3614183757454157, "reward_change_min": -0.5266637336462736, "reward_change_std": 0.20244834944605827, "reward_std": 0.6700937002897263, "rewards/cosine_scaled_reward": 0.24051279202103615, "rewards/format_reward": 0.854166679084301, "step": 344 }, { "advantage_max": 1.2971906512975693, "advantage_mean": -1.5211601755993343e-08, "advantage_min": -0.812989853322506, "advantage_std": 0.789570115506649, "completion_length": 3051.916763305664, "epoch": 0.3942857142857143, "grad_norm": 1.0356272459030151, "kl": 0.50445556640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.387377967463493e-07, "loss": 0.0108, "reward": 0.7054811893031001, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7054811893031001, "reward_after_std": 0.7895701378583908, "reward_before_mean": 1.0037767849862576, "reward_before_std": 0.7595403417944908, "reward_change_max": 0.0, "reward_change_mean": -0.29829559661448, "reward_change_min": -0.487414488568902, "reward_change_std": 0.19216558057814837, "reward_std": 0.7895701602101326, "rewards/cosine_scaled_reward": 0.12688838969916105, "rewards/format_reward": 0.7500000149011612, "step": 345 }, { "advantage_max": 1.093222588300705, "advantage_mean": -1.5832483879485437e-08, "advantage_min": -0.814299151301384, "advantage_std": 0.7042220123112202, "completion_length": 3006.9375915527344, "epoch": 0.3954285714285714, "grad_norm": 0.5911859273910522, "kl": 0.541259765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.359691059183761e-07, "loss": 0.045, "reward": 0.22268605418503284, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.22268605418503284, "reward_after_std": 0.7042220011353493, "reward_before_mean": 0.4276978559792042, "reward_before_std": 0.7179974764585495, "reward_change_max": 0.00036253780126571655, "reward_change_mean": -0.20501180551946163, "reward_change_min": -0.3955332338809967, "reward_change_std": 0.15684256795793772, "reward_std": 0.7042220234870911, "rewards/cosine_scaled_reward": -0.12990108225494623, "rewards/format_reward": 0.6875000111758709, "step": 346 }, { "advantage_max": 1.1199750006198883, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.7925616055727005, "advantage_std": 0.6887125670909882, "completion_length": 3116.6458740234375, "epoch": 0.3965714285714286, "grad_norm": 0.9110195636749268, "kl": 0.51123046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.3321084665422803e-07, "loss": 0.0241, "reward": 0.32749871275154874, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.32749871275154874, "reward_after_std": 0.6887125447392464, "reward_before_mean": 0.5540672373026609, "reward_before_std": 0.6819815710186958, "reward_change_max": 1.1220574378967285e-05, "reward_change_mean": -0.22656850516796112, "reward_change_min": -0.39756184443831444, "reward_change_std": 0.15045985206961632, "reward_std": 0.6887125596404076, "rewards/cosine_scaled_reward": -0.11879973486065865, "rewards/format_reward": 0.7916666865348816, "step": 347 }, { "advantage_max": 1.0748011618852615, "advantage_mean": -1.5522041707516365e-09, "advantage_min": -0.783403791487217, "advantage_std": 0.6752111651003361, "completion_length": 2868.520896911621, "epoch": 0.3977142857142857, "grad_norm": 0.63515305519104, "kl": 0.509979248046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.3046315338757026e-07, "loss": 0.0479, "reward": 0.20603055600076914, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20603055600076914, "reward_after_std": 0.6752111651003361, "reward_before_mean": 0.40932375006377697, "reward_before_std": 0.6790587641298771, "reward_change_max": 0.0003105774521827698, "reward_change_mean": -0.20329320384189487, "reward_change_min": -0.38921800442039967, "reward_change_std": 0.14643870294094086, "reward_std": 0.6752111837267876, "rewards/cosine_scaled_reward": -0.18075479287654161, "rewards/format_reward": 0.7708333488553762, "step": 348 }, { "advantage_max": 1.4406801536679268, "advantage_mean": -1.7384688022481498e-08, "advantage_min": -0.6087111514061689, "advantage_std": 0.7680649720132351, "completion_length": 3036.7084350585938, "epoch": 0.39885714285714285, "grad_norm": 0.566215455532074, "kl": 0.40716552734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.2772616003709616e-07, "loss": 0.0366, "reward": 0.3477841697167605, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3477841697167605, "reward_after_std": 0.7680649943649769, "reward_before_mean": 0.5653798468410969, "reward_before_std": 0.7107026390731335, "reward_change_max": 0.0, "reward_change_mean": -0.21759567223489285, "reward_change_min": -0.363783435896039, "reward_change_std": 0.12816297076642513, "reward_std": 0.7680650278925896, "rewards/cosine_scaled_reward": -0.11314341984689236, "rewards/format_reward": 0.7916666772216558, "step": 349 }, { "advantage_max": 1.3873028084635735, "advantage_mean": -1.800557009046244e-08, "advantage_min": -0.8304652981460094, "advantage_std": 0.8284526132047176, "completion_length": 2645.7500610351562, "epoch": 0.4, "grad_norm": 0.7734155654907227, "kl": 0.50872802734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.250000000000001e-07, "loss": 0.0392, "reward": 0.27224957942962646, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.27224957942962646, "reward_after_std": 0.8284526206552982, "reward_before_mean": 0.47611718252301216, "reward_before_std": 0.8236043527722359, "reward_change_max": 0.00037673860788345337, "reward_change_mean": -0.2038676030933857, "reward_change_min": -0.3996240645647049, "reward_change_std": 0.1497228485532105, "reward_std": 0.8284526281058788, "rewards/cosine_scaled_reward": -0.1369414208456874, "rewards/format_reward": 0.7500000204890966, "step": 350 }, { "advantage_max": 1.0394860319793224, "advantage_mean": 1.1796753240922442e-08, "advantage_min": -1.0615929793566465, "advantage_std": 0.7572151720523834, "completion_length": 2793.354248046875, "epoch": 0.40114285714285713, "grad_norm": 0.6227604746818542, "kl": 0.32049560546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.222848061454764e-07, "loss": 0.0228, "reward": 0.4829323529265821, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4829323529265821, "reward_after_std": 0.757215166464448, "reward_before_mean": 0.7447547446936369, "reward_before_std": 0.7892076782882214, "reward_change_max": 0.0006700903177261353, "reward_change_mean": -0.2618223810568452, "reward_change_min": -0.46100811660289764, "reward_change_std": 0.1924031707458198, "reward_std": 0.7572151944041252, "rewards/cosine_scaled_reward": -0.023455968126654625, "rewards/format_reward": 0.7916666865348816, "step": 351 }, { "advantage_max": 1.1272470727562904, "advantage_mean": -3.725290387279756e-08, "advantage_min": -0.6083212364464998, "advantage_std": 0.6574210226535797, "completion_length": 2952.8959045410156, "epoch": 0.4022857142857143, "grad_norm": 0.404922217130661, "kl": 0.44024658203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.195807108082429e-07, "loss": 0.0353, "reward": 0.4087382801808417, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4087382801808417, "reward_after_std": 0.6574210189282894, "reward_before_mean": 0.6517594940960407, "reward_before_std": 0.6211025454103947, "reward_change_max": 0.0002701207995414734, "reward_change_mean": -0.24302126793190837, "reward_change_min": -0.41978174448013306, "reward_change_std": 0.1542210541665554, "reward_std": 0.6574210450053215, "rewards/cosine_scaled_reward": -0.01787025574594736, "rewards/format_reward": 0.6875000093132257, "step": 352 }, { "advantage_max": 1.2669169083237648, "advantage_mean": -3.0423205343854676e-08, "advantage_min": -1.0604734122753143, "advantage_std": 0.8432779721915722, "completion_length": 2252.1667289733887, "epoch": 0.4034285714285714, "grad_norm": 0.5039982199668884, "kl": 0.342376708984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.168878457820915e-07, "loss": 0.0192, "reward": 0.4735553851351142, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4735553851351142, "reward_after_std": 0.8432779870927334, "reward_before_mean": 0.7230888027697802, "reward_before_std": 0.8688988424837589, "reward_change_max": 0.000867106020450592, "reward_change_mean": -0.24953339993953705, "reward_change_min": -0.4594745561480522, "reward_change_std": 0.188162581063807, "reward_std": 0.8432780019938946, "rewards/cosine_scaled_reward": 0.028211036697030067, "rewards/format_reward": 0.6666666809469461, "step": 353 }, { "advantage_max": 1.009014643728733, "advantage_mean": -2.5145710624840945e-08, "advantage_min": -0.6482978910207748, "advantage_std": 0.6029532328248024, "completion_length": 2492.5000610351562, "epoch": 0.4045714285714286, "grad_norm": 0.5295609831809998, "kl": 0.3319091796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.142063423134644e-07, "loss": 0.0091, "reward": 0.7520541350822896, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7520541350822896, "reward_after_std": 0.6029532514512539, "reward_before_mean": 1.0717348214238882, "reward_before_std": 0.5433379076421261, "reward_change_max": 0.0, "reward_change_mean": -0.3196806572377682, "reward_change_min": -0.47022315859794617, "reward_change_std": 0.18330338411033154, "reward_std": 0.6029532812535763, "rewards/cosine_scaled_reward": 0.16086738382000476, "rewards/format_reward": 0.7500000055879354, "step": 354 }, { "advantage_max": 1.4011807888746262, "advantage_mean": -1.1102230246251565e-16, "advantage_min": -0.7424768507480621, "advantage_std": 0.7783384248614311, "completion_length": 2580.7709045410156, "epoch": 0.4057142857142857, "grad_norm": 0.4905085861682892, "kl": 0.389404296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.115363310950578e-07, "loss": 0.0123, "reward": 0.4382380060851574, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4382380060851574, "reward_after_std": 0.7783384323120117, "reward_before_mean": 0.6740993373095989, "reward_before_std": 0.7306386679410934, "reward_change_max": 0.001161329448223114, "reward_change_mean": -0.23586132749915123, "reward_change_min": -0.39404071122407913, "reward_change_std": 0.15169959026388824, "reward_std": 0.7783384546637535, "rewards/cosine_scaled_reward": -0.01711699180305004, "rewards/format_reward": 0.7083333432674408, "step": 355 }, { "advantage_max": 1.3648847937583923, "advantage_mean": -1.1175871117430347e-08, "advantage_min": -0.8897219002246857, "advantage_std": 0.8245714977383614, "completion_length": 2772.1875915527344, "epoch": 0.40685714285714286, "grad_norm": 0.4117547571659088, "kl": 0.36395263671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.0887794225945143e-07, "loss": 0.0258, "reward": 0.39002780988812447, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.39002780988812447, "reward_after_std": 0.8245715126395226, "reward_before_mean": 0.6192103186622262, "reward_before_std": 0.8143918961286545, "reward_change_max": 0.00021146982908248901, "reward_change_mean": -0.22918251249939203, "reward_change_min": -0.41135019063949585, "reward_change_std": 0.16032557655125856, "reward_std": 0.8245715387165546, "rewards/cosine_scaled_reward": -0.023728182539343834, "rewards/format_reward": 0.6666666809469461, "step": 356 }, { "advantage_max": 1.139050617814064, "advantage_mean": -2.2817403466657282e-08, "advantage_min": -0.9274939596652985, "advantage_std": 0.7556623220443726, "completion_length": 3195.395965576172, "epoch": 0.408, "grad_norm": 0.5502464771270752, "kl": 0.4244384765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.062313053727671e-07, "loss": 0.006, "reward": 0.3339738103095442, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3339738103095442, "reward_after_std": 0.7556623220443726, "reward_before_mean": 0.5612023607827723, "reward_before_std": 0.7768936641514301, "reward_change_max": 0.0002397596836090088, "reward_change_mean": -0.22722852230072021, "reward_change_min": -0.4128516409546137, "reward_change_std": 0.16769019234925508, "reward_std": 0.7556623332202435, "rewards/cosine_scaled_reward": -0.12564883194863796, "rewards/format_reward": 0.8125000149011612, "step": 357 }, { "advantage_max": 1.1580252274870872, "advantage_mean": 1.8626451603331873e-08, "advantage_min": -0.9382319673895836, "advantage_std": 0.8013856522738934, "completion_length": 2818.5209197998047, "epoch": 0.40914285714285714, "grad_norm": 0.6733781695365906, "kl": 0.3752593994140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.0359654942835247e-07, "loss": 0.0264, "reward": 0.7400212744250894, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7400212744250894, "reward_after_std": 0.8013856336474419, "reward_before_mean": 1.050462001003325, "reward_before_std": 0.8101890720427036, "reward_change_max": 0.00038573890924453735, "reward_change_mean": -0.3104406874626875, "reward_change_min": -0.5300403535366058, "reward_change_std": 0.2151788305491209, "reward_std": 0.8013856410980225, "rewards/cosine_scaled_reward": 0.13981433119624853, "rewards/format_reward": 0.7708333507180214, "step": 358 }, { "advantage_max": 1.0305150002241135, "advantage_mean": -1.6453365975221956e-08, "advantage_min": -0.7308261096477509, "advantage_std": 0.6315893270075321, "completion_length": 2810.375030517578, "epoch": 0.4102857142857143, "grad_norm": 0.6957297921180725, "kl": 0.3267822265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 3.0097380284049523e-07, "loss": 0.0004, "reward": 0.4991975088487379, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4991975088487379, "reward_after_std": 0.631589338183403, "reward_before_mean": 0.765597203746438, "reward_before_std": 0.5987658575177193, "reward_change_max": 0.0, "reward_change_mean": -0.26639969274401665, "reward_change_min": -0.43839479982852936, "reward_change_std": 0.16410325560718775, "reward_std": 0.6315893642604351, "rewards/cosine_scaled_reward": -0.02345141861587763, "rewards/format_reward": 0.8125000149011612, "step": 359 }, { "advantage_max": 1.572924166917801, "advantage_mean": -4.718701185346674e-08, "advantage_min": -1.0755107551813126, "advantage_std": 1.000184077769518, "completion_length": 2994.229217529297, "epoch": 0.4114285714285714, "grad_norm": 1.2385178804397583, "kl": 0.406982421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.9836319343816397e-07, "loss": 0.0516, "reward": 0.8811982152983546, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8811982152983546, "reward_after_std": 1.0001840852200985, "reward_before_mean": 1.202958945184946, "reward_before_std": 1.00409185141325, "reward_change_max": 0.00045154988765716553, "reward_change_mean": -0.32176076620817184, "reward_change_min": -0.5560223925858736, "reward_change_std": 0.21943911630660295, "reward_std": 1.0001841150224209, "rewards/cosine_scaled_reward": 0.16397946886718273, "rewards/format_reward": 0.8750000149011612, "step": 360 }, { "advantage_max": 1.2372961044311523, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.7638551630079746, "advantage_std": 0.7821529917418957, "completion_length": 3017.6875610351562, "epoch": 0.4125714285714286, "grad_norm": 0.825147271156311, "kl": 0.360595703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.9576484845877793e-07, "loss": -0.0056, "reward": 0.659533898695372, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.659533898695372, "reward_after_std": 0.7821530178189278, "reward_before_mean": 0.9500355757772923, "reward_before_std": 0.7725023068487644, "reward_change_max": 0.0, "reward_change_mean": -0.29050169326364994, "reward_change_min": -0.5486499518156052, "reward_change_std": 0.19475482683628798, "reward_std": 0.7821530364453793, "rewards/cosine_scaled_reward": 0.07918446906842291, "rewards/format_reward": 0.7916666679084301, "step": 361 }, { "advantage_max": 0.7831064909696579, "advantage_mean": -4.346172199909404e-09, "advantage_min": -0.4822607636451721, "advantage_std": 0.4585796631872654, "completion_length": 1946.2500457763672, "epoch": 0.4137142857142857, "grad_norm": 0.5887108445167542, "kl": 0.28375244140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.931788945420058e-07, "loss": 0.0079, "reward": 0.5409697405993938, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5409697405993938, "reward_after_std": 0.4585796520113945, "reward_before_mean": 0.8254151255823672, "reward_before_std": 0.394386338070035, "reward_change_max": 0.0, "reward_change_mean": -0.28444540221244097, "reward_change_min": -0.429168276488781, "reward_change_std": 0.15958709362894297, "reward_std": 0.458579670637846, "rewards/cosine_scaled_reward": 0.01687422301620245, "rewards/format_reward": 0.7916666772216558, "step": 362 }, { "advantage_max": 1.3466171473264694, "advantage_mean": 3.4148495420271985e-09, "advantage_min": -0.8489307016134262, "advantage_std": 0.7919553387910128, "completion_length": 2327.125045776367, "epoch": 0.41485714285714287, "grad_norm": 1.1246758699417114, "kl": 0.2977294921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.9060545772359305e-07, "loss": 0.0718, "reward": 0.4264291226863861, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4264291226863861, "reward_after_std": 0.7919553834944963, "reward_before_mean": 0.6637498394120485, "reward_before_std": 0.7764361146837473, "reward_change_max": 0.00021463632583618164, "reward_change_mean": -0.23732068575918674, "reward_change_min": -0.40322399884462357, "reward_change_std": 0.1559577500447631, "reward_std": 0.7919553928077221, "rewards/cosine_scaled_reward": -0.043125099036842585, "rewards/format_reward": 0.750000013038516, "step": 363 }, { "advantage_max": 0.7658668607473373, "advantage_mean": 6.208816238917336e-10, "advantage_min": -0.5104176066815853, "advantage_std": 0.4773901328444481, "completion_length": 3035.5000610351562, "epoch": 0.416, "grad_norm": 0.8001071810722351, "kl": 0.3780517578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.8804466342921987e-07, "loss": 0.0156, "reward": 0.012055948376655579, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.012055948376655579, "reward_after_std": 0.4773901328444481, "reward_before_mean": 0.18769288063049316, "reward_before_std": 0.4666346423327923, "reward_change_max": 0.0, "reward_change_mean": -0.1756369285285473, "reward_change_min": -0.3073525782674551, "reward_change_std": 0.11426474433392286, "reward_std": 0.4773901589214802, "rewards/cosine_scaled_reward": -0.24990356341004372, "rewards/format_reward": 0.687500013038516, "step": 364 }, { "advantage_max": 0.9230462163686752, "advantage_mean": 1.2417635808503746e-09, "advantage_min": -0.8146604299545288, "advantage_std": 0.6262001693248749, "completion_length": 3016.0208892822266, "epoch": 0.41714285714285715, "grad_norm": 0.809684693813324, "kl": 0.3709716796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.854966364683872e-07, "loss": 0.0156, "reward": 0.2725785132497549, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2725785132497549, "reward_after_std": 0.6262001693248749, "reward_before_mean": 0.4956631362438202, "reward_before_std": 0.6411803290247917, "reward_change_max": 0.00024246424436569214, "reward_change_mean": -0.22308461740612984, "reward_change_min": -0.37726784870028496, "reward_change_std": 0.15289299003779888, "reward_std": 0.6262001767754555, "rewards/cosine_scaled_reward": -0.05425176955759525, "rewards/format_reward": 0.6041666846722364, "step": 365 }, { "advantage_max": 1.172032117843628, "advantage_mean": -1.8626450382086546e-09, "advantage_min": -1.1060462072491646, "advantage_std": 0.801375288516283, "completion_length": 2313.3750534057617, "epoch": 0.41828571428571426, "grad_norm": 0.970806896686554, "kl": 0.211883544921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.829615010283344e-07, "loss": 0.0475, "reward": 0.8352093771100044, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8352093771100044, "reward_after_std": 0.8013752736151218, "reward_before_mean": 1.1647344306111336, "reward_before_std": 0.810756042599678, "reward_change_max": 0.0, "reward_change_mean": -0.32952504977583885, "reward_change_min": -0.5507253147661686, "reward_change_std": 0.22081815171986818, "reward_std": 0.8013753145933151, "rewards/cosine_scaled_reward": 0.2698672176338732, "rewards/format_reward": 0.6250000093132257, "step": 366 }, { "advantage_max": 1.4824132844805717, "advantage_mean": -1.738468857759301e-08, "advantage_min": -1.0441878736019135, "advantage_std": 0.9508164152503014, "completion_length": 3036.2084045410156, "epoch": 0.41942857142857143, "grad_norm": 2.3765110969543457, "kl": 0.33184814453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.8043938066798645e-07, "loss": 0.0876, "reward": 0.223943829536438, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.223943829536438, "reward_after_std": 0.9508164413273335, "reward_before_mean": 0.41291727498173714, "reward_before_std": 0.9962159134447575, "reward_change_max": 0.0006453990936279297, "reward_change_mean": -0.18897347524762154, "reward_change_min": -0.4371208883821964, "reward_change_std": 0.17967351153492928, "reward_std": 0.9508164711296558, "rewards/cosine_scaled_reward": -0.08520803367719054, "rewards/format_reward": 0.5833333469927311, "step": 367 }, { "advantage_max": 0.8602950386703014, "advantage_mean": -9.313227133933566e-10, "advantage_min": -0.5316635891795158, "advantage_std": 0.49776069819927216, "completion_length": 3087.75, "epoch": 0.4205714285714286, "grad_norm": 0.7006824612617493, "kl": 0.35516357421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.7793039831193133e-07, "loss": 0.0094, "reward": 0.10069862008094788, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10069862008094788, "reward_after_std": 0.49776070564985275, "reward_before_mean": 0.29084074491402134, "reward_before_std": 0.466449361294508, "reward_change_max": 0.0005777925252914429, "reward_change_mean": -0.1901421444490552, "reward_change_min": -0.29943757504224777, "reward_change_std": 0.12012495519593358, "reward_std": 0.49776072427630424, "rewards/cosine_scaled_reward": -0.1045796126127243, "rewards/format_reward": 0.5000000055879354, "step": 368 }, { "advantage_max": 1.5716371908783913, "advantage_mean": -3.694246397678569e-08, "advantage_min": -1.0837796963751316, "advantage_std": 0.9727813564240932, "completion_length": 2767.2500610351562, "epoch": 0.4217142857142857, "grad_norm": 0.8232502341270447, "kl": 0.27996826171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.7543467624442956e-07, "loss": 0.0316, "reward": 0.6230164896696806, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6230164896696806, "reward_after_std": 0.9727814011275768, "reward_before_mean": 0.8919380642473698, "reward_before_std": 0.9813907407224178, "reward_change_max": 0.0, "reward_change_mean": -0.2689215983264148, "reward_change_min": -0.5173465516418219, "reward_change_std": 0.1963834147900343, "reward_std": 0.972781416028738, "rewards/cosine_scaled_reward": 0.09180235071107745, "rewards/format_reward": 0.708333345130086, "step": 369 }, { "advantage_max": 0.8397151418030262, "advantage_mean": -2.421438771715856e-08, "advantage_min": -0.6682015657424927, "advantage_std": 0.542332299053669, "completion_length": 3152.1875610351562, "epoch": 0.4228571428571429, "grad_norm": 0.6705177426338196, "kl": 0.392608642578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.729523361034538e-07, "loss": 0.0204, "reward": 0.2560145009192638, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2560145009192638, "reward_after_std": 0.5423323065042496, "reward_before_mean": 0.4798000790178776, "reward_before_std": 0.5350558869540691, "reward_change_max": 0.0003724098205566406, "reward_change_mean": -0.22378559224307537, "reward_change_min": -0.3660661317408085, "reward_change_std": 0.1437303787097335, "reward_std": 0.5423323251307011, "rewards/cosine_scaled_reward": -0.062183307483792305, "rewards/format_reward": 0.604166679084301, "step": 370 }, { "advantage_max": 1.094675436615944, "advantage_mean": -1.4280280680178947e-08, "advantage_min": -0.8158886842429638, "advantage_std": 0.6693508177995682, "completion_length": 2196.7500228881836, "epoch": 0.424, "grad_norm": 0.35201528668403625, "kl": 0.228729248046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.7048349887476037e-07, "loss": 0.0325, "reward": 0.6185754928737879, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6185754928737879, "reward_after_std": 0.6693508177995682, "reward_before_mean": 0.9065728336572647, "reward_before_std": 0.6426181979477406, "reward_change_max": 0.0008121505379676819, "reward_change_mean": -0.28799727838486433, "reward_change_min": -0.4334562234580517, "reward_change_std": 0.17493201605975628, "reward_std": 0.66935084015131, "rewards/cosine_scaled_reward": 0.08870305120944977, "rewards/format_reward": 0.7291666772216558, "step": 371 }, { "advantage_max": 1.6070511639118195, "advantage_mean": -6.2088167940288486e-09, "advantage_min": -0.9656042233109474, "advantage_std": 0.9646463990211487, "completion_length": 3135.9167404174805, "epoch": 0.42514285714285716, "grad_norm": 0.7402023077011108, "kl": 0.3191375732421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.6802828488599294e-07, "loss": 0.0365, "reward": 0.1029303731629625, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1029303731629625, "reward_after_std": 0.9646463990211487, "reward_before_mean": 0.2623461801558733, "reward_before_std": 0.9947348386049271, "reward_change_max": 0.001778140664100647, "reward_change_mean": -0.15941581409424543, "reward_change_min": -0.3602844588458538, "reward_change_std": 0.14846482127904892, "reward_std": 0.9646464139223099, "rewards/cosine_scaled_reward": -0.06674357876181602, "rewards/format_reward": 0.39583334140479565, "step": 372 }, { "advantage_max": 1.1065423339605331, "advantage_mean": -3.880510510145818e-09, "advantage_min": -0.7083667330443859, "advantage_std": 0.6691116765141487, "completion_length": 2094.5208892822266, "epoch": 0.42628571428571427, "grad_norm": 0.39025935530662537, "kl": 0.215972900390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.655868138008171e-07, "loss": 0.0345, "reward": 0.2956326426938176, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2956326426938176, "reward_after_std": 0.669111680239439, "reward_before_mean": 0.5150279207155108, "reward_before_std": 0.6565750613808632, "reward_change_max": 0.00019315630197525024, "reward_change_mean": -0.21939527429640293, "reward_change_min": -0.41884781047701836, "reward_change_std": 0.1480354629456997, "reward_std": 0.6691116839647293, "rewards/cosine_scaled_reward": -0.11748604848980904, "rewards/format_reward": 0.7500000093132257, "step": 373 }, { "advantage_max": 1.355882316827774, "advantage_mean": -2.545615118698663e-08, "advantage_min": -0.9096671640872955, "advantage_std": 0.8219250440597534, "completion_length": 2520.000068664551, "epoch": 0.42742857142857144, "grad_norm": 0.9724271297454834, "kl": 0.2716064453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.631592046130896e-07, "loss": 0.0439, "reward": 0.523963525891304, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.523963525891304, "reward_after_std": 0.8219250589609146, "reward_before_mean": 0.7809175569564104, "reward_before_std": 0.8074044659733772, "reward_change_max": 0.0, "reward_change_mean": -0.25695404689759016, "reward_change_min": -0.4361608326435089, "reward_change_std": 0.17306603863835335, "reward_std": 0.8219250701367855, "rewards/cosine_scaled_reward": 0.025875442661345005, "rewards/format_reward": 0.7291666828095913, "step": 374 }, { "advantage_max": 1.5019509121775627, "advantage_mean": -2.359350548264416e-08, "advantage_min": -0.8241128325462341, "advantage_std": 0.9086012430489063, "completion_length": 2853.666702270508, "epoch": 0.42857142857142855, "grad_norm": 1.2894920110702515, "kl": 0.330108642578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.6074557564105724e-07, "loss": 0.0893, "reward": 0.41232064459472895, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.41232064459472895, "reward_after_std": 0.9086012318730354, "reward_before_mean": 0.6403776034712791, "reward_before_std": 0.9098538756370544, "reward_change_max": 0.0005364418029785156, "reward_change_mean": -0.22805696167051792, "reward_change_min": -0.4626391585916281, "reward_change_std": 0.1787551799789071, "reward_std": 0.908601239323616, "rewards/cosine_scaled_reward": 0.04935545567423105, "rewards/format_reward": 0.5416666716337204, "step": 375 }, { "advantage_max": 1.329629346728325, "advantage_mean": -4.3461723664428575e-09, "advantage_min": -0.8950409665703773, "advantage_std": 0.848419301211834, "completion_length": 2420.8750534057617, "epoch": 0.4297142857142857, "grad_norm": 0.42023032903671265, "kl": 0.22918701171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.583460445215911e-07, "loss": 0.0116, "reward": 0.4429361894726753, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4429361894726753, "reward_after_std": 0.8484192863106728, "reward_before_mean": 0.6843834742903709, "reward_before_std": 0.8643893152475357, "reward_change_max": 0.00440620630979538, "reward_change_mean": -0.24144727364182472, "reward_change_min": -0.4647165182977915, "reward_change_std": 0.17763947695493698, "reward_std": 0.8484193198382854, "rewards/cosine_scaled_reward": -0.032808270887471735, "rewards/format_reward": 0.7500000074505806, "step": 376 }, { "advantage_max": 0.8876976631581783, "advantage_mean": 3.414849514271623e-09, "advantage_min": -0.8107789494097233, "advantage_std": 0.5993833858519793, "completion_length": 3353.416717529297, "epoch": 0.4308571428571429, "grad_norm": 0.6079760789871216, "kl": 0.392578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.5596072820445254e-07, "loss": 0.0241, "reward": 0.3293274771422148, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3293274771422148, "reward_after_std": 0.5993833709508181, "reward_before_mean": 0.5663622412830591, "reward_before_std": 0.6035155560821295, "reward_change_max": 0.0004423782229423523, "reward_change_mean": -0.2370347515679896, "reward_change_min": -0.40321964025497437, "reward_change_std": 0.159330602735281, "reward_std": 0.5993833728134632, "rewards/cosine_scaled_reward": -0.02931888774037361, "rewards/format_reward": 0.6250000149011612, "step": 377 }, { "advantage_max": 1.2635245844721794, "advantage_mean": -1.5522042262627878e-09, "advantage_min": -0.8523005768656731, "advantage_std": 0.7961474023759365, "completion_length": 2668.0833854675293, "epoch": 0.432, "grad_norm": 0.4990345239639282, "kl": 0.2908935546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.5358974294659373e-07, "loss": 0.0265, "reward": 0.7644887082278728, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7644887082278728, "reward_after_std": 0.796147421002388, "reward_before_mean": 1.0759530365467072, "reward_before_std": 0.7701874431222677, "reward_change_max": 0.00038740038871765137, "reward_change_mean": -0.31146427243947983, "reward_change_min": -0.5050882399082184, "reward_change_std": 0.20078438869677484, "reward_std": 0.796147421002388, "rewards/cosine_scaled_reward": 0.11089315311983228, "rewards/format_reward": 0.8541666697710752, "step": 378 }, { "advantage_max": 1.3585326373577118, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.8122894652187824, "advantage_std": 0.7893616370856762, "completion_length": 3017.1875610351562, "epoch": 0.43314285714285716, "grad_norm": 0.6748170256614685, "kl": 0.405914306640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.512332043064913e-07, "loss": 0.0493, "reward": 0.40940558118745685, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.40940558118745685, "reward_after_std": 0.7893616445362568, "reward_before_mean": 0.6431532986462116, "reward_before_std": 0.764879610389471, "reward_change_max": 0.0004569441080093384, "reward_change_mean": -0.23374769743531942, "reward_change_min": -0.39895620197057724, "reward_change_std": 0.1561375930905342, "reward_std": 0.7893616519868374, "rewards/cosine_scaled_reward": -0.07425669557414949, "rewards/format_reward": 0.7916666753590107, "step": 379 }, { "advantage_max": 1.158207267522812, "advantage_mean": 9.934107758624577e-09, "advantage_min": -0.71046581864357, "advantage_std": 0.7042320594191551, "completion_length": 2508.979202270508, "epoch": 0.4342857142857143, "grad_norm": 0.6089492440223694, "kl": 0.3083343505859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.488912271385139e-07, "loss": 0.0129, "reward": 0.4616965427994728, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4616965427994728, "reward_after_std": 0.7042320668697357, "reward_before_mean": 0.7148651704192162, "reward_before_std": 0.6771434843540192, "reward_change_max": 0.0, "reward_change_mean": -0.2531685931608081, "reward_change_min": -0.419018542394042, "reward_change_std": 0.16180676454678178, "reward_std": 0.7042320743203163, "rewards/cosine_scaled_reward": 0.03451590985059738, "rewards/format_reward": 0.6458333432674408, "step": 380 }, { "advantage_max": 1.24627785384655, "advantage_mean": -9.623666530345076e-09, "advantage_min": -0.8591967336833477, "advantage_std": 0.7353556118905544, "completion_length": 2889.3125762939453, "epoch": 0.43542857142857144, "grad_norm": 0.4671679139137268, "kl": 0.4462890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.465639255873246e-07, "loss": 0.0428, "reward": 0.3101831339299679, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3101831339299679, "reward_after_std": 0.7353555932641029, "reward_before_mean": 0.5261548971757293, "reward_before_std": 0.723621055483818, "reward_change_max": 0.0009457617998123169, "reward_change_mean": -0.21597177907824516, "reward_change_min": -0.38014696165919304, "reward_change_std": 0.1475808946415782, "reward_std": 0.7353556044399738, "rewards/cosine_scaled_reward": -0.07025589048862457, "rewards/format_reward": 0.6666666809469461, "step": 381 }, { "advantage_max": 0.9929032921791077, "advantage_mean": -5.587935669737476e-09, "advantage_min": -0.7307357527315617, "advantage_std": 0.6182068735361099, "completion_length": 2704.3959197998047, "epoch": 0.43657142857142855, "grad_norm": 0.5253696441650391, "kl": 0.3489990234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.4425141308231765e-07, "loss": 0.0262, "reward": 0.20311401411890984, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20311401411890984, "reward_after_std": 0.6182068809866905, "reward_before_mean": 0.40894183272030205, "reward_before_std": 0.6147168017923832, "reward_change_max": 0.0009988918900489807, "reward_change_mean": -0.20582780428230762, "reward_change_min": -0.3665854223072529, "reward_change_std": 0.14242740999907255, "reward_std": 0.6182069033384323, "rewards/cosine_scaled_reward": -0.2017790980899008, "rewards/format_reward": 0.8125000186264515, "step": 382 }, { "advantage_max": 1.1712853163480759, "advantage_mean": -1.055498932700516e-08, "advantage_min": -0.8046070709824562, "advantage_std": 0.7288852371275425, "completion_length": 2977.6458892822266, "epoch": 0.4377142857142857, "grad_norm": 0.5589643120765686, "kl": 0.4100341796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.4195380233209006e-07, "loss": 0.0259, "reward": 0.5600205603986979, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5600205603986979, "reward_after_std": 0.7288852445781231, "reward_before_mean": 0.8326513965730555, "reward_before_std": 0.7167224958539009, "reward_change_max": 0.00034793466329574585, "reward_change_mean": -0.2726308386772871, "reward_change_min": -0.45168603025376797, "reward_change_std": 0.17179731372743845, "reward_std": 0.7288852892816067, "rewards/cosine_scaled_reward": 0.07257568091154099, "rewards/format_reward": 0.6875000204890966, "step": 383 }, { "advantage_max": 1.5155923664569855, "advantage_mean": -9.93410786964688e-09, "advantage_min": -1.0785157233476639, "advantage_std": 0.9407955706119537, "completion_length": 2322.4375610351562, "epoch": 0.43885714285714283, "grad_norm": 0.41426658630371094, "kl": 0.25994110107421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.3967120531894857e-07, "loss": 0.0103, "reward": 0.9666200242936611, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9666200242936611, "reward_after_std": 0.9407955855131149, "reward_before_mean": 1.3100209832191467, "reward_before_std": 0.9235934391617775, "reward_change_max": 0.00021236389875411987, "reward_change_mean": -0.3434009160846472, "reward_change_min": -0.5730981752276421, "reward_change_std": 0.21822507679462433, "reward_std": 0.9407955929636955, "rewards/cosine_scaled_reward": 0.2383438115939498, "rewards/format_reward": 0.8333333432674408, "step": 384 }, { "advantage_max": 1.7108799070119858, "advantage_mean": 1.2417630812500136e-09, "advantage_min": -0.9881407953798771, "advantage_std": 0.9953844398260117, "completion_length": 2913.729248046875, "epoch": 0.44, "grad_norm": 1.0860507488250732, "kl": 0.315673828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.374037332934512e-07, "loss": 0.0632, "reward": 0.5514774001203477, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5514774001203477, "reward_after_std": 0.9953844100236893, "reward_before_mean": 0.7995614763349295, "reward_before_std": 0.9814546965062618, "reward_change_max": 0.0, "reward_change_mean": -0.24808406829833984, "reward_change_min": -0.4732775613665581, "reward_change_std": 0.1805225033313036, "reward_std": 0.9953844398260117, "rewards/cosine_scaled_reward": 0.01436406234279275, "rewards/format_reward": 0.7708333507180214, "step": 385 }, { "advantage_max": 1.320080429315567, "advantage_mean": -2.3593505593666464e-08, "advantage_min": -1.19786436855793, "advantage_std": 0.9015576131641865, "completion_length": 3022.6459197998047, "epoch": 0.44114285714285717, "grad_norm": 0.7457150816917419, "kl": 0.40362548828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.3515149676898552e-07, "loss": 0.0534, "reward": 0.6899063736200333, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6899063736200333, "reward_after_std": 0.9015576355159283, "reward_before_mean": 0.9832256697118282, "reward_before_std": 0.9319749809801579, "reward_change_max": 0.0, "reward_change_mean": -0.29331934079527855, "reward_change_min": -0.5181048065423965, "reward_change_std": 0.21265010628849268, "reward_std": 0.9015576578676701, "rewards/cosine_scaled_reward": 0.11661284882575274, "rewards/format_reward": 0.7500000260770321, "step": 386 }, { "advantage_max": 1.1501350328326225, "advantage_mean": -2.4835267176115394e-09, "advantage_min": -0.8711995333433151, "advantage_std": 0.7353846915066242, "completion_length": 3086.2083892822266, "epoch": 0.4422857142857143, "grad_norm": 1.3488353490829468, "kl": 0.5625762939453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.3291460551638237e-07, "loss": 0.0237, "reward": 0.3419258650392294, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3419258650392294, "reward_after_std": 0.7353846989572048, "reward_before_mean": 0.5701103042811155, "reward_before_std": 0.7389318607747555, "reward_change_max": 0.00011692941188812256, "reward_change_mean": -0.228184443898499, "reward_change_min": -0.40408607944846153, "reward_change_std": 0.161610028706491, "reward_std": 0.7353847101330757, "rewards/cosine_scaled_reward": -0.06911152263637632, "rewards/format_reward": 0.7083333563059568, "step": 387 }, { "advantage_max": 1.2314194291830063, "advantage_mean": -8.071462553882469e-09, "advantage_min": -0.800100838765502, "advantage_std": 0.7259378135204315, "completion_length": 2672.9375762939453, "epoch": 0.44342857142857145, "grad_norm": 0.9292289614677429, "kl": 0.35479736328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.306931685585657e-07, "loss": 0.0024, "reward": 0.5226619439199567, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5226619439199567, "reward_after_std": 0.7259378060698509, "reward_before_mean": 0.7850506128743291, "reward_before_std": 0.689913846552372, "reward_change_max": 0.0008951425552368164, "reward_change_mean": -0.26238864846527576, "reward_change_min": -0.4309215322136879, "reward_change_std": 0.16461755614727736, "reward_std": 0.7259378209710121, "rewards/cosine_scaled_reward": -0.013724721502512693, "rewards/format_reward": 0.8125000074505806, "step": 388 }, { "advantage_max": 1.5206672251224518, "advantage_mean": -2.8250119687989184e-08, "advantage_min": -0.9303666837513447, "advantage_std": 0.9098110198974609, "completion_length": 2759.8125915527344, "epoch": 0.44457142857142856, "grad_norm": 0.7669050097465515, "kl": 0.347869873046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.2848729416523859e-07, "loss": 0.0573, "reward": 0.5471828170120716, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5471828170120716, "reward_after_std": 0.9098110347986221, "reward_before_mean": 0.8020366318523884, "reward_before_std": 0.900772862136364, "reward_change_max": 0.0, "reward_change_mean": -0.2548538148403168, "reward_change_min": -0.4611310213804245, "reward_change_std": 0.17724605649709702, "reward_std": 0.9098110534250736, "rewards/cosine_scaled_reward": 0.005184967070817947, "rewards/format_reward": 0.7916666772216558, "step": 389 }, { "advantage_max": 1.2562728971242905, "advantage_mean": -7.761021686425451e-09, "advantage_min": -1.0596597380936146, "advantage_std": 0.8596843369305134, "completion_length": 2771.2083892822266, "epoch": 0.44571428571428573, "grad_norm": 0.7488387227058411, "kl": 0.4257965087890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.2629708984760706e-07, "loss": 0.0394, "reward": 0.5074011343531311, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5074011343531311, "reward_after_std": 0.8596843518316746, "reward_before_mean": 0.7661386989057064, "reward_before_std": 0.8908602185547352, "reward_change_max": 0.0003530755639076233, "reward_change_mean": -0.25873755663633347, "reward_change_min": -0.5015239603817463, "reward_change_std": 0.19887675996869802, "reward_std": 0.859684381633997, "rewards/cosine_scaled_reward": 0.02890266850590706, "rewards/format_reward": 0.7083333432674408, "step": 390 }, { "advantage_max": 1.5985537618398666, "advantage_mean": -5.5879356475330155e-08, "advantage_min": -1.158255584537983, "advantage_std": 1.037219911813736, "completion_length": 2755.479232788086, "epoch": 0.44685714285714284, "grad_norm": 1.263946294784546, "kl": 0.4556884765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.2412266235313973e-07, "loss": 0.0697, "reward": 0.808660427108407, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.808660427108407, "reward_after_std": 1.0372198969125748, "reward_before_mean": 1.1152591221034527, "reward_before_std": 1.0549925453960896, "reward_change_max": 0.00044002383947372437, "reward_change_mean": -0.3065987452864647, "reward_change_min": -0.6033524796366692, "reward_change_std": 0.22967950999736786, "reward_std": 1.037219911813736, "rewards/cosine_scaled_reward": 0.1826295715291053, "rewards/format_reward": 0.7500000260770321, "step": 391 }, { "advantage_max": 1.1852309368550777, "advantage_mean": 4.6566107914092214e-10, "advantage_min": -0.9774343185126781, "advantage_std": 0.8025986086577177, "completion_length": 2328.5833740234375, "epoch": 0.448, "grad_norm": 0.7590972185134888, "kl": 0.3318328857421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.2196411766036487e-07, "loss": 0.0497, "reward": 0.6314968835795298, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6314968835795298, "reward_after_std": 0.8025985639542341, "reward_before_mean": 0.9186497461050749, "reward_before_std": 0.808460958302021, "reward_change_max": 0.00013077259063720703, "reward_change_mean": -0.2871528696268797, "reward_change_min": -0.4803936704993248, "reward_change_std": 0.19636545609682798, "reward_std": 0.8025985956192017, "rewards/cosine_scaled_reward": 0.011408207938075066, "rewards/format_reward": 0.8958333432674408, "step": 392 }, { "advantage_max": 1.5476094037294388, "advantage_mean": -4.7497453081746244e-08, "advantage_min": -1.207229033112526, "advantage_std": 0.9942546151578426, "completion_length": 2731.5833892822266, "epoch": 0.4491428571428571, "grad_norm": 1.4517852067947388, "kl": 0.36932373046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.1982156097370557e-07, "loss": 0.0669, "reward": 0.8722979612648487, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8722979612648487, "reward_after_std": 0.9942546300590038, "reward_before_mean": 1.1941809430718422, "reward_before_std": 1.005031768232584, "reward_change_max": 0.0008895769715309143, "reward_change_mean": -0.32188298739492893, "reward_change_min": -0.5757318362593651, "reward_change_std": 0.2305017877370119, "reward_std": 0.9942546710371971, "rewards/cosine_scaled_reward": 0.18042378220707178, "rewards/format_reward": 0.8333333544433117, "step": 393 }, { "advantage_max": 1.069191426038742, "advantage_mean": -1.4280279819756103e-08, "advantage_min": -0.7790201306343079, "advantage_std": 0.6604174636304379, "completion_length": 3172.5209045410156, "epoch": 0.4502857142857143, "grad_norm": 1.4476118087768555, "kl": 0.525146484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.1769509671835223e-07, "loss": -0.0052, "reward": 0.11792131559923291, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11792131559923291, "reward_after_std": 0.6604174561798573, "reward_before_mean": 0.30245483573526144, "reward_before_std": 0.660998422652483, "reward_change_max": 0.00016057491302490234, "reward_change_mean": -0.18453352292999625, "reward_change_min": -0.3241218328475952, "reward_change_std": 0.13130775513127446, "reward_std": 0.6604174748063087, "rewards/cosine_scaled_reward": -0.1716892623808235, "rewards/format_reward": 0.6458333488553762, "step": 394 }, { "advantage_max": 1.4996031299233437, "advantage_mean": 3.1044088966147854e-09, "advantage_min": -1.0069090351462364, "advantage_std": 0.9031396806240082, "completion_length": 2429.6250762939453, "epoch": 0.4514285714285714, "grad_norm": 0.42763546109199524, "kl": 0.346588134765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.1558482853517253e-07, "loss": 0.005, "reward": 0.5063761845231056, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5063761845231056, "reward_after_std": 0.903139665722847, "reward_before_mean": 0.7519978582859039, "reward_before_std": 0.9006646797060966, "reward_change_max": 0.00021789968013763428, "reward_change_mean": -0.24562166072428226, "reward_change_min": -0.45618370547890663, "reward_change_std": 0.17835389450192451, "reward_std": 0.9031396880745888, "rewards/cosine_scaled_reward": 0.02183226216584444, "rewards/format_reward": 0.7083333507180214, "step": 395 }, { "advantage_max": 1.0877387076616287, "advantage_mean": -2.7318795670083773e-08, "advantage_min": -0.7776696421205997, "advantage_std": 0.7121854275465012, "completion_length": 3140.041778564453, "epoch": 0.45257142857142857, "grad_norm": 0.7557488679885864, "kl": 0.46923828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.134908592756607e-07, "loss": 0.0279, "reward": 0.5757434768602252, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5757434768602252, "reward_after_std": 0.7121854238212109, "reward_before_mean": 0.8553885407745838, "reward_before_std": 0.7083482407033443, "reward_change_max": 0.0, "reward_change_mean": -0.27964508533477783, "reward_change_min": -0.5032672435045242, "reward_change_std": 0.18293042574077845, "reward_std": 0.712185438722372, "rewards/cosine_scaled_reward": -0.020222392864525318, "rewards/format_reward": 0.8958333432674408, "step": 396 }, { "advantage_max": 0.980850413441658, "advantage_mean": -1.3038516322172455e-08, "advantage_min": -0.7518921606242657, "advantage_std": 0.6465669199824333, "completion_length": 2648.2708740234375, "epoch": 0.45371428571428574, "grad_norm": 0.6451117992401123, "kl": 0.3325653076171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.1141329099692406e-07, "loss": 0.024, "reward": 0.20205155480653048, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.20205155480653048, "reward_after_std": 0.6465669199824333, "reward_before_mean": 0.4081749673932791, "reward_before_std": 0.6582167930901051, "reward_change_max": 0.0, "reward_change_mean": -0.20612340234220028, "reward_change_min": -0.3895048126578331, "reward_change_std": 0.1514331568032503, "reward_std": 0.6465669311583042, "rewards/cosine_scaled_reward": -0.16049586178269237, "rewards/format_reward": 0.729166679084301, "step": 397 }, { "advantage_max": 1.1350066661834717, "advantage_mean": -2.2351742123838392e-08, "advantage_min": -0.6734345480799675, "advantage_std": 0.6738680861890316, "completion_length": 2693.6458892822266, "epoch": 0.45485714285714285, "grad_norm": 0.3605808913707733, "kl": 0.375091552734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.0935222495670968e-07, "loss": 0.0383, "reward": 0.38482781732454896, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.38482781732454896, "reward_after_std": 0.6738680824637413, "reward_before_mean": 0.6217783205211163, "reward_before_std": 0.6407834403216839, "reward_change_max": 0.0, "reward_change_mean": -0.23695052601397038, "reward_change_min": -0.3980791065841913, "reward_change_std": 0.15346836298704147, "reward_std": 0.6738680936396122, "rewards/cosine_scaled_reward": -0.0641108462586999, "rewards/format_reward": 0.7500000074505806, "step": 398 }, { "advantage_max": 1.4583600088953972, "advantage_mean": -1.241764135961887e-09, "advantage_min": -1.051467526704073, "advantage_std": 0.9354942515492439, "completion_length": 2449.666732788086, "epoch": 0.456, "grad_norm": 1.905930519104004, "kl": 0.334503173828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.0730776160846853e-07, "loss": 0.0844, "reward": 0.8447787600453012, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8447787600453012, "reward_after_std": 0.9354942329227924, "reward_before_mean": 1.1628381051123142, "reward_before_std": 0.9344077445566654, "reward_change_max": 0.0008278712630271912, "reward_change_mean": -0.3180593065917492, "reward_change_min": -0.5746446810662746, "reward_change_std": 0.21871214359998703, "reward_std": 0.9354942627251148, "rewards/cosine_scaled_reward": 0.15433569997549057, "rewards/format_reward": 0.854166679084301, "step": 399 }, { "advantage_max": 1.28252375125885, "advantage_mean": -3.228585043757448e-08, "advantage_min": -0.9487503692507744, "advantage_std": 0.8270534686744213, "completion_length": 2030.4167175292969, "epoch": 0.45714285714285713, "grad_norm": 1.408567190170288, "kl": 0.1586151123046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.0528000059645995e-07, "loss": -0.0417, "reward": 1.0239213574677706, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.0239213574677706, "reward_after_std": 0.8270534761250019, "reward_before_mean": 1.3892260467400774, "reward_before_std": 0.793400077149272, "reward_change_max": 0.0002155900001525879, "reward_change_mean": -0.3653047140687704, "reward_change_min": -0.584600031375885, "reward_change_std": 0.22804878000169992, "reward_std": 0.8270534984767437, "rewards/cosine_scaled_reward": 0.23627969017252326, "rewards/format_reward": 0.9166666716337204, "step": 400 }, { "advantage_max": 1.270394504070282, "advantage_mean": -2.2041302089048642e-08, "advantage_min": -0.953679546713829, "advantage_std": 0.7841240204870701, "completion_length": 2974.6667404174805, "epoch": 0.4582857142857143, "grad_norm": 1.3126285076141357, "kl": 0.47442626953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.032690407508949e-07, "loss": 0.0054, "reward": 0.5722383912652731, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5722383912652731, "reward_after_std": 0.7841240130364895, "reward_before_mean": 0.8430091231130064, "reward_before_std": 0.7782173566520214, "reward_change_max": 0.0002893209457397461, "reward_change_mean": -0.270770744420588, "reward_change_min": -0.43658161722123623, "reward_change_std": 0.18060561735183, "reward_std": 0.7841240465641022, "rewards/cosine_scaled_reward": 0.05692122224718332, "rewards/format_reward": 0.7291666846722364, "step": 401 }, { "advantage_max": 0.9616642370820045, "advantage_mean": 3.104408619059029e-09, "advantage_min": -0.6524477303028107, "advantage_std": 0.5835731439292431, "completion_length": 2443.750045776367, "epoch": 0.4594285714285714, "grad_norm": 0.5080955028533936, "kl": 0.307830810546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 2.0127498008311922e-07, "loss": 0.0466, "reward": 0.4360934061296575, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4360934061296575, "reward_after_std": 0.5835731700062752, "reward_before_mean": 0.6913687556516379, "reward_before_std": 0.5534581728279591, "reward_change_max": 0.0001589655876159668, "reward_change_mean": -0.25527534261345863, "reward_change_min": -0.40523843094706535, "reward_change_std": 0.1547041442245245, "reward_std": 0.5835731737315655, "rewards/cosine_scaled_reward": -0.060565624153241515, "rewards/format_reward": 0.8125000186264515, "step": 402 }, { "advantage_max": 1.4017279893159866, "advantage_mean": -2.110997909809953e-08, "advantage_min": -0.9017267003655434, "advantage_std": 0.8699055500328541, "completion_length": 2193.9583587646484, "epoch": 0.4605714285714286, "grad_norm": 1.0111479759216309, "kl": 0.2220611572265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.9929791578083655e-07, "loss": 0.0451, "reward": 0.4706871140515432, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4706871140515432, "reward_after_std": 0.8699056059122086, "reward_before_mean": 0.7152492292225361, "reward_before_std": 0.8717570528388023, "reward_change_max": 0.0, "reward_change_mean": -0.24456209503114223, "reward_change_min": -0.4225851409137249, "reward_change_std": 0.1727625085040927, "reward_std": 0.8699056394398212, "rewards/cosine_scaled_reward": -0.038208745419979095, "rewards/format_reward": 0.7916666828095913, "step": 403 }, { "advantage_max": 0.9245586022734642, "advantage_mean": 6.208816238917336e-10, "advantage_min": -0.6205957382917404, "advantage_std": 0.564962100237608, "completion_length": 2551.354232788086, "epoch": 0.4617142857142857, "grad_norm": 0.7836054563522339, "kl": 0.34100341796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.9733794420337213e-07, "loss": 0.0088, "reward": 0.4881277927197516, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4881277927197516, "reward_after_std": 0.564962100237608, "reward_before_mean": 0.7562550120055676, "reward_before_std": 0.5240175891667604, "reward_change_max": 0.0, "reward_change_mean": -0.26812720810994506, "reward_change_min": -0.4247481171041727, "reward_change_std": 0.16469123680144548, "reward_std": 0.5649621076881886, "rewards/cosine_scaled_reward": -0.028122495859861374, "rewards/format_reward": 0.8125000111758709, "step": 404 }, { "advantage_max": 1.4521689862012863, "advantage_mean": -7.450580929990736e-09, "advantage_min": -1.2455776780843735, "advantage_std": 0.9874396286904812, "completion_length": 2289.291732788086, "epoch": 0.46285714285714286, "grad_norm": 0.527711808681488, "kl": 0.3195953369140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.9539516087697517e-07, "loss": 0.0296, "reward": 0.9149450561963022, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.9149450561963022, "reward_after_std": 0.9874396249651909, "reward_before_mean": 1.249992674216628, "reward_before_std": 1.0209744945168495, "reward_change_max": 0.0, "reward_change_mean": -0.3350475840270519, "reward_change_min": -0.6132257878780365, "reward_change_std": 0.237197645008564, "reward_std": 0.9874396622180939, "rewards/cosine_scaled_reward": 0.1874963054433465, "rewards/format_reward": 0.8750000223517418, "step": 405 }, { "advantage_max": 1.1301475763320923, "advantage_mean": -8.07146210979326e-09, "advantage_min": -1.028851356357336, "advantage_std": 0.7529197167605162, "completion_length": 2268.4376068115234, "epoch": 0.464, "grad_norm": 0.5847371816635132, "kl": 0.24346923828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.934696604901642e-07, "loss": 0.03, "reward": 0.8698830264620483, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8698830264620483, "reward_after_std": 0.7529197204858065, "reward_before_mean": 1.2094858214259148, "reward_before_std": 0.7433715928345919, "reward_change_max": 0.0, "reward_change_mean": -0.3396027758717537, "reward_change_min": -0.542022779583931, "reward_change_std": 0.21592556405812502, "reward_std": 0.7529197297990322, "rewards/cosine_scaled_reward": 0.14640955906361341, "rewards/format_reward": 0.9166666865348816, "step": 406 }, { "advantage_max": 1.1296640411019325, "advantage_mean": -2.731879555906147e-08, "advantage_min": -0.8582129143178463, "advantage_std": 0.6927201226353645, "completion_length": 2421.9167556762695, "epoch": 0.46514285714285714, "grad_norm": 0.5952998995780945, "kl": 0.27155303955078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.915615368891117e-07, "loss": 0.0247, "reward": 0.7083711186423898, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7083711186423898, "reward_after_std": 0.6927201226353645, "reward_before_mean": 1.0135265458375216, "reward_before_std": 0.6552334055304527, "reward_change_max": 0.00042350590229034424, "reward_change_mean": -0.3051554262638092, "reward_change_min": -0.4641095530241728, "reward_change_std": 0.18895516265183687, "reward_std": 0.692720141261816, "rewards/cosine_scaled_reward": 0.07967992406338453, "rewards/format_reward": 0.8541666753590107, "step": 407 }, { "advantage_max": 1.219383381307125, "advantage_mean": -2.7318796780306798e-08, "advantage_min": -0.9996734149754047, "advantage_std": 0.7710001468658447, "completion_length": 2604.6042251586914, "epoch": 0.4662857142857143, "grad_norm": 0.44969770312309265, "kl": 0.300079345703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.8967088307307e-07, "loss": 0.0269, "reward": 0.6504247402772307, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6504247402772307, "reward_after_std": 0.7710001245141029, "reward_before_mean": 0.939513674005866, "reward_before_std": 0.7604172751307487, "reward_change_max": 1.940131187438965e-05, "reward_change_mean": -0.2890889490954578, "reward_change_min": -0.4752741064876318, "reward_change_std": 0.18871614709496498, "reward_std": 0.7710001468658447, "rewards/cosine_scaled_reward": 0.11559016536921263, "rewards/format_reward": 0.7083333563059568, "step": 408 }, { "advantage_max": 0.9517353363335133, "advantage_mean": -6.829698806498641e-09, "advantage_min": -0.7010348103940487, "advantage_std": 0.6266225092113018, "completion_length": 3273.9584350585938, "epoch": 0.4674285714285714, "grad_norm": 0.8464940190315247, "kl": 0.445068359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.8779779118983867e-07, "loss": 0.0451, "reward": 0.22011719923466444, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22011719923466444, "reward_after_std": 0.6266225129365921, "reward_before_mean": 0.4306641574949026, "reward_before_std": 0.6301637701690197, "reward_change_max": 0.0, "reward_change_mean": -0.2105469647794962, "reward_change_min": -0.3587564751505852, "reward_change_std": 0.14359756372869015, "reward_std": 0.6266225203871727, "rewards/cosine_scaled_reward": -0.14925126358866692, "rewards/format_reward": 0.729166679084301, "step": 409 }, { "advantage_max": 1.5754838697612286, "advantage_mean": -3.725290298461914e-09, "advantage_min": -0.9354073293507099, "advantage_std": 0.955329168587923, "completion_length": 2791.375045776367, "epoch": 0.4685714285714286, "grad_norm": 0.5504742860794067, "kl": 0.465362548828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.8594235253127372e-07, "loss": 0.0208, "reward": 0.2885292638093233, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2885292638093233, "reward_after_std": 0.955329168587923, "reward_before_mean": 0.48750573897268623, "reward_before_std": 0.9736828692257404, "reward_change_max": 0.00035659223794937134, "reward_change_mean": -0.19897646084427834, "reward_change_min": -0.44611474499106407, "reward_change_std": 0.17359306011348963, "reward_std": 0.9553291834890842, "rewards/cosine_scaled_reward": -0.11041381629183888, "rewards/format_reward": 0.7083333469927311, "step": 410 }, { "advantage_max": 1.084962822496891, "advantage_mean": -1.986821568378261e-08, "advantage_min": -0.7869381867349148, "advantage_std": 0.7009187713265419, "completion_length": 3034.479232788086, "epoch": 0.4697142857142857, "grad_norm": 1.2345826625823975, "kl": 0.451904296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.8410465752883758e-07, "loss": 0.0173, "reward": 0.49905491434037685, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.49905491434037685, "reward_after_std": 0.7009187415242195, "reward_before_mean": 0.7629465311765671, "reward_before_std": 0.6926179826259613, "reward_change_max": 0.0003877207636833191, "reward_change_mean": -0.2638916438445449, "reward_change_min": -0.4652510955929756, "reward_change_std": 0.1784215234220028, "reward_std": 0.7009187825024128, "rewards/cosine_scaled_reward": -0.01436007209122181, "rewards/format_reward": 0.791666679084301, "step": 411 }, { "advantage_max": 1.2609862387180328, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.8207177668809891, "advantage_std": 0.7636414542794228, "completion_length": 2928.5833740234375, "epoch": 0.47085714285714286, "grad_norm": 0.46127453446388245, "kl": 0.316558837890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.822847957491922e-07, "loss": 0.0426, "reward": 0.7003511674702168, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7003511674702168, "reward_after_std": 0.7636414468288422, "reward_before_mean": 0.9990373337641358, "reward_before_std": 0.7347136251628399, "reward_change_max": 0.0, "reward_change_mean": -0.2986861392855644, "reward_change_min": -0.4825573209673166, "reward_change_std": 0.18389271199703217, "reward_std": 0.7636414766311646, "rewards/cosine_scaled_reward": 0.09326865477487445, "rewards/format_reward": 0.8125000149011612, "step": 412 }, { "advantage_max": 0.9380652755498886, "advantage_mean": -1.0554989271494009e-08, "advantage_min": -0.849666889756918, "advantage_std": 0.6361578069627285, "completion_length": 2762.0208892822266, "epoch": 0.472, "grad_norm": 0.41781947016716003, "kl": 0.29278564453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.804828558898332e-07, "loss": 0.0193, "reward": 0.7069132681936026, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7069132681936026, "reward_after_std": 0.6361577846109867, "reward_before_mean": 1.0194363929331303, "reward_before_std": 0.6180921867489815, "reward_change_max": 0.0, "reward_change_mean": -0.3125231470912695, "reward_change_min": -0.49202222749590874, "reward_change_std": 0.19155626371502876, "reward_std": 0.6361577995121479, "rewards/cosine_scaled_reward": 0.06180151551961899, "rewards/format_reward": 0.8958333507180214, "step": 413 }, { "advantage_max": 1.431507222354412, "advantage_mean": -2.483526961860605e-08, "advantage_min": -0.8340145871043205, "advantage_std": 0.8476960770785809, "completion_length": 3287.541717529297, "epoch": 0.47314285714285714, "grad_norm": 0.8487651348114014, "kl": 0.5587158203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7869892577476722e-07, "loss": 0.0372, "reward": 0.4159498354420066, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4159498354420066, "reward_after_std": 0.847696091979742, "reward_before_mean": 0.6477681696414948, "reward_before_std": 0.8374223373830318, "reward_change_max": 0.0, "reward_change_mean": -0.23181835748255253, "reward_change_min": -0.427595317363739, "reward_change_std": 0.16057890839874744, "reward_std": 0.8476961143314838, "rewards/cosine_scaled_reward": -0.09278258943231776, "rewards/format_reward": 0.8333333544433117, "step": 414 }, { "advantage_max": 1.4033267349004745, "advantage_mean": -5.587935614226325e-09, "advantage_min": -1.0382907837629318, "advantage_std": 0.9273032695055008, "completion_length": 3186.166717529297, "epoch": 0.4742857142857143, "grad_norm": 0.774530291557312, "kl": 0.510498046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7693309235023127e-07, "loss": 0.0306, "reward": 0.4007077421993017, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4007077421993017, "reward_after_std": 0.9273032620549202, "reward_before_mean": 0.6305460706353188, "reward_before_std": 0.9649643488228321, "reward_change_max": 0.0004077330231666565, "reward_change_mean": -0.2298383079469204, "reward_change_min": -0.5088780000805855, "reward_change_std": 0.19439934007823467, "reward_std": 0.9273032918572426, "rewards/cosine_scaled_reward": -0.07014364935457706, "rewards/format_reward": 0.7708333507180214, "step": 415 }, { "advantage_max": 1.4003809839487076, "advantage_mean": -2.483527050678447e-09, "advantage_min": -0.7266187705099583, "advantage_std": 0.7599498555064201, "completion_length": 2385.770866394043, "epoch": 0.4754285714285714, "grad_norm": 0.56381756067276, "kl": 0.229095458984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7518544168045524e-07, "loss": 0.0382, "reward": 0.3456824137829244, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3456824137829244, "reward_after_std": 0.7599498480558395, "reward_before_mean": 0.5633416920900345, "reward_before_std": 0.7165677137672901, "reward_change_max": 0.0004272535443305969, "reward_change_mean": -0.2176592703908682, "reward_change_min": -0.36236331425607204, "reward_change_std": 0.13551894575357437, "reward_std": 0.7599498555064201, "rewards/cosine_scaled_reward": -0.08291248977184296, "rewards/format_reward": 0.729166692122817, "step": 416 }, { "advantage_max": 1.3557152077555656, "advantage_mean": -1.396983892454351e-08, "advantage_min": -0.7811924479901791, "advantage_std": 0.7849394269287586, "completion_length": 3251.354278564453, "epoch": 0.4765714285714286, "grad_norm": 0.7525548338890076, "kl": 0.4139404296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7345605894346726e-07, "loss": 0.0176, "reward": 0.17850057780742645, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17850057780742645, "reward_after_std": 0.7849394232034683, "reward_before_mean": 0.36427378561347723, "reward_before_std": 0.7763594165444374, "reward_change_max": 0.0003609210252761841, "reward_change_mean": -0.1857731956988573, "reward_change_min": -0.35806503891944885, "reward_change_std": 0.13479905761778355, "reward_std": 0.7849394604563713, "rewards/cosine_scaled_reward": -0.15119645558297634, "rewards/format_reward": 0.666666679084301, "step": 417 }, { "advantage_max": 1.3753202855587006, "advantage_mean": 8.692344399818808e-09, "advantage_min": -0.8541840016841888, "advantage_std": 0.809790126979351, "completion_length": 2461.3333740234375, "epoch": 0.4777142857142857, "grad_norm": 0.44116100668907166, "kl": 0.2738494873046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7174502842694212e-07, "loss": 0.0078, "reward": 0.6387815941125154, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6387815941125154, "reward_after_std": 0.8097901195287704, "reward_before_mean": 0.9196815155446529, "reward_before_std": 0.7784200459718704, "reward_change_max": 6.921589374542236e-05, "reward_change_mean": -0.28089988976716995, "reward_change_min": -0.4720543175935745, "reward_change_std": 0.18527762964367867, "reward_std": 0.8097901344299316, "rewards/cosine_scaled_reward": 0.09525741077959538, "rewards/format_reward": 0.7291666865348816, "step": 418 }, { "advantage_max": 1.0813214629888535, "advantage_mean": -2.1730860944035868e-08, "advantage_min": -1.1773153692483902, "advantage_std": 0.789653729647398, "completion_length": 2982.729248046875, "epoch": 0.47885714285714287, "grad_norm": 0.7832729816436768, "kl": 0.28326416015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.7005243352409333e-07, "loss": 0.0463, "reward": 0.7863744031637907, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7863744031637907, "reward_after_std": 0.7896537445485592, "reward_before_mean": 1.1106031499803066, "reward_before_std": 0.8159009516239166, "reward_change_max": 0.0, "reward_change_mean": -0.32422874215990305, "reward_change_min": -0.5305093750357628, "reward_change_std": 0.21637902781367302, "reward_std": 0.7896537445485592, "rewards/cosine_scaled_reward": 0.13863489031791687, "rewards/format_reward": 0.8333333507180214, "step": 419 }, { "advantage_max": 0.9137719795107841, "advantage_mean": -1.8936892831611374e-08, "advantage_min": -0.7141037918627262, "advantage_std": 0.6005804464221001, "completion_length": 2654.8125534057617, "epoch": 0.48, "grad_norm": 0.5264664888381958, "kl": 0.3333892822265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6837835672960831e-07, "loss": 0.0261, "reward": 0.4663930219539907, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4663930219539907, "reward_after_std": 0.6005804501473904, "reward_before_mean": 0.7298664068803191, "reward_before_std": 0.5901828892529011, "reward_change_max": 0.00042045116424560547, "reward_change_mean": -0.2634733971208334, "reward_change_min": -0.42677972093224525, "reward_change_std": 0.16869883053004742, "reward_std": 0.6005804762244225, "rewards/cosine_scaled_reward": -0.020483465865254402, "rewards/format_reward": 0.7708333507180214, "step": 420 }, { "advantage_max": 1.0851642340421677, "advantage_mean": 4.967053990334591e-09, "advantage_min": -0.7848291248083115, "advantage_std": 0.697137389332056, "completion_length": 3275.1875610351562, "epoch": 0.48114285714285715, "grad_norm": 0.5157479643821716, "kl": 0.427001953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6672287963562852e-07, "loss": 0.0368, "reward": 0.20498970928019844, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20498970928019844, "reward_after_std": 0.697137389332056, "reward_before_mean": 0.40731083042919636, "reward_before_std": 0.7089798636734486, "reward_change_max": 0.0002793148159980774, "reward_change_mean": -0.20232110423967242, "reward_change_min": -0.3960839547216892, "reward_change_std": 0.15158123476430774, "reward_std": 0.6971374042332172, "rewards/cosine_scaled_reward": -0.10884459130465984, "rewards/format_reward": 0.6250000093132257, "step": 421 }, { "advantage_max": 1.2480263337492943, "advantage_mean": -2.110997915361068e-08, "advantage_min": -0.8096714615821838, "advantage_std": 0.7595347613096237, "completion_length": 3047.0209045410156, "epoch": 0.48228571428571426, "grad_norm": 0.8368233442306519, "kl": 0.3427734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6508608292777203e-07, "loss": 0.057, "reward": 0.29103474225848913, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.29103474225848913, "reward_after_std": 0.7595347538590431, "reward_before_mean": 0.5047388053499162, "reward_before_std": 0.7582575939595699, "reward_change_max": 0.00042382627725601196, "reward_change_mean": -0.21370408684015274, "reward_change_min": -0.3960469029843807, "reward_change_std": 0.15060064289718866, "reward_std": 0.7595347687602043, "rewards/cosine_scaled_reward": -0.12263060174882412, "rewards/format_reward": 0.7500000204890966, "step": 422 }, { "advantage_max": 1.0338144302368164, "advantage_mean": -2.483527017371756e-08, "advantage_min": -0.7764839977025986, "advantage_std": 0.6542897410690784, "completion_length": 2932.916702270508, "epoch": 0.48342857142857143, "grad_norm": 0.838142991065979, "kl": 0.35986328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6346804638120098e-07, "loss": 0.0179, "reward": 0.33770157070830464, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.33770157070830464, "reward_after_std": 0.6542897336184978, "reward_before_mean": 0.5688282120972872, "reward_before_std": 0.6509609073400497, "reward_change_max": 0.00013802200555801392, "reward_change_mean": -0.23112666234374046, "reward_change_min": -0.391840647906065, "reward_change_std": 0.15103136654943228, "reward_std": 0.6542897447943687, "rewards/cosine_scaled_reward": -0.10100256465375423, "rewards/format_reward": 0.7708333488553762, "step": 423 }, { "advantage_max": 1.273235760629177, "advantage_mean": -1.893689272058907e-08, "advantage_min": -0.7567591927945614, "advantage_std": 0.7148295640945435, "completion_length": 3324.2500610351562, "epoch": 0.4845714285714286, "grad_norm": 0.5047457218170166, "kl": 0.418701171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6186884885673413e-07, "loss": 0.0497, "reward": 0.07864878047257662, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07864878047257662, "reward_after_std": 0.714829571545124, "reward_before_mean": 0.246703976765275, "reward_before_std": 0.693241573870182, "reward_change_max": 8.837878704071045e-05, "reward_change_mean": -0.16805520839989185, "reward_change_min": -0.29914162680506706, "reward_change_std": 0.119585988111794, "reward_std": 0.7148295864462852, "rewards/cosine_scaled_reward": -0.22039801999926567, "rewards/format_reward": 0.6875000242143869, "step": 424 }, { "advantage_max": 1.178904764354229, "advantage_mean": -4.5634807377403774e-08, "advantage_min": -0.7673453465104103, "advantage_std": 0.7031940557062626, "completion_length": 2276.20841217041, "epoch": 0.4857142857142857, "grad_norm": 0.6296653151512146, "kl": 0.22310638427734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.6028856829700258e-07, "loss": 0.0073, "reward": 1.2714834255166352, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.2714834255166352, "reward_after_std": 0.7031940557062626, "reward_before_mean": 1.691854658536613, "reward_before_std": 0.6099688820540905, "reward_change_max": 0.0004223063588142395, "reward_change_mean": -0.4203712586313486, "reward_change_min": -0.6066901683807373, "reward_change_std": 0.23638969287276268, "reward_std": 0.7031940631568432, "rewards/cosine_scaled_reward": 0.38759398832917213, "rewards/format_reward": 0.9166666716337204, "step": 425 }, { "advantage_max": 1.0900339856743813, "advantage_mean": -3.2285849271840306e-08, "advantage_min": -0.8043052740395069, "advantage_std": 0.6774260587990284, "completion_length": 2568.187545776367, "epoch": 0.4868571428571429, "grad_norm": 0.3406532108783722, "kl": 0.286834716796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5872728172265146e-07, "loss": 0.0271, "reward": 0.6657272912561893, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6657272912561893, "reward_after_std": 0.6774260550737381, "reward_before_mean": 0.9639444425702095, "reward_before_std": 0.6491693146526814, "reward_change_max": 0.0, "reward_change_mean": -0.2982171569019556, "reward_change_min": -0.47070078551769257, "reward_change_std": 0.1798529252409935, "reward_std": 0.6774260550737381, "rewards/cosine_scaled_reward": 0.04447220079600811, "rewards/format_reward": 0.8750000074505806, "step": 426 }, { "advantage_max": 1.092741310596466, "advantage_mean": 3.1044122827950105e-10, "advantage_min": -0.825873851776123, "advantage_std": 0.7247681021690369, "completion_length": 3211.604217529297, "epoch": 0.488, "grad_norm": 0.6447138786315918, "kl": 0.365966796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5718506522858572e-07, "loss": 0.0226, "reward": 0.31276619993150234, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.31276619993150234, "reward_after_std": 0.7247680947184563, "reward_before_mean": 0.5376413902267814, "reward_before_std": 0.7455146200954914, "reward_change_max": 0.0002492070198059082, "reward_change_mean": -0.2248751912266016, "reward_change_min": -0.4375558625906706, "reward_change_std": 0.1662255534902215, "reward_std": 0.7247681021690369, "rewards/cosine_scaled_reward": -0.043679315596818924, "rewards/format_reward": 0.625000013038516, "step": 427 }, { "advantage_max": 1.3573896884918213, "advantage_mean": 9.313226023710541e-10, "advantage_min": -0.8932981081306934, "advantage_std": 0.8282932192087173, "completion_length": 3172.9375610351562, "epoch": 0.48914285714285716, "grad_norm": 0.7879695296287537, "kl": 0.416748046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5566199398026147e-07, "loss": 0.0331, "reward": 0.24398767901584506, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24398767901584506, "reward_after_std": 0.8282932415604591, "reward_before_mean": 0.4423068680334836, "reward_before_std": 0.8342677690088749, "reward_change_max": 0.0005217865109443665, "reward_change_mean": -0.19831919204443693, "reward_change_min": -0.40706876292824745, "reward_change_std": 0.155321947298944, "reward_std": 0.8282932452857494, "rewards/cosine_scaled_reward": -0.11217990750446916, "rewards/format_reward": 0.6666666772216558, "step": 428 }, { "advantage_max": 1.312331322580576, "advantage_mean": -1.9868215184182247e-08, "advantage_min": -0.701547633856535, "advantage_std": 0.7851758264005184, "completion_length": 2451.5209350585938, "epoch": 0.49028571428571427, "grad_norm": 0.30135679244995117, "kl": 0.186004638671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5415814221002265e-07, "loss": -0.0115, "reward": 0.5081395594170317, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5081395594170317, "reward_after_std": 0.7851758413016796, "reward_before_mean": 0.7640831712633371, "reward_before_std": 0.7578940242528915, "reward_change_max": 0.0, "reward_change_mean": -0.2559436308220029, "reward_change_min": -0.49011536315083504, "reward_change_std": 0.17267096415162086, "reward_std": 0.7851758413016796, "rewards/cosine_scaled_reward": -0.03462509764358401, "rewards/format_reward": 0.8333333395421505, "step": 429 }, { "advantage_max": 1.142540581524372, "advantage_mean": -2.4835268619405326e-08, "advantage_min": -0.6823872663080692, "advantage_std": 0.7037328667938709, "completion_length": 2430.3958892822266, "epoch": 0.49142857142857144, "grad_norm": 0.3068379759788513, "kl": 0.2426605224609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5267358321348285e-07, "loss": 0.0236, "reward": 0.3900938993319869, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3900938993319869, "reward_after_std": 0.703732855618, "reward_before_mean": 0.6290378728881478, "reward_before_std": 0.689750038087368, "reward_change_max": 0.0, "reward_change_mean": -0.23894398473203182, "reward_change_min": -0.4359857030212879, "reward_change_std": 0.16346812900155783, "reward_std": 0.7037328854203224, "rewards/cosine_scaled_reward": -0.03964773938059807, "rewards/format_reward": 0.7083333358168602, "step": 430 }, { "advantage_max": 1.1171996966004372, "advantage_mean": -4.346172144398253e-09, "advantage_min": -0.6750711984932423, "advantage_std": 0.6704608127474785, "completion_length": 2604.0417404174805, "epoch": 0.49257142857142855, "grad_norm": 0.6438190340995789, "kl": 0.2650909423828125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.5120838934595337e-07, "loss": 0.018, "reward": 0.24859926337376237, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24859926337376237, "reward_after_std": 0.6704607866704464, "reward_before_mean": 0.4588760333135724, "reward_before_std": 0.6554815396666527, "reward_change_max": 0.0002969205379486084, "reward_change_mean": -0.2102767825126648, "reward_change_min": -0.3830285705626011, "reward_change_std": 0.1412417721003294, "reward_std": 0.6704607978463173, "rewards/cosine_scaled_reward": -0.1768119866028428, "rewards/format_reward": 0.8125000149011612, "step": 431 }, { "advantage_max": 1.0770330727100372, "advantage_mean": -1.241763458725842e-08, "advantage_min": -0.9631595313549042, "advantage_std": 0.7486599683761597, "completion_length": 3350.979217529297, "epoch": 0.4937142857142857, "grad_norm": 0.7599322199821472, "kl": 0.4327392578125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4976263201891613e-07, "loss": 0.0334, "reward": 0.23587427381426096, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23587427381426096, "reward_after_std": 0.7486599832773209, "reward_before_mean": 0.4453230546787381, "reward_before_std": 0.7853334471583366, "reward_change_max": 0.0010994002223014832, "reward_change_mean": -0.20944878738373518, "reward_change_min": -0.41561521030962467, "reward_change_std": 0.16594223212450743, "reward_std": 0.7486599907279015, "rewards/cosine_scaled_reward": -0.1002551456913352, "rewards/format_reward": 0.6458333563059568, "step": 432 }, { "advantage_max": 1.3684561997652054, "advantage_mean": -2.359350637082258e-08, "advantage_min": -1.0894028283655643, "advantage_std": 0.8980691842734814, "completion_length": 3049.5417098999023, "epoch": 0.4948571428571429, "grad_norm": 0.6187546253204346, "kl": 0.287811279296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.483363816965435e-07, "loss": 0.0242, "reward": 0.4780670255422592, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4780670255422592, "reward_after_std": 0.8980691842734814, "reward_before_mean": 0.7258573453873396, "reward_before_std": 0.9313638806343079, "reward_change_max": 0.0, "reward_change_mean": -0.24779031332582235, "reward_change_min": -0.46179778315126896, "reward_change_std": 0.19418509118258953, "reward_std": 0.8980692103505135, "rewards/cosine_scaled_reward": 0.01917867362499237, "rewards/format_reward": 0.6875000223517418, "step": 433 }, { "advantage_max": 1.037120372056961, "advantage_mean": -4.96705393482344e-09, "advantage_min": -0.7153984606266022, "advantage_std": 0.632737260311842, "completion_length": 3014.25008392334, "epoch": 0.496, "grad_norm": 0.3393830358982086, "kl": 0.258758544921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.469297078922642e-07, "loss": 0.016, "reward": 0.10924087464809418, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10924087464809418, "reward_after_std": 0.6327372752130032, "reward_before_mean": 0.2937947452683147, "reward_before_std": 0.6334190480411053, "reward_change_max": 0.00024069100618362427, "reward_change_mean": -0.1845538755878806, "reward_change_min": -0.33374907448887825, "reward_change_std": 0.12923644855618477, "reward_std": 0.632737297564745, "rewards/cosine_scaled_reward": -0.2072692969813943, "rewards/format_reward": 0.7083333432674408, "step": 434 }, { "advantage_max": 1.0664081498980522, "advantage_mean": -2.514570995870713e-08, "advantage_min": -0.899286687374115, "advantage_std": 0.7049997858703136, "completion_length": 2325.0208740234375, "epoch": 0.49714285714285716, "grad_norm": 0.4277926981449127, "kl": 0.184051513671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4554267916537495e-07, "loss": -0.009, "reward": 0.35434122290462255, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.35434122290462255, "reward_after_std": 0.7049998044967651, "reward_before_mean": 0.5873896693810821, "reward_before_std": 0.7154140211641788, "reward_change_max": 0.0006602779030799866, "reward_change_mean": -0.2330484725534916, "reward_change_min": -0.4049280807375908, "reward_change_std": 0.16564552672207355, "reward_std": 0.7049998566508293, "rewards/cosine_scaled_reward": -0.13338850578293204, "rewards/format_reward": 0.8541666865348816, "step": 435 }, { "advantage_max": 1.2504142820835114, "advantage_mean": -9.468446471316838e-09, "advantage_min": -0.9178133606910706, "advantage_std": 0.7541405111551285, "completion_length": 2425.541732788086, "epoch": 0.4982857142857143, "grad_norm": 0.5247993469238281, "kl": 0.192962646484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4417536311769885e-07, "loss": 0.0075, "reward": 0.8153278874233365, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8153278874233365, "reward_after_std": 0.7541405037045479, "reward_before_mean": 1.1376360831782222, "reward_before_std": 0.7001718543469906, "reward_change_max": 0.0, "reward_change_mean": -0.3223081771284342, "reward_change_min": -0.4753790386021137, "reward_change_std": 0.19175524450838566, "reward_std": 0.7541405223309994, "rewards/cosine_scaled_reward": 0.14173468947410583, "rewards/format_reward": 0.854166679084301, "step": 436 }, { "advantage_max": 0.961220882833004, "advantage_mean": -4.656613122877573e-09, "advantage_min": -0.9853064715862274, "advantage_std": 0.7042024843394756, "completion_length": 3297.3958740234375, "epoch": 0.49942857142857144, "grad_norm": 0.5495759844779968, "kl": 0.293212890625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4282782639029128e-07, "loss": 0.0261, "reward": 0.3216606783680618, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3216606783680618, "reward_after_std": 0.7042024731636047, "reward_before_mean": 0.5539966747164726, "reward_before_std": 0.7438883259892464, "reward_change_max": 0.0005879774689674377, "reward_change_mean": -0.23233598191291094, "reward_change_min": -0.42455574311316013, "reward_change_std": 0.1767239011824131, "reward_std": 0.7042024992406368, "rewards/cosine_scaled_reward": -0.08758500777184963, "rewards/format_reward": 0.7291666902601719, "step": 437 }, { "advantage_max": 1.0681714750826359, "advantage_mean": -1.1486312234687546e-08, "advantage_min": -0.7881572209298611, "advantage_std": 0.6610872447490692, "completion_length": 3082.479232788086, "epoch": 0.5005714285714286, "grad_norm": 0.5330129265785217, "kl": 0.267303466796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4150013466019114e-07, "loss": 0.0312, "reward": 0.42049915064126253, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.42049915064126253, "reward_after_std": 0.6610872447490692, "reward_before_mean": 0.6686933264136314, "reward_before_std": 0.6371040232479572, "reward_change_max": 0.0005615651607513428, "reward_change_mean": -0.24819417297840118, "reward_change_min": -0.4192596562206745, "reward_change_std": 0.1635490721091628, "reward_std": 0.6610872596502304, "rewards/cosine_scaled_reward": -0.05107000935822725, "rewards/format_reward": 0.7708333507180214, "step": 438 }, { "advantage_max": 1.2752152383327484, "advantage_mean": -1.1796753018877837e-08, "advantage_min": -0.7674977108836174, "advantage_std": 0.7463467661291361, "completion_length": 2945.7083892822266, "epoch": 0.5017142857142857, "grad_norm": 0.4564673900604248, "kl": 0.271728515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.4019235263722034e-07, "loss": 0.0193, "reward": 0.24057719483971596, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24057719483971596, "reward_after_std": 0.7463467670604587, "reward_before_mean": 0.4422801323235035, "reward_before_std": 0.7372346660122275, "reward_change_max": 0.00031983107328414917, "reward_change_mean": -0.20170292211696506, "reward_change_min": -0.3605585377663374, "reward_change_std": 0.13972568430472165, "reward_std": 0.746346796862781, "rewards/cosine_scaled_reward": -0.09135995898395777, "rewards/format_reward": 0.6250000204890966, "step": 439 }, { "advantage_max": 0.9175564348697662, "advantage_mean": -1.9868215628271457e-08, "advantage_min": -0.6749806068837643, "advantage_std": 0.5526381395757198, "completion_length": 3036.020896911621, "epoch": 0.5028571428571429, "grad_norm": 0.38430255651474, "kl": 0.29071044921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3890454406082956e-07, "loss": 0.0191, "reward": 0.16476133867399767, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16476133867399767, "reward_after_std": 0.5526381358504295, "reward_before_mean": 0.3659895323216915, "reward_before_std": 0.5327113717794418, "reward_change_max": 0.001020081341266632, "reward_change_mean": -0.20122821908444166, "reward_change_min": -0.33146263659000397, "reward_change_std": 0.13047749502584338, "reward_std": 0.5526381470263004, "rewards/cosine_scaled_reward": -0.13992191292345524, "rewards/format_reward": 0.6458333488553762, "step": 440 }, { "advantage_max": 1.3344752714037895, "advantage_mean": -1.7384688688615313e-08, "advantage_min": -1.0569850206375122, "advantage_std": 0.85391865670681, "completion_length": 2869.020927429199, "epoch": 0.504, "grad_norm": 0.4335938096046448, "kl": 0.250732421875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3763677169699217e-07, "loss": -0.0001, "reward": 0.6424765922129154, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6424765922129154, "reward_after_std": 0.8539186492562294, "reward_before_mean": 0.9254599437117577, "reward_before_std": 0.8589312359690666, "reward_change_max": 0.00018534809350967407, "reward_change_mean": -0.28298336546868086, "reward_change_min": -0.4916497617959976, "reward_change_std": 0.19478337559849024, "reward_std": 0.85391865670681, "rewards/cosine_scaled_reward": 0.077313297893852, "rewards/format_reward": 0.7708333544433117, "step": 441 }, { "advantage_max": 1.0995511934161186, "advantage_mean": -3.197540951460631e-08, "advantage_min": -1.0392018761485815, "advantage_std": 0.7806694209575653, "completion_length": 3179.2084045410156, "epoch": 0.5051428571428571, "grad_norm": 0.8226723074913025, "kl": 0.2691650390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3638909733514452e-07, "loss": 0.039, "reward": 0.6949413022957742, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6949413022957742, "reward_after_std": 0.780669404193759, "reward_before_mean": 0.9994683768600225, "reward_before_std": 0.8013839311897755, "reward_change_max": 0.0, "reward_change_mean": -0.30452708899974823, "reward_change_min": -0.5225802324712276, "reward_change_std": 0.2088159453123808, "reward_std": 0.7806694395840168, "rewards/cosine_scaled_reward": 0.0830675084143877, "rewards/format_reward": 0.8333333507180214, "step": 442 }, { "advantage_max": 1.450992576777935, "advantage_mean": -9.31322596819939e-09, "advantage_min": -0.8881213739514351, "advantage_std": 0.8333137445151806, "completion_length": 3170.4376068115234, "epoch": 0.5062857142857143, "grad_norm": 0.4043674170970917, "kl": 0.3074951171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.351615817851748e-07, "loss": 0.0064, "reward": 0.2343538049608469, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2343538049608469, "reward_after_std": 0.8333137482404709, "reward_before_mean": 0.42692176811397076, "reward_before_std": 0.8194324485957623, "reward_change_max": 0.0, "reward_change_mean": -0.19256796687841415, "reward_change_min": -0.35877078399062157, "reward_change_std": 0.1398706203326583, "reward_std": 0.8333137854933739, "rewards/cosine_scaled_reward": -0.08862246526405215, "rewards/format_reward": 0.6041666846722364, "step": 443 }, { "advantage_max": 1.2264510244131088, "advantage_mean": -4.967053657267684e-09, "advantage_min": -0.9449261762201786, "advantage_std": 0.7524668201804161, "completion_length": 2910.7709197998047, "epoch": 0.5074285714285715, "grad_norm": 0.5402403473854065, "kl": 0.2428741455078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3395428487445914e-07, "loss": 0.0114, "reward": 0.32908502407372, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32908502407372, "reward_after_std": 0.7524668239057064, "reward_before_mean": 0.5517707715625875, "reward_before_std": 0.7516389228403568, "reward_change_max": 0.00013605505228042603, "reward_change_mean": -0.22268578968942165, "reward_change_min": -0.40703381039202213, "reward_change_std": 0.15602609980851412, "reward_std": 0.7524668276309967, "rewards/cosine_scaled_reward": -0.07828126242384315, "rewards/format_reward": 0.7083333507180214, "step": 444 }, { "advantage_max": 1.1209322586655617, "advantage_mean": -1.98682153507157e-08, "advantage_min": -0.8091943934559822, "advantage_std": 0.7477481514215469, "completion_length": 3036.375030517578, "epoch": 0.5085714285714286, "grad_norm": 0.4570198059082031, "kl": 0.273529052734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3276726544494571e-07, "loss": 0.0264, "reward": 0.4328167047351599, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4328167047351599, "reward_after_std": 0.7477481514215469, "reward_before_mean": 0.681640456430614, "reward_before_std": 0.7630671970546246, "reward_change_max": 0.0005666017532348633, "reward_change_mean": -0.24882372468709946, "reward_change_min": -0.46233782172203064, "reward_change_std": 0.17945091798901558, "reward_std": 0.7477481849491596, "rewards/cosine_scaled_reward": -0.055013129487633705, "rewards/format_reward": 0.791666679084301, "step": 445 }, { "advantage_max": 1.6676536723971367, "advantage_mean": -1.4901161971003773e-08, "advantage_min": -0.9640394076704979, "advantage_std": 1.052213542163372, "completion_length": 3098.479217529297, "epoch": 0.5097142857142857, "grad_norm": 1.077810287475586, "kl": 0.27508544921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.316005813502869e-07, "loss": 0.0677, "reward": 0.22505785524845123, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22505785524845123, "reward_after_std": 1.0522135235369205, "reward_before_mean": 0.40654395520687103, "reward_before_std": 1.1059863716363907, "reward_change_max": 0.0009485706686973572, "reward_change_mean": -0.1814861036837101, "reward_change_min": -0.4725600816309452, "reward_change_std": 0.1915188366547227, "reward_std": 1.0522135570645332, "rewards/cosine_scaled_reward": -0.09881136624608189, "rewards/format_reward": 0.6041666753590107, "step": 446 }, { "advantage_max": 1.1408450677990913, "advantage_mean": -1.6763806842678974e-08, "advantage_min": -0.7244512140750885, "advantage_std": 0.6806843280792236, "completion_length": 2420.0208740234375, "epoch": 0.5108571428571429, "grad_norm": 0.40888798236846924, "kl": 0.18408203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.3045428945301953e-07, "loss": 0.0197, "reward": 0.5533003276214004, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5533003276214004, "reward_after_std": 0.6806843131780624, "reward_before_mean": 0.8259909669868648, "reward_before_std": 0.6393276229500771, "reward_change_max": 0.0, "reward_change_mean": -0.2726906146854162, "reward_change_min": -0.4491038806736469, "reward_change_std": 0.17211227118968964, "reward_std": 0.6806843280792236, "rewards/cosine_scaled_reward": -0.0036712009459733963, "rewards/format_reward": 0.8333333358168602, "step": 447 }, { "advantage_max": 1.522441141307354, "advantage_mean": -8.071462553882469e-09, "advantage_min": -0.7598659209907055, "advantage_std": 0.8678238093852997, "completion_length": 2448.229202270508, "epoch": 0.512, "grad_norm": 0.7265898585319519, "kl": 0.20556640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2932844562179352e-07, "loss": 0.0367, "reward": 0.5077628269791603, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5077628269791603, "reward_after_std": 0.8678238093852997, "reward_before_mean": 0.7545495517551899, "reward_before_std": 0.8369379080832005, "reward_change_max": 0.0, "reward_change_mean": -0.24678673688322306, "reward_change_min": -0.461018992587924, "reward_change_std": 0.16486407443881035, "reward_std": 0.8678238242864609, "rewards/cosine_scaled_reward": 0.012691439012996852, "rewards/format_reward": 0.7291666772216558, "step": 448 }, { "advantage_max": 1.2488050758838654, "advantage_mean": -1.8626449826975033e-09, "advantage_min": -0.8169982880353928, "advantage_std": 0.778340682387352, "completion_length": 2462.6875610351562, "epoch": 0.5131428571428571, "grad_norm": 0.489936888217926, "kl": 0.171630859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2822310472864885e-07, "loss": 0.0293, "reward": 0.40650817658752203, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.40650817658752203, "reward_after_std": 0.7783406749367714, "reward_before_mean": 0.6442282367497683, "reward_before_std": 0.7758251614868641, "reward_change_max": 0.000153273344039917, "reward_change_mean": -0.23772006668150425, "reward_change_min": -0.42172000743448734, "reward_change_std": 0.16095779091119766, "reward_std": 0.778340682387352, "rewards/cosine_scaled_reward": -0.03205255372449756, "rewards/format_reward": 0.7083333525806665, "step": 449 }, { "advantage_max": 1.0654637813568115, "advantage_mean": -2.7318795670083773e-08, "advantage_min": -0.7560668438673019, "advantage_std": 0.6400275379419327, "completion_length": 2896.791748046875, "epoch": 0.5142857142857142, "grad_norm": 0.2660713493824005, "kl": 0.193115234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2713832064634125e-07, "loss": 0.0189, "reward": 0.6007658436428756, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6007658436428756, "reward_after_std": 0.6400275342166424, "reward_before_mean": 0.8861885126680136, "reward_before_std": 0.5933735463768244, "reward_change_max": 0.0007397830486297607, "reward_change_mean": -0.2854226781055331, "reward_change_min": -0.4495175629854202, "reward_change_std": 0.1740018017590046, "reward_std": 0.6400275528430939, "rewards/cosine_scaled_reward": 0.0680942494655028, "rewards/format_reward": 0.7500000149011612, "step": 450 }, { "advantage_max": 1.3750276044011116, "advantage_mean": -1.7384688799637615e-08, "advantage_min": -0.7686949074268341, "advantage_std": 0.8036042414605618, "completion_length": 2704.0000534057617, "epoch": 0.5154285714285715, "grad_norm": 0.7474358081817627, "kl": 0.186279296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.260741462457165e-07, "loss": 0.0552, "reward": 0.46400512009859085, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.46400512009859085, "reward_after_std": 0.803604245185852, "reward_before_mean": 0.7083968780934811, "reward_before_std": 0.7699825428426266, "reward_change_max": 0.0001333579421043396, "reward_change_mean": -0.24439177941530943, "reward_change_min": -0.42964933067560196, "reward_change_std": 0.1687707407400012, "reward_std": 0.8036042489111423, "rewards/cosine_scaled_reward": -0.041634893510490656, "rewards/format_reward": 0.791666679084301, "step": 451 }, { "advantage_max": 1.3469280526041985, "advantage_mean": -6.829698862009792e-09, "advantage_min": -0.878162145614624, "advantage_std": 0.8086332157254219, "completion_length": 3256.3750610351562, "epoch": 0.5165714285714286, "grad_norm": 0.8168085813522339, "kl": 0.2757568359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2503063339313356e-07, "loss": 0.035, "reward": 0.4759515901096165, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4759515901096165, "reward_after_std": 0.8086332082748413, "reward_before_mean": 0.7243448127992451, "reward_before_std": 0.7942167557775974, "reward_change_max": 0.0, "reward_change_mean": -0.24839316215366125, "reward_change_min": -0.41863161139190197, "reward_change_std": 0.16462755482643843, "reward_std": 0.8086332082748413, "rewards/cosine_scaled_reward": 0.07050571404397488, "rewards/format_reward": 0.5833333432674408, "step": 452 }, { "advantage_max": 1.1192011684179306, "advantage_mean": -2.1109978542988017e-08, "advantage_min": -0.9729462340474129, "advantage_std": 0.7825345098972321, "completion_length": 2952.9167709350586, "epoch": 0.5177142857142857, "grad_norm": 0.34308767318725586, "kl": 0.300079345703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2400783294793668e-07, "loss": 0.0178, "reward": 0.4612226258032024, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4612226258032024, "reward_after_std": 0.7825345173478127, "reward_before_mean": 0.7155648600310087, "reward_before_std": 0.8131705485284328, "reward_change_max": 0.0006426721811294556, "reward_change_mean": -0.2543422821909189, "reward_change_min": -0.47778818756341934, "reward_change_std": 0.18769241124391556, "reward_std": 0.7825345546007156, "rewards/cosine_scaled_reward": 0.03486575931310654, "rewards/format_reward": 0.6458333507180214, "step": 453 }, { "advantage_max": 1.0717548094689846, "advantage_mean": -2.1109979653211042e-08, "advantage_min": -0.7446033619344234, "advantage_std": 0.6832532957196236, "completion_length": 2770.1250610351562, "epoch": 0.5188571428571429, "grad_norm": 0.8165076971054077, "kl": 0.204010009765625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2300579475997657e-07, "loss": 0.0492, "reward": 0.25120340567082167, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.25120340567082167, "reward_after_std": 0.6832533031702042, "reward_before_mean": 0.46469184942543507, "reward_before_std": 0.6845399513840675, "reward_change_max": 0.0, "reward_change_mean": -0.2134884474799037, "reward_change_min": -0.3753939662128687, "reward_change_std": 0.15470506064593792, "reward_std": 0.6832533329725266, "rewards/cosine_scaled_reward": -0.10098742507398129, "rewards/format_reward": 0.6666666809469461, "step": 454 }, { "advantage_max": 1.2879580929875374, "advantage_mean": -1.6763807175745882e-08, "advantage_min": -0.9200953394174576, "advantage_std": 0.8330187946557999, "completion_length": 3133.6458892822266, "epoch": 0.52, "grad_norm": 0.47571733593940735, "kl": 0.3657989501953125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.220245676671809e-07, "loss": 0.0329, "reward": 0.30473056621849537, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.30473056621849537, "reward_after_std": 0.8330188021063805, "reward_before_mean": 0.51984196389094, "reward_before_std": 0.8574087470769882, "reward_change_max": 0.00045350193977355957, "reward_change_mean": -0.21511139906942844, "reward_change_min": -0.4619736932218075, "reward_change_std": 0.17379204276949167, "reward_std": 0.8330188542604446, "rewards/cosine_scaled_reward": -0.08382903970777988, "rewards/format_reward": 0.687500013038516, "step": 455 }, { "advantage_max": 1.1999251171946526, "advantage_mean": -1.6142924830209182e-08, "advantage_min": -0.8450670167803764, "advantage_std": 0.765825416892767, "completion_length": 3175.812530517578, "epoch": 0.5211428571428571, "grad_norm": 0.5870717167854309, "kl": 0.28533935546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2106419949317388e-07, "loss": 0.0282, "reward": 0.2780027943663299, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2780027943663299, "reward_after_std": 0.7658254392445087, "reward_before_mean": 0.490889391861856, "reward_before_std": 0.7788579016923904, "reward_change_max": 0.00036644935607910156, "reward_change_mean": -0.21288659516721964, "reward_change_min": -0.4008727613836527, "reward_change_std": 0.15778352785855532, "reward_std": 0.7658254392445087, "rewards/cosine_scaled_reward": -0.07747197709977627, "rewards/format_reward": 0.6458333432674408, "step": 456 }, { "advantage_max": 0.9618594944477081, "advantage_mean": -5.5879357807597785e-09, "advantage_min": -0.7956263087689877, "advantage_std": 0.6232722103595734, "completion_length": 3030.8750610351562, "epoch": 0.5222857142857142, "grad_norm": 0.4087555706501007, "kl": 0.23626708984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.2012473704494537e-07, "loss": 0.0218, "reward": 0.30106205493211746, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.30106205493211746, "reward_after_std": 0.6232722103595734, "reward_before_mean": 0.5290708467364311, "reward_before_std": 0.6274543330073357, "reward_change_max": 0.0005281120538711548, "reward_change_mean": -0.2280087862163782, "reward_change_min": -0.3949625361710787, "reward_change_std": 0.15519539266824722, "reward_std": 0.6232722327113152, "rewards/cosine_scaled_reward": -0.07921459339559078, "rewards/format_reward": 0.6875000111758709, "step": 457 }, { "advantage_max": 1.0171835720539093, "advantage_mean": 5.58793583627093e-09, "advantage_min": -0.7879865169525146, "advantage_std": 0.6320391744375229, "completion_length": 2719.0834197998047, "epoch": 0.5234285714285715, "grad_norm": 0.41114887595176697, "kl": 0.26519775390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1920622611056974e-07, "loss": 0.021, "reward": 0.2687252201139927, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2687252201139927, "reward_after_std": 0.6320391818881035, "reward_before_mean": 0.4860637728124857, "reward_before_std": 0.6235708631575108, "reward_change_max": 0.00021369755268096924, "reward_change_mean": -0.21733852848410606, "reward_change_min": -0.3654922638088465, "reward_change_std": 0.1428649527952075, "reward_std": 0.6320391818881035, "rewards/cosine_scaled_reward": -0.131968125118874, "rewards/format_reward": 0.7500000111758709, "step": 458 }, { "advantage_max": 1.333101436495781, "advantage_mean": -2.607703353252333e-08, "advantage_min": -1.0314572639763355, "advantage_std": 0.8565344773232937, "completion_length": 2316.479232788086, "epoch": 0.5245714285714286, "grad_norm": 0.6849309802055359, "kl": 0.2168731689453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1830871145697412e-07, "loss": -0.0094, "reward": 0.7454090863466263, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7454090863466263, "reward_after_std": 0.8565345034003258, "reward_before_mean": 1.0495480801910162, "reward_before_std": 0.8610980287194252, "reward_change_max": 0.0009169206023216248, "reward_change_mean": -0.30413899570703506, "reward_change_min": -0.5338970180600882, "reward_change_std": 0.2082844926044345, "reward_std": 0.8565345108509064, "rewards/cosine_scaled_reward": 0.13935734890401363, "rewards/format_reward": 0.770833358168602, "step": 459 }, { "advantage_max": 1.5643641501665115, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.7691216804087162, "advantage_std": 0.8628663681447506, "completion_length": 3278.166748046875, "epoch": 0.5257142857142857, "grad_norm": 1.6409032344818115, "kl": 0.39013671875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1743223682775649e-07, "loss": 0.1106, "reward": 0.1872671078890562, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1872671078890562, "reward_after_std": 0.8628664053976536, "reward_before_mean": 0.36641599889844656, "reward_before_std": 0.8460225574672222, "reward_change_max": 0.0004563629627227783, "reward_change_mean": -0.1791488779708743, "reward_change_min": -0.32637596502900124, "reward_change_std": 0.12987737637013197, "reward_std": 0.8628664091229439, "rewards/cosine_scaled_reward": -0.16054200963117182, "rewards/format_reward": 0.6875000055879354, "step": 460 }, { "advantage_max": 1.1700698509812355, "advantage_mean": 5.2774945524802774e-09, "advantage_min": -0.9121370539069176, "advantage_std": 0.7560761161148548, "completion_length": 3148.166748046875, "epoch": 0.5268571428571428, "grad_norm": 1.2687162160873413, "kl": 0.32611083984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1657684494105386e-07, "loss": 0.0576, "reward": 0.3975920076481998, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3975920076481998, "reward_after_std": 0.7560761198401451, "reward_before_mean": 0.6368492189794779, "reward_before_std": 0.769760999828577, "reward_change_max": 0.0005073174834251404, "reward_change_mean": -0.23925721272826195, "reward_change_min": -0.40126175433397293, "reward_change_std": 0.164176139049232, "reward_std": 0.7560761347413063, "rewards/cosine_scaled_reward": -0.056575387134216726, "rewards/format_reward": 0.7500000167638063, "step": 461 }, { "advantage_max": 0.853712659329176, "advantage_mean": 8.692344288796505e-09, "advantage_min": -0.6648980677127838, "advantage_std": 0.568975031375885, "completion_length": 3008.166748046875, "epoch": 0.528, "grad_norm": 0.3729988634586334, "kl": 0.2870330810546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1574257748745986e-07, "loss": 0.0243, "reward": -0.0351557557733031, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0351557557733031, "reward_after_std": 0.568975031375885, "reward_before_mean": 0.12730733181524556, "reward_before_std": 0.5896075032651424, "reward_change_max": 0.0, "reward_change_mean": -0.16246309131383896, "reward_change_min": -0.315531600266695, "reward_change_std": 0.12752943392843008, "reward_std": 0.5689750388264656, "rewards/cosine_scaled_reward": -0.29051300324499607, "rewards/format_reward": 0.7083333469927311, "step": 462 }, { "advantage_max": 1.3363328725099564, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.8758548647165298, "advantage_std": 0.8364144042134285, "completion_length": 3004.3334045410156, "epoch": 0.5291428571428571, "grad_norm": 0.5325247049331665, "kl": 0.317108154296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1492947512799328e-07, "loss": 0.0452, "reward": 0.3950059534981847, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3950059534981847, "reward_after_std": 0.8364144042134285, "reward_before_mean": 0.6261612032540143, "reward_before_std": 0.8405908420681953, "reward_change_max": 0.0004988312721252441, "reward_change_mean": -0.2311552306637168, "reward_change_min": -0.45054684579372406, "reward_change_std": 0.1742579499259591, "reward_std": 0.8364144563674927, "rewards/cosine_scaled_reward": -0.06191941723227501, "rewards/format_reward": 0.7500000149011612, "step": 463 }, { "advantage_max": 1.074364185333252, "advantage_mean": -7.450579930790013e-09, "advantage_min": -0.6964662335813046, "advantage_std": 0.6410483885556459, "completion_length": 2285.5625228881836, "epoch": 0.5302857142857142, "grad_norm": 0.2619056701660156, "kl": 0.281524658203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1413757749211602e-07, "loss": 0.0164, "reward": 0.664086430799216, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.664086430799216, "reward_after_std": 0.6410483960062265, "reward_before_mean": 0.9626196213066578, "reward_before_std": 0.5864274688065052, "reward_change_max": 0.0, "reward_change_mean": -0.2985331565141678, "reward_change_min": -0.4657944589853287, "reward_change_std": 0.1751079559326172, "reward_std": 0.6410484276711941, "rewards/cosine_scaled_reward": 0.04380978271365166, "rewards/format_reward": 0.8750000149011612, "step": 464 }, { "advantage_max": 1.1773870512843132, "advantage_mean": -1.676380662063437e-08, "advantage_min": -0.8465304002165794, "advantage_std": 0.7288689315319061, "completion_length": 2720.416732788086, "epoch": 0.5314285714285715, "grad_norm": 0.38809898495674133, "kl": 0.3505859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1336692317580158e-07, "loss": 0.0187, "reward": 0.39983643777668476, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.39983643777668476, "reward_after_std": 0.7288689315319061, "reward_before_mean": 0.6380915604531765, "reward_before_std": 0.7116920426487923, "reward_change_max": 0.00034280866384506226, "reward_change_mean": -0.23825511056929827, "reward_change_min": -0.3835278395563364, "reward_change_std": 0.15616153739392757, "reward_std": 0.7288689538836479, "rewards/cosine_scaled_reward": -0.09762089792639017, "rewards/format_reward": 0.8333333432674408, "step": 465 }, { "advantage_max": 1.3431070744991302, "advantage_mean": 7.14014017355602e-09, "advantage_min": -1.0130164846777916, "advantage_std": 0.8381664492189884, "completion_length": 2935.0209045410156, "epoch": 0.5325714285714286, "grad_norm": 0.48135891556739807, "kl": 0.278076171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1261754973965422e-07, "loss": 0.0159, "reward": 0.7656144197098911, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7656144197098911, "reward_after_std": 0.8381664454936981, "reward_before_mean": 1.074431873857975, "reward_before_std": 0.8233371824026108, "reward_change_max": 0.00016214698553085327, "reward_change_mean": -0.308817395940423, "reward_change_min": -0.5420728102326393, "reward_change_std": 0.20922611840069294, "reward_std": 0.8381664603948593, "rewards/cosine_scaled_reward": 0.16221593227237463, "rewards/format_reward": 0.7500000149011612, "step": 466 }, { "advantage_max": 1.087053433060646, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.6921741813421249, "advantage_std": 0.6509734019637108, "completion_length": 3099.166778564453, "epoch": 0.5337142857142857, "grad_norm": 0.40039074420928955, "kl": 0.3248291015625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1188949370707787e-07, "loss": 0.0126, "reward": 0.43881505231183837, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.43881505231183837, "reward_after_std": 0.650973379611969, "reward_before_mean": 0.690155977383256, "reward_before_std": 0.6194622069597244, "reward_change_max": 0.0, "reward_change_mean": -0.2513408958911896, "reward_change_min": -0.40755581110715866, "reward_change_std": 0.1519416468217969, "reward_std": 0.6509734019637108, "rewards/cosine_scaled_reward": -0.12367204017937183, "rewards/format_reward": 0.9375000149011612, "step": 467 }, { "advantage_max": 1.4045623019337654, "advantage_mean": 6.208817238118058e-09, "advantage_min": -0.9101616032421589, "advantage_std": 0.865707665681839, "completion_length": 3116.4375762939453, "epoch": 0.5348571428571428, "grad_norm": 0.5389218926429749, "kl": 0.390869140625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1118279056249653e-07, "loss": 0.0368, "reward": 0.29290930554270744, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.29290930554270744, "reward_after_std": 0.8657076582312584, "reward_before_mean": 0.5006698397919536, "reward_before_std": 0.8771371580660343, "reward_change_max": 0.0, "reward_change_mean": -0.20776053331792355, "reward_change_min": -0.42347368225455284, "reward_change_std": 0.16502294037491083, "reward_std": 0.8657076731324196, "rewards/cosine_scaled_reward": -0.12466508615761995, "rewards/format_reward": 0.7500000149011612, "step": 468 }, { "advantage_max": 1.245004452764988, "advantage_mean": 1.3038516266661304e-08, "advantage_min": -0.7255946360528469, "advantage_std": 0.7181912809610367, "completion_length": 2757.5625534057617, "epoch": 0.536, "grad_norm": 0.5630185008049011, "kl": 0.398773193359375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.1049747474962444e-07, "loss": 0.0376, "reward": 0.41447276156395674, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.41447276156395674, "reward_after_std": 0.7181912884116173, "reward_before_mean": 0.6538431597873569, "reward_before_std": 0.6894815154373646, "reward_change_max": 0.0008132085204124451, "reward_change_mean": -0.23937037121504545, "reward_change_min": -0.41941640712320805, "reward_change_std": 0.15768367424607277, "reward_std": 0.7181913293898106, "rewards/cosine_scaled_reward": -0.06891175545752048, "rewards/format_reward": 0.7916666716337204, "step": 469 }, { "advantage_max": 1.140313170850277, "advantage_mean": -8.071462553882469e-09, "advantage_min": -0.8693759068846703, "advantage_std": 0.7540086023509502, "completion_length": 3080.8334350585938, "epoch": 0.5371428571428571, "grad_norm": 0.39104798436164856, "kl": 0.3753662109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0983357966978745e-07, "loss": 0.0382, "reward": 0.31692381418542936, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.31692381418542936, "reward_after_std": 0.7540086023509502, "reward_before_mean": 0.5397572233341634, "reward_before_std": 0.7693562731146812, "reward_change_max": 0.00019013136625289917, "reward_change_mean": -0.2228334043174982, "reward_change_min": -0.43365970999002457, "reward_change_std": 0.1685474431142211, "reward_std": 0.7540086284279823, "rewards/cosine_scaled_reward": -0.12595474161207676, "rewards/format_reward": 0.791666679084301, "step": 470 }, { "advantage_max": 1.1521102339029312, "advantage_mean": -1.210719347000122e-08, "advantage_min": -0.8370354846119881, "advantage_std": 0.759576290845871, "completion_length": 3062.7708892822266, "epoch": 0.5382857142857143, "grad_norm": 0.5789968371391296, "kl": 0.3306121826171875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0919113768029517e-07, "loss": 0.0351, "reward": 0.5618683909997344, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5618683909997344, "reward_after_std": 0.759576290845871, "reward_before_mean": 0.8360824510455132, "reward_before_std": 0.7654278874397278, "reward_change_max": 1.179426908493042e-05, "reward_change_mean": -0.27421404607594013, "reward_change_min": -0.4889149107038975, "reward_change_std": 0.1869899621233344, "reward_std": 0.7595762982964516, "rewards/cosine_scaled_reward": -0.009042118676006794, "rewards/format_reward": 0.8541666679084301, "step": 471 }, { "advantage_max": 1.4641920626163483, "advantage_mean": -5.5879357807597785e-09, "advantage_min": -0.7677106484770775, "advantage_std": 0.8432400114834309, "completion_length": 3012.2084045410156, "epoch": 0.5394285714285715, "grad_norm": 0.30752912163734436, "kl": 0.317901611328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0857018009286381e-07, "loss": 0.021, "reward": 0.25966172479093075, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.25966172479093075, "reward_after_std": 0.8432399965822697, "reward_before_mean": 0.45790516724810004, "reward_before_std": 0.830863282084465, "reward_change_max": 0.0, "reward_change_mean": -0.19824345130473375, "reward_change_min": -0.3653850741684437, "reward_change_std": 0.14040481578558683, "reward_std": 0.8432400040328503, "rewards/cosine_scaled_reward": -0.11479742079973221, "rewards/format_reward": 0.6875000149011612, "step": 472 }, { "advantage_max": 1.0781077519059181, "advantage_mean": -4.967054434423801e-09, "advantage_min": -0.683658130466938, "advantage_std": 0.6386011429131031, "completion_length": 3106.125045776367, "epoch": 0.5405714285714286, "grad_norm": 0.4724801778793335, "kl": 0.30877685546875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0797073717209013e-07, "loss": 0.0152, "reward": 0.400164398830384, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.400164398830384, "reward_after_std": 0.6386011317372322, "reward_before_mean": 0.6434677103534341, "reward_before_std": 0.6013318356126547, "reward_change_max": 0.0, "reward_change_mean": -0.24330330546945333, "reward_change_min": -0.39950625970959663, "reward_change_std": 0.15486595127731562, "reward_std": 0.6386011391878128, "rewards/cosine_scaled_reward": -0.03243281506001949, "rewards/format_reward": 0.708333345130086, "step": 473 }, { "advantage_max": 1.0679819360375404, "advantage_mean": -4.656612928588544e-09, "advantage_min": -0.5518227554857731, "advantage_std": 0.6260210648179054, "completion_length": 2542.9583892822266, "epoch": 0.5417142857142857, "grad_norm": 0.6796749830245972, "kl": 0.288970947265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0739283813397639e-07, "loss": 0.0193, "reward": 0.6689651161432266, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6689651161432266, "reward_after_std": 0.6260210573673248, "reward_before_mean": 0.9689240120351315, "reward_before_std": 0.5679577551782131, "reward_change_max": 0.0, "reward_change_mean": -0.29995884373784065, "reward_change_min": -0.4876106455922127, "reward_change_std": 0.17878461815416813, "reward_std": 0.6260210610926151, "rewards/cosine_scaled_reward": 0.05737863853573799, "rewards/format_reward": 0.8541666828095913, "step": 474 }, { "advantage_max": 1.6982565373182297, "advantage_mean": -2.8250119438189003e-08, "advantage_min": -1.121864341199398, "advantage_std": 1.0212149359285831, "completion_length": 2369.062530517578, "epoch": 0.5428571428571428, "grad_norm": 1.186362862586975, "kl": 0.274993896484375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.068365111445064e-07, "loss": 0.0818, "reward": 0.5927941456902772, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5927941456902772, "reward_after_std": 1.021214947104454, "reward_before_mean": 0.850624798797071, "reward_before_std": 1.0256964527070522, "reward_change_max": 0.0003933459520339966, "reward_change_mean": -0.257830667309463, "reward_change_min": -0.4586414434015751, "reward_change_std": 0.1883101798593998, "reward_std": 1.0212149620056152, "rewards/cosine_scaled_reward": 0.050312393344938755, "rewards/format_reward": 0.7500000093132257, "step": 475 }, { "advantage_max": 1.6495760828256607, "advantage_mean": -9.934106648401553e-09, "advantage_min": -1.2757733091711998, "advantage_std": 1.085733950138092, "completion_length": 2699.979248046875, "epoch": 0.544, "grad_norm": 0.9671617746353149, "kl": 0.2899169921875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.063017833182728e-07, "loss": 0.06, "reward": 0.9755550567060709, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9755550567060709, "reward_after_std": 1.085733950138092, "reward_before_mean": 1.3144520642235875, "reward_before_std": 1.1092942878603935, "reward_change_max": 0.0, "reward_change_mean": -0.3388970401138067, "reward_change_min": -0.6296705827116966, "reward_change_std": 0.24273780267685652, "reward_std": 1.0857339650392532, "rewards/cosine_scaled_reward": 0.23014270653948188, "rewards/format_reward": 0.8541666865348816, "step": 476 }, { "advantage_max": 1.1516963690519333, "advantage_mean": -2.5766591804643468e-08, "advantage_min": -0.8459501937031746, "advantage_std": 0.7099061608314514, "completion_length": 2525.791748046875, "epoch": 0.5451428571428572, "grad_norm": 1.1748000383377075, "kl": 0.261932373046875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0578868071715544e-07, "loss": -0.0042, "reward": 0.9292363012209535, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.9292363012209535, "reward_after_std": 0.7099061757326126, "reward_before_mean": 1.2795587107539177, "reward_before_std": 0.6643078401684761, "reward_change_max": 0.00022698938846588135, "reward_change_mean": -0.3503224477171898, "reward_change_min": -0.54884322732687, "reward_change_std": 0.20639916136860847, "reward_std": 0.7099061757326126, "rewards/cosine_scaled_reward": 0.21269604843109846, "rewards/format_reward": 0.8541666716337204, "step": 477 }, { "advantage_max": 1.5752771347761154, "advantage_mean": -6.829698917520943e-09, "advantage_min": -0.9742097333073616, "advantage_std": 0.9652605876326561, "completion_length": 2956.6459197998047, "epoch": 0.5462857142857143, "grad_norm": 0.639392077922821, "kl": 0.34381103515625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0529722834905125e-07, "loss": 0.0329, "reward": 0.2602955009788275, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2602955009788275, "reward_after_std": 0.9652606099843979, "reward_before_mean": 0.45387281477451324, "reward_before_std": 0.9901572093367577, "reward_change_max": 0.0, "reward_change_mean": -0.19357730075716972, "reward_change_min": -0.4107331708073616, "reward_change_std": 0.16745386458933353, "reward_std": 0.9652606546878815, "rewards/cosine_scaled_reward": -0.08556360751390457, "rewards/format_reward": 0.6250000167638063, "step": 478 }, { "advantage_max": 1.1501880139112473, "advantage_mean": -3.104408785592483e-09, "advantage_min": -0.8670934438705444, "advantage_std": 0.7535147033631802, "completion_length": 3088.1875610351562, "epoch": 0.5474285714285714, "grad_norm": 0.5952982306480408, "kl": 0.3709716796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0482745016665526e-07, "loss": 0.0417, "reward": 0.314653005450964, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.314653005450964, "reward_after_std": 0.7535146996378899, "reward_before_mean": 0.5371275693178177, "reward_before_std": 0.7753479313105345, "reward_change_max": 0.0002630949020385742, "reward_change_mean": -0.22247455921024084, "reward_change_min": -0.4422895945608616, "reward_change_std": 0.1705716736614704, "reward_std": 0.7535147480666637, "rewards/cosine_scaled_reward": -0.1168528909329325, "rewards/format_reward": 0.7708333432674408, "step": 479 }, { "advantage_max": 1.4555295780301094, "advantage_mean": -1.3659398501175701e-08, "advantage_min": -0.7888297103345394, "advantage_std": 0.8286092169582844, "completion_length": 2673.125045776367, "epoch": 0.5485714285714286, "grad_norm": 0.5772784352302551, "kl": 0.339385986328125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0437936906629334e-07, "loss": 0.0257, "reward": 0.26276683527976274, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26276683527976274, "reward_after_std": 0.8286092169582844, "reward_before_mean": 0.4622917678207159, "reward_before_std": 0.8054306283593178, "reward_change_max": 3.261864185333252e-05, "reward_change_mean": -0.19952495954930782, "reward_change_min": -0.3689793273806572, "reward_change_std": 0.13875073846429586, "reward_std": 0.8286092355847359, "rewards/cosine_scaled_reward": -0.14385412260890007, "rewards/format_reward": 0.7500000037252903, "step": 480 }, { "advantage_max": 1.034639123827219, "advantage_mean": -1.862645371275562e-09, "advantage_min": -0.8700513690710068, "advantage_std": 0.6838802136480808, "completion_length": 3162.8126220703125, "epoch": 0.5497142857142857, "grad_norm": 0.6593417525291443, "kl": 0.333984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0395300688680625e-07, "loss": 0.0159, "reward": 0.31739925500005484, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.31739925500005484, "reward_after_std": 0.6838802210986614, "reward_before_mean": 0.5451112650334835, "reward_before_std": 0.6971101015806198, "reward_change_max": 0.0006843283772468567, "reward_change_mean": -0.22771200723946095, "reward_change_min": -0.41445737704634666, "reward_change_std": 0.16134102689102292, "reward_std": 0.6838802509009838, "rewards/cosine_scaled_reward": -0.1232777088880539, "rewards/format_reward": 0.7916666846722364, "step": 481 }, { "advantage_max": 1.237500637769699, "advantage_mean": -1.4901161249358807e-08, "advantage_min": -0.820942685008049, "advantage_std": 0.7448217496275902, "completion_length": 2747.041778564453, "epoch": 0.5508571428571428, "grad_norm": 0.35874566435813904, "kl": 0.32952880859375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0354838440848501e-07, "loss": 0.0301, "reward": 0.8256460228003561, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8256460228003561, "reward_after_std": 0.7448217272758484, "reward_before_mean": 1.1510243900120258, "reward_before_std": 0.6903415769338608, "reward_change_max": 0.0003423243761062622, "reward_change_mean": -0.32537833508104086, "reward_change_min": -0.5232283174991608, "reward_change_std": 0.20527946017682552, "reward_std": 0.744821734726429, "rewards/cosine_scaled_reward": 0.21092883963137865, "rewards/format_reward": 0.7291666753590107, "step": 482 }, { "advantage_max": 1.5438192747533321, "advantage_mean": -6.8296991950766994e-09, "advantage_min": -0.9122642446309328, "advantage_std": 0.9228374361991882, "completion_length": 2939.5000762939453, "epoch": 0.552, "grad_norm": 0.7610252499580383, "kl": 0.348785400390625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0316552135205837e-07, "loss": 0.0146, "reward": 0.3301640059798956, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3301640059798956, "reward_after_std": 0.9228374511003494, "reward_before_mean": 0.5393744148313999, "reward_before_std": 0.9276447612792253, "reward_change_max": 0.00022416561841964722, "reward_change_mean": -0.20921041257679462, "reward_change_min": -0.4508453905582428, "reward_change_std": 0.16462393524125218, "reward_std": 0.9228374809026718, "rewards/cosine_scaled_reward": -0.10531279840506613, "rewards/format_reward": 0.750000013038516, "step": 483 }, { "advantage_max": 1.3582065775990486, "advantage_mean": -2.6077032977411818e-08, "advantage_min": -0.9722105860710144, "advantage_std": 0.8755018226802349, "completion_length": 2505.520896911621, "epoch": 0.5531428571428572, "grad_norm": 1.3068631887435913, "kl": 0.227569580078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0280443637773163e-07, "loss": 0.0487, "reward": 0.5590387713164091, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5590387713164091, "reward_after_std": 0.8755018264055252, "reward_before_mean": 0.8242268934845924, "reward_before_std": 0.8963750414550304, "reward_change_max": 0.0, "reward_change_mean": -0.2651881640776992, "reward_change_min": -0.491425771266222, "reward_change_std": 0.19657512661069632, "reward_std": 0.8755018599331379, "rewards/cosine_scaled_reward": 0.057946796994656324, "rewards/format_reward": 0.7083333414047956, "step": 484 }, { "advantage_max": 1.3268622159957886, "advantage_mean": -3.663202274850619e-08, "advantage_min": -0.8655448257923126, "advantage_std": 0.7925088070333004, "completion_length": 2854.250030517578, "epoch": 0.5542857142857143, "grad_norm": 0.7887077927589417, "kl": 0.349334716796875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0246514708427701e-07, "loss": 0.011, "reward": 0.5681506851688027, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5681506851688027, "reward_after_std": 0.7925088219344616, "reward_before_mean": 0.8365112133324146, "reward_before_std": 0.7685690931975842, "reward_change_max": 0.00017368048429489136, "reward_change_mean": -0.26836052909493446, "reward_change_min": -0.4626109004020691, "reward_change_std": 0.16949358582496643, "reward_std": 0.7925088405609131, "rewards/cosine_scaled_reward": 0.0015889182686805725, "rewards/format_reward": 0.8333333507180214, "step": 485 }, { "advantage_max": 0.9405202865600586, "advantage_mean": -2.980232349791834e-08, "advantage_min": -0.7814814485609531, "advantage_std": 0.6054781675338745, "completion_length": 2462.6667404174805, "epoch": 0.5554285714285714, "grad_norm": 0.31038495898246765, "kl": 0.22412109375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0214767000817596e-07, "loss": 0.0166, "reward": 0.5397234465926886, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5397234465926886, "reward_after_std": 0.6054781526327133, "reward_before_mean": 0.8177524618804455, "reward_before_std": 0.5829389169812202, "reward_change_max": 0.0001628771424293518, "reward_change_mean": -0.2780290227383375, "reward_change_min": -0.4302845783531666, "reward_change_std": 0.1706089461222291, "reward_std": 0.6054781638085842, "rewards/cosine_scaled_reward": -0.039040457457304, "rewards/format_reward": 0.895833358168602, "step": 486 }, { "advantage_max": 1.2132899835705757, "advantage_mean": -6.829698862009792e-09, "advantage_min": -0.6911888271570206, "advantage_std": 0.7022703289985657, "completion_length": 2178.5000648498535, "epoch": 0.5565714285714286, "grad_norm": 1.1013511419296265, "kl": 0.2207794189453125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0185202062281336e-07, "loss": -0.0112, "reward": 0.7325315810739994, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7325315810739994, "reward_after_std": 0.7022703513503075, "reward_before_mean": 1.0389863569289446, "reward_before_std": 0.6378826051950455, "reward_change_max": 8.277595043182373e-06, "reward_change_mean": -0.30645475909113884, "reward_change_min": -0.5073573999106884, "reward_change_std": 0.1849022163078189, "reward_std": 0.7022703625261784, "rewards/cosine_scaled_reward": 0.08199317380785942, "rewards/format_reward": 0.8750000149011612, "step": 487 }, { "advantage_max": 0.9857561364769936, "advantage_mean": -2.0489096919096284e-08, "advantage_min": -0.8704906292259693, "advantage_std": 0.6565042473375797, "completion_length": 2526.8333740234375, "epoch": 0.5577142857142857, "grad_norm": 0.5738789439201355, "kl": 0.254730224609375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0157821333772304e-07, "loss": 0.0087, "reward": 0.4394610158633441, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4394610158633441, "reward_after_std": 0.6565042249858379, "reward_before_mean": 0.6953956615179777, "reward_before_std": 0.6586741730570793, "reward_change_max": 0.0, "reward_change_mean": -0.2559346444904804, "reward_change_min": -0.43403490260243416, "reward_change_std": 0.16896594502031803, "reward_std": 0.6565042324364185, "rewards/cosine_scaled_reward": -0.07938552554696798, "rewards/format_reward": 0.854166679084301, "step": 488 }, { "advantage_max": 0.8957756981253624, "advantage_mean": 5.898376537194494e-09, "advantage_min": -0.6481528356671333, "advantage_std": 0.5608323365449905, "completion_length": 3312.937530517578, "epoch": 0.5588571428571428, "grad_norm": 0.6687197089195251, "kl": 0.4052734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.013262614978859e-07, "loss": 0.0275, "reward": -0.08741341950371861, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08741341950371861, "reward_after_std": 0.5608323328197002, "reward_before_mean": 0.06184304691851139, "reward_before_std": 0.5696195401251316, "reward_change_max": 0.0005065873265266418, "reward_change_mean": -0.1492564668878913, "reward_change_min": -0.2845678739249706, "reward_change_std": 0.11316195782274008, "reward_std": 0.5608323402702808, "rewards/cosine_scaled_reward": -0.23991181142628193, "rewards/format_reward": 0.5416666809469461, "step": 489 }, { "advantage_max": 0.9873585477471352, "advantage_mean": -4.3461718668424965e-09, "advantage_min": -0.8867710456252098, "advantage_std": 0.648032084107399, "completion_length": 2244.000068664551, "epoch": 0.56, "grad_norm": 0.8376133441925049, "kl": 0.2108001708984375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0109617738307911e-07, "loss": -0.0159, "reward": 0.3705539219081402, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3705539219081402, "reward_after_std": 0.6480320878326893, "reward_before_mean": 0.6113275191746652, "reward_before_std": 0.6507302448153496, "reward_change_max": 0.0, "reward_change_mean": -0.24077360518276691, "reward_change_min": -0.379101675003767, "reward_change_std": 0.15669575799256563, "reward_std": 0.6480320952832699, "rewards/cosine_scaled_reward": -0.11100290203467011, "rewards/format_reward": 0.8333333507180214, "step": 490 }, { "advantage_max": 1.4200214892625809, "advantage_mean": -1.7384688466570708e-08, "advantage_min": -1.0268218517303467, "advantage_std": 0.9101089909672737, "completion_length": 2840.8959197998047, "epoch": 0.5611428571428572, "grad_norm": 0.7360396981239319, "kl": 0.316070556640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0088797220727779e-07, "loss": 0.0474, "reward": 0.6004394674673676, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6004394674673676, "reward_after_std": 0.9101089760661125, "reward_before_mean": 0.8715587891638279, "reward_before_std": 0.928726814687252, "reward_change_max": 0.000231131911277771, "reward_change_mean": -0.2711193021386862, "reward_change_min": -0.5320481732487679, "reward_change_std": 0.19947497360408306, "reward_std": 0.9101089909672737, "rewards/cosine_scaled_reward": 0.11286270339041948, "rewards/format_reward": 0.6458333507180214, "step": 491 }, { "advantage_max": 1.0486746355891228, "advantage_mean": -1.5522043650406658e-09, "advantage_min": -0.9603680036962032, "advantage_std": 0.7319249548017979, "completion_length": 2709.2917442321777, "epoch": 0.5622857142857143, "grad_norm": 0.36673468351364136, "kl": 0.26788330078125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0070165611810855e-07, "loss": 0.0227, "reward": 0.6172414775937796, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6172414775937796, "reward_after_std": 0.7319249622523785, "reward_before_mean": 0.9067196510732174, "reward_before_std": 0.742775421589613, "reward_change_max": 0.0006285309791564941, "reward_change_mean": -0.2894781604409218, "reward_change_min": -0.48100465163588524, "reward_change_std": 0.19262039940804243, "reward_std": 0.7319249622523785, "rewards/cosine_scaled_reward": 0.015859810635447502, "rewards/format_reward": 0.8750000223517418, "step": 492 }, { "advantage_max": 1.4047833159565926, "advantage_mean": -5.587935281159417e-09, "advantage_min": -1.0531021058559418, "advantage_std": 0.9087579324841499, "completion_length": 2453.854217529297, "epoch": 0.5634285714285714, "grad_norm": 1.173799753189087, "kl": 0.2418060302734375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.005372381963547e-07, "loss": 0.0571, "reward": 0.7099454645067453, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7099454645067453, "reward_after_std": 0.9087579324841499, "reward_before_mean": 1.004451007116586, "reward_before_std": 0.9203404039144516, "reward_change_max": 1.3522803783416748e-05, "reward_change_mean": -0.29450553096830845, "reward_change_min": -0.56828448548913, "reward_change_std": 0.21007542498409748, "reward_std": 0.9087579697370529, "rewards/cosine_scaled_reward": 0.07514216750860214, "rewards/format_reward": 0.854166679084301, "step": 493 }, { "advantage_max": 1.4265383556485176, "advantage_mean": -2.6077032755367213e-08, "advantage_min": -1.0778544396162033, "advantage_std": 0.8749287910759449, "completion_length": 2329.166763305664, "epoch": 0.5645714285714286, "grad_norm": 0.5095478296279907, "kl": 0.189117431640625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0039472645551372e-07, "loss": 0.0123, "reward": 0.7384659582749009, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7384659582749009, "reward_after_std": 0.8749288134276867, "reward_before_mean": 1.0367552265524864, "reward_before_std": 0.8602879270911217, "reward_change_max": 0.0003532916307449341, "reward_change_mean": -0.29828929156064987, "reward_change_min": -0.5286732465028763, "reward_change_std": 0.19926906377077103, "reward_std": 0.8749288581311703, "rewards/cosine_scaled_reward": 0.06004427303560078, "rewards/format_reward": 0.9166666865348816, "step": 494 }, { "advantage_max": 1.248302560299635, "advantage_mean": -4.066775263766864e-08, "advantage_min": -0.8059535510838032, "advantage_std": 0.7659686803817749, "completion_length": 3080.604248046875, "epoch": 0.5657142857142857, "grad_norm": 0.7964968085289001, "kl": 0.31951904296875, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.002741278414069e-07, "loss": 0.0608, "reward": 0.39885510806925595, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.39885510806925595, "reward_after_std": 0.7659686580300331, "reward_before_mean": 0.6349856150336564, "reward_before_std": 0.7587495669722557, "reward_change_max": 0.00038827210664749146, "reward_change_mean": -0.23613055190071464, "reward_change_min": -0.455204539000988, "reward_change_std": 0.1684614084661007, "reward_std": 0.7659686580300331, "rewards/cosine_scaled_reward": -0.026257202960550785, "rewards/format_reward": 0.6875000093132257, "step": 495 }, { "advantage_max": 0.8897208273410797, "advantage_mean": -1.4901161193847656e-08, "advantage_min": -0.6923435255885124, "advantage_std": 0.5650329366326332, "completion_length": 2263.041679382324, "epoch": 0.5668571428571428, "grad_norm": 0.3149852752685547, "kl": 0.2501220703125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0017544823184055e-07, "loss": 0.0311, "reward": 0.4930616207420826, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4930616207420826, "reward_after_std": 0.5650329403579235, "reward_before_mean": 0.7627693666145205, "reward_before_std": 0.533838264644146, "reward_change_max": 0.00027805566787719727, "reward_change_mean": -0.26970773935317993, "reward_change_min": -0.422781178727746, "reward_change_std": 0.16430719010531902, "reward_std": 0.5650329664349556, "rewards/cosine_scaled_reward": 0.02721799910068512, "rewards/format_reward": 0.7083333395421505, "step": 496 }, { "advantage_max": 1.1297822445631027, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -1.0825001783668995, "advantage_std": 0.8197258524596691, "completion_length": 2637.5625610351562, "epoch": 0.568, "grad_norm": 0.8214015960693359, "kl": 0.2474517822265625, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0009869243631952e-07, "loss": 0.0362, "reward": 0.775239892071113, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.775239892071113, "reward_after_std": 0.8197258599102497, "reward_before_mean": 1.0950648039579391, "reward_before_std": 0.8394419066607952, "reward_change_max": 0.0, "reward_change_mean": -0.31982492096722126, "reward_change_min": -0.5376950670033693, "reward_change_std": 0.22064348123967648, "reward_std": 0.8197259083390236, "rewards/cosine_scaled_reward": 0.15169907361268997, "rewards/format_reward": 0.791666679084301, "step": 497 }, { "advantage_max": 1.3400945663452148, "advantage_mean": 1.2417630257388623e-09, "advantage_min": -1.0304431170225143, "advantage_std": 0.8622480668127537, "completion_length": 3059.791778564453, "epoch": 0.5691428571428572, "grad_norm": 0.5599825382232666, "kl": 0.33740234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.000438641958131e-07, "loss": 0.0534, "reward": 0.6572053148411214, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6572053148411214, "reward_after_std": 0.8622480593621731, "reward_before_mean": 0.9432293940335512, "reward_before_std": 0.8665383458137512, "reward_change_max": 0.0, "reward_change_mean": -0.28602408431470394, "reward_change_min": -0.4931537136435509, "reward_change_std": 0.19333378039300442, "reward_std": 0.8622480817139149, "rewards/cosine_scaled_reward": 0.03411470470018685, "rewards/format_reward": 0.8750000223517418, "step": 498 }, { "advantage_max": 1.3127831816673279, "advantage_mean": -4.6566129563441194e-09, "advantage_min": -0.7158313523977995, "advantage_std": 0.7632554471492767, "completion_length": 2867.2709197998047, "epoch": 0.5702857142857143, "grad_norm": 0.9332626461982727, "kl": 0.27239990234375, "lambda_div_used": 0.7999999999999999, "learning_rate": 1.0001096618257236e-07, "loss": 0.0505, "reward": 0.45483217388391495, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.45483217388391495, "reward_after_std": 0.7632554359734058, "reward_before_mean": 0.7000233605504036, "reward_before_std": 0.7283362746238708, "reward_change_max": 0.0, "reward_change_mean": -0.2451911773532629, "reward_change_min": -0.44301971048116684, "reward_change_std": 0.16094432026147842, "reward_std": 0.7632554583251476, "rewards/cosine_scaled_reward": -0.10832166392356157, "rewards/format_reward": 0.916666679084301, "step": 499 }, { "advantage_max": 1.5181359201669693, "advantage_mean": -1.9247333560290514e-08, "advantage_min": -0.8365307152271271, "advantage_std": 0.9098883271217346, "completion_length": 3207.5625610351562, "epoch": 0.5714285714285714, "grad_norm": 0.48555415868759155, "kl": 0.34649658203125, "lambda_div_used": 0.7999999999999999, "learning_rate": 1e-07, "loss": 0.0282, "reward": 0.5236625634133816, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5236625634133816, "reward_after_std": 0.9098883718252182, "reward_before_mean": 0.7744899541139603, "reward_before_std": 0.9004770293831825, "reward_change_max": 0.0003416910767555237, "reward_change_mean": -0.25082739163190126, "reward_change_min": -0.5166188813745975, "reward_change_std": 0.18964135646820068, "reward_std": 0.9098883867263794, "rewards/cosine_scaled_reward": 0.033078297041356564, "rewards/format_reward": 0.7083333469927311, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.023119730316335334, "train_runtime": 57324.27, "train_samples_per_second": 0.419, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }