DRA-DR_GRPO-7B / trainer_state.json
kangdawei's picture
Model save
81d4b3e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5714285714285714,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"advantage_max": 0.1587137309834361,
"advantage_mean": -8.537123882823572e-09,
"advantage_min": -0.14114269940182567,
"advantage_std": 0.12265788647346199,
"completion_length": 2253.854206085205,
"epoch": 0.001142857142857143,
"grad_norm": 0.0020193569362163544,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0016,
"reward": 0.16043265676125884,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12265789229422808,
"rewards/cosine_scaled_reward": 0.16032031644135714,
"rewards/format_reward": 0.6250000037252903,
"step": 1
},
{
"advantage_max": 0.15091374469920993,
"advantage_mean": -5.6655459629295635e-09,
"advantage_min": -0.1861058697104454,
"advantage_std": 0.13657334074378014,
"completion_length": 2566.395854949951,
"epoch": 0.002285714285714286,
"grad_norm": 0.003380397567525506,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.0117,
"reward": 0.13084001699462533,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1365733384154737,
"rewards/cosine_scaled_reward": 0.12725313939154148,
"rewards/format_reward": 0.5208333414047956,
"step": 2
},
{
"advantage_max": 0.16393633373081684,
"advantage_mean": -4.2685618234505895e-09,
"advantage_min": -0.14902829099446535,
"advantage_std": 0.12841436569578946,
"completion_length": 2859.7708740234375,
"epoch": 0.0034285714285714284,
"grad_norm": 0.0022618037182837725,
"kl": 0.00017625093460083008,
"learning_rate": 4e-08,
"loss": 0.0052,
"reward": 0.038310326635837555,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1284143622033298,
"rewards/cosine_scaled_reward": -0.08417008072137833,
"rewards/format_reward": 0.3958333358168602,
"step": 3
},
{
"advantage_max": 0.14144689589738846,
"advantage_mean": -9.934107814135729e-09,
"advantage_min": -0.1055870414711535,
"advantage_std": 0.10548431565985084,
"completion_length": 1437.4375305175781,
"epoch": 0.004571428571428572,
"grad_norm": 0.0020079181995242834,
"kl": 0.00010022521018981934,
"learning_rate": 6e-08,
"loss": 0.0045,
"reward": 0.2122791176661849,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10548432543873787,
"rewards/cosine_scaled_reward": 0.207040709676221,
"rewards/format_reward": 0.8333333358168602,
"step": 4
},
{
"advantage_max": 0.21683576330542564,
"advantage_mean": -4.967053768289986e-09,
"advantage_min": -0.2010047109797597,
"advantage_std": 0.1657102182507515,
"completion_length": 3105.1458587646484,
"epoch": 0.005714285714285714,
"grad_norm": 0.004448694176971912,
"kl": 0.00015798211097717285,
"learning_rate": 8e-08,
"loss": 0.014,
"reward": 0.08359449577983469,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1657102219760418,
"rewards/cosine_scaled_reward": 0.01688160002231598,
"rewards/format_reward": 0.45833335258066654,
"step": 5
},
{
"advantage_max": 0.22248137276619673,
"advantage_mean": -2.79396782099095e-09,
"advantage_min": -0.15466022863984108,
"advantage_std": 0.14450703794136643,
"completion_length": 2365.9583435058594,
"epoch": 0.006857142857142857,
"grad_norm": 0.0027177755255252123,
"kl": 0.00011008977890014648,
"learning_rate": 1e-07,
"loss": 0.0044,
"reward": 0.06316112671629526,
"reward_advantage_correlation": 0.9999999999999997,
"reward_std": 0.14450704446062446,
"rewards/cosine_scaled_reward": -0.11656539421528578,
"rewards/format_reward": 0.6041666772216558,
"step": 6
},
{
"advantage_max": 0.16089605633169413,
"advantage_mean": -9.934107828013516e-09,
"advantage_min": -0.19252846017479897,
"advantage_std": 0.14089244278147817,
"completion_length": 2459.416732788086,
"epoch": 0.008,
"grad_norm": 0.0031194211915135384,
"kl": 0.00013837218284606934,
"learning_rate": 1.2e-07,
"loss": 0.0105,
"reward": 0.11613669246435165,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14089244417846203,
"rewards/cosine_scaled_reward": -0.010624844580888748,
"rewards/format_reward": 0.7083333432674408,
"step": 7
},
{
"advantage_max": 0.16271019540727139,
"advantage_mean": 2.1730860721991263e-09,
"advantage_min": -0.18036252213642,
"advantage_std": 0.1353623152244836,
"completion_length": 1742.7083702087402,
"epoch": 0.009142857142857144,
"grad_norm": 0.0014795621391385794,
"kl": 6.800517439842224e-05,
"learning_rate": 1.4e-07,
"loss": -0.0062,
"reward": 0.2013200237415731,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1353623173199594,
"rewards/cosine_scaled_reward": 0.24841690342873335,
"rewards/format_reward": 0.6875000055879354,
"step": 8
},
{
"advantage_max": 0.18095109798014164,
"advantage_mean": -1.5522042678961512e-09,
"advantage_min": -0.14702739380300045,
"advantage_std": 0.1355222244746983,
"completion_length": 2552.8125228881836,
"epoch": 0.010285714285714285,
"grad_norm": 0.0029326872900128365,
"kl": 0.00014982372522354126,
"learning_rate": 1.6e-07,
"loss": 0.0096,
"reward": 0.05854184099007398,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13552221888676286,
"rewards/cosine_scaled_reward": -0.06791253259871155,
"rewards/format_reward": 0.47916667349636555,
"step": 9
},
{
"advantage_max": 0.12060405313968658,
"advantage_mean": 4.113341514622171e-09,
"advantage_min": -0.142030019313097,
"advantage_std": 0.11728623230010271,
"completion_length": 2442.5833587646484,
"epoch": 0.011428571428571429,
"grad_norm": 0.002123701386153698,
"kl": 0.00010627508163452148,
"learning_rate": 1.8e-07,
"loss": 0.0069,
"reward": 0.12899871496483684,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11728623881936073,
"rewards/cosine_scaled_reward": 0.09703383408486843,
"rewards/format_reward": 0.5625000037252903,
"step": 10
},
{
"advantage_max": 0.21631120378151536,
"advantage_mean": 3.1044089521259366e-10,
"advantage_min": -0.177880696952343,
"advantage_std": 0.1529299677349627,
"completion_length": 3065.0833587646484,
"epoch": 0.012571428571428572,
"grad_norm": 0.0030999763403087854,
"kl": 0.0001526474952697754,
"learning_rate": 2e-07,
"loss": 0.014,
"reward": 0.025770303327590227,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15292996680364013,
"rewards/cosine_scaled_reward": -0.09046578034758568,
"rewards/format_reward": 0.3333333469927311,
"step": 11
},
{
"advantage_max": 0.17459326144307852,
"advantage_mean": -6.8296991118099726e-09,
"advantage_min": -0.15691349003463984,
"advantage_std": 0.14575656410306692,
"completion_length": 1936.3542022705078,
"epoch": 0.013714285714285714,
"grad_norm": 0.00448928028345108,
"kl": 0.00013959407806396484,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0119,
"reward": 0.15124552277848125,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14575657108798623,
"rewards/cosine_scaled_reward": 0.06061667948961258,
"rewards/format_reward": 0.7708333358168602,
"step": 12
},
{
"advantage_max": 0.1999238901771605,
"advantage_mean": -6.053596887656276e-09,
"advantage_min": -0.17521686758846045,
"advantage_std": 0.1415856694802642,
"completion_length": 2761.5625381469727,
"epoch": 0.014857142857142857,
"grad_norm": 0.0024026259779930115,
"kl": 0.00013381242752075195,
"learning_rate": 2.4e-07,
"loss": 0.0116,
"reward": 0.07723551848903298,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14158567460253835,
"rewards/cosine_scaled_reward": 0.02042432501912117,
"rewards/format_reward": 0.4166666716337204,
"step": 13
},
{
"advantage_max": 0.14435320254415274,
"advantage_mean": -1.5522043650406658e-09,
"advantage_min": -0.13152476958930492,
"advantage_std": 0.10402885114308447,
"completion_length": 2247.8750228881836,
"epoch": 0.016,
"grad_norm": 0.001248424407094717,
"kl": 0.00012195110321044922,
"learning_rate": 2.6e-07,
"loss": 0.0017,
"reward": 0.0968486382625997,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1040288528893143,
"rewards/cosine_scaled_reward": -0.01446421816945076,
"rewards/format_reward": 0.6041666697710752,
"step": 14
},
{
"advantage_max": 0.07975641544908285,
"advantage_mean": -9.313225607376907e-10,
"advantage_min": -0.09877181611955166,
"advantage_std": 0.0715660082641989,
"completion_length": 2612.458366394043,
"epoch": 0.017142857142857144,
"grad_norm": 0.0010066244285553694,
"kl": 0.000102996826171875,
"learning_rate": 2.8e-07,
"loss": -0.0006,
"reward": 0.12488156370818615,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.07156601082533598,
"rewards/cosine_scaled_reward": 0.11940987780690193,
"rewards/format_reward": 0.5,
"step": 15
},
{
"advantage_max": 0.12272757943719625,
"advantage_mean": -4.656613150633149e-10,
"advantage_min": -0.13491731509566307,
"advantage_std": 0.1079402850009501,
"completion_length": 3536.8958435058594,
"epoch": 0.018285714285714287,
"grad_norm": 0.002257565502077341,
"kl": 0.0002008676528930664,
"learning_rate": 3e-07,
"loss": 0.0014,
"reward": 0.007092840503901243,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1079402850009501,
"rewards/cosine_scaled_reward": -0.06191747821867466,
"rewards/format_reward": 0.16666667349636555,
"step": 16
},
{
"advantage_max": 0.15527505613863468,
"advantage_mean": -1.2417634698280722e-09,
"advantage_min": -0.17817360255867243,
"advantage_std": 0.13301802705973387,
"completion_length": 1949.2916793823242,
"epoch": 0.019428571428571427,
"grad_norm": 0.0018681100336834788,
"kl": 0.00013206154108047485,
"learning_rate": 3.2e-07,
"loss": -0.0032,
"reward": 0.16996530396863818,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13301803125068545,
"rewards/cosine_scaled_reward": 0.15604307316243649,
"rewards/format_reward": 0.6875000055879354,
"step": 17
},
{
"advantage_max": 0.1609904021024704,
"advantage_mean": 4.163336342344337e-17,
"advantage_min": -0.1347449072636664,
"advantage_std": 0.1273661465384066,
"completion_length": 2442.2083587646484,
"epoch": 0.02057142857142857,
"grad_norm": 0.002498304471373558,
"kl": 0.0001045428216457367,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0077,
"reward": 0.11062846053391695,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12736614840105176,
"rewards/cosine_scaled_reward": 0.013204759918153286,
"rewards/format_reward": 0.625,
"step": 18
},
{
"advantage_max": 0.20167000405490398,
"advantage_mean": -1.4668330938771845e-08,
"advantage_min": -0.2308518048375845,
"advantage_std": 0.16828895779326558,
"completion_length": 2169.2500648498535,
"epoch": 0.021714285714285714,
"grad_norm": 0.0021624648943543434,
"kl": 0.00011816620826721191,
"learning_rate": 3.6e-07,
"loss": 0.0,
"reward": 0.23799258447252214,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16828896710649133,
"rewards/cosine_scaled_reward": 0.3468063613399863,
"rewards/format_reward": 0.708333345130086,
"step": 19
},
{
"advantage_max": 0.10925093479454517,
"advantage_mean": 1.7074248404025383e-09,
"advantage_min": -0.21123075019568205,
"advantage_std": 0.13155136164277792,
"completion_length": 1562.8958702087402,
"epoch": 0.022857142857142857,
"grad_norm": 0.0014371563447639346,
"kl": 7.089972496032715e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0053,
"reward": 0.24731143051758409,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13155136303976178,
"rewards/cosine_scaled_reward": 0.28850897774100304,
"rewards/format_reward": 0.8750000055879354,
"step": 20
},
{
"advantage_max": 0.2152009317651391,
"advantage_mean": -6.519257994552774e-09,
"advantage_min": -0.19526144675910473,
"advantage_std": 0.1745052202604711,
"completion_length": 2338.8333587646484,
"epoch": 0.024,
"grad_norm": 0.0031636806670576334,
"kl": 0.0001360177993774414,
"learning_rate": 4e-07,
"loss": 0.0088,
"reward": 0.12647646106779575,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17450521886348724,
"rewards/cosine_scaled_reward": 0.10836795996874571,
"rewards/format_reward": 0.5208333376795053,
"step": 21
},
{
"advantage_max": 0.134979996830225,
"advantage_mean": -8.498318746635869e-09,
"advantage_min": -0.09254785464145243,
"advantage_std": 0.0842497721023392,
"completion_length": 1393.520851135254,
"epoch": 0.025142857142857144,
"grad_norm": 0.0015769954770803452,
"kl": 8.021295070648193e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0068,
"reward": 0.144379162156838,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08424977766117081,
"rewards/cosine_scaled_reward": -0.010229567997157574,
"rewards/format_reward": 0.8750000149011612,
"step": 22
},
{
"advantage_max": 0.24604428745806217,
"advantage_mean": -2.173086099954702e-09,
"advantage_min": -0.1898985542356968,
"advantage_std": 0.17475335206836462,
"completion_length": 2572.395866394043,
"epoch": 0.026285714285714287,
"grad_norm": 0.0029409886337816715,
"kl": 0.00012180209159851074,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0091,
"reward": 0.12211265473160893,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.17475335579365492,
"rewards/cosine_scaled_reward": 0.02622226788662374,
"rewards/format_reward": 0.6666666753590107,
"step": 23
},
{
"advantage_max": 0.1982029126957059,
"advantage_mean": -9.778887047340312e-09,
"advantage_min": -0.17622305173426867,
"advantage_std": 0.15681974356994033,
"completion_length": 1935.3958892822266,
"epoch": 0.027428571428571427,
"grad_norm": 0.002483836840838194,
"kl": 7.27921724319458e-05,
"learning_rate": 4.6e-07,
"loss": 0.0088,
"reward": 0.17156797600910068,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15681974682956934,
"rewards/cosine_scaled_reward": 0.11053762771189213,
"rewards/format_reward": 0.7916666716337204,
"step": 24
},
{
"advantage_max": 0.1764061450958252,
"advantage_mean": -7.372970400876255e-09,
"advantage_min": -0.1701870709657669,
"advantage_std": 0.13987470557913184,
"completion_length": 2115.000045776367,
"epoch": 0.02857142857142857,
"grad_norm": 0.0017042590770870447,
"kl": 0.00013157352805137634,
"learning_rate": 4.8e-07,
"loss": 0.0077,
"reward": 0.10251787485321984,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13987471163272858,
"rewards/cosine_scaled_reward": 0.020700588822364807,
"rewards/format_reward": 0.5625000055879354,
"step": 25
},
{
"advantage_max": 0.18950836267322302,
"advantage_mean": 3.104407703125034e-10,
"advantage_min": -0.10491138324141502,
"advantage_std": 0.11330888234078884,
"completion_length": 2396.9791946411133,
"epoch": 0.029714285714285714,
"grad_norm": 0.0017541834386065602,
"kl": 0.00014390423893928528,
"learning_rate": 5e-07,
"loss": 0.0003,
"reward": 0.03794134716736153,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1133088837377727,
"rewards/cosine_scaled_reward": -0.1579243573360145,
"rewards/format_reward": 0.5416666679084301,
"step": 26
},
{
"advantage_max": 0.13833303237333894,
"advantage_mean": 1.5522041985072121e-10,
"advantage_min": -0.15812412649393082,
"advantage_std": 0.12111463444307446,
"completion_length": 2430.4167098999023,
"epoch": 0.030857142857142857,
"grad_norm": 0.0028511241544038057,
"kl": 0.0001679062843322754,
"learning_rate": 5.2e-07,
"loss": 0.0071,
"reward": 0.14495101105421782,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12111463863402605,
"rewards/cosine_scaled_reward": 0.10645421966910362,
"rewards/format_reward": 0.6458333432674408,
"step": 27
},
{
"advantage_max": 0.2110468242317438,
"advantage_mean": 3.1044086884479682e-09,
"advantage_min": -0.11917469836771488,
"advantage_std": 0.12643845193088055,
"completion_length": 2344.7500648498535,
"epoch": 0.032,
"grad_norm": 0.002487706718966365,
"kl": 0.00012912601232528687,
"learning_rate": 5.4e-07,
"loss": 0.0023,
"reward": 0.12900030775927007,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12643845239654183,
"rewards/cosine_scaled_reward": 0.11002454720437527,
"rewards/format_reward": 0.5416666734963655,
"step": 28
},
{
"advantage_max": 0.11905187461525202,
"advantage_mean": -8.537123993845874e-10,
"advantage_min": -0.14191375905647874,
"advantage_std": 0.1037194412201643,
"completion_length": 2789.500015258789,
"epoch": 0.03314285714285714,
"grad_norm": 0.0016927722608670592,
"kl": 0.00016814470291137695,
"learning_rate": 5.6e-07,
"loss": 0.0041,
"reward": 0.032076418632641435,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.10371943796053529,
"rewards/cosine_scaled_reward": -0.12424678239040077,
"rewards/format_reward": 0.43750000558793545,
"step": 29
},
{
"advantage_max": 0.2387219499796629,
"advantage_mean": -7.140139979266991e-09,
"advantage_min": -0.1722863893955946,
"advantage_std": 0.16893034940585494,
"completion_length": 2417.9375610351562,
"epoch": 0.03428571428571429,
"grad_norm": 0.0027509131468832493,
"kl": 0.00010019540786743164,
"learning_rate": 5.8e-07,
"loss": 0.0047,
"reward": 0.17001164075918496,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.16893034894019365,
"rewards/cosine_scaled_reward": 0.1693029752932489,
"rewards/format_reward": 0.6666666772216558,
"step": 30
},
{
"advantage_max": 0.15399669483304024,
"advantage_mean": -5.432715249886755e-09,
"advantage_min": -0.16008222801610827,
"advantage_std": 0.1239645341411233,
"completion_length": 2762.229217529297,
"epoch": 0.03542857142857143,
"grad_norm": 0.0022213158663362265,
"kl": 0.00014109909534454346,
"learning_rate": 6e-07,
"loss": 0.0054,
"reward": 0.06478143483400345,
"reward_advantage_correlation": 1.0,
"reward_std": 0.123964530415833,
"rewards/cosine_scaled_reward": -0.027757282368838787,
"rewards/format_reward": 0.4375000037252903,
"step": 31
},
{
"advantage_max": 0.21103184670209885,
"advantage_mean": -2.7163574722877115e-09,
"advantage_min": -0.2043379843235016,
"advantage_std": 0.17043100483715534,
"completion_length": 2347.166732788086,
"epoch": 0.036571428571428574,
"grad_norm": 0.003295725677162409,
"kl": 0.00011375546455383301,
"learning_rate": 6.2e-07,
"loss": 0.019,
"reward": 0.17621385538950562,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17043101135641336,
"rewards/cosine_scaled_reward": 0.17827051342464983,
"rewards/format_reward": 0.6875000111758709,
"step": 32
},
{
"advantage_max": 0.16886692307889462,
"advantage_mean": -9.778887394285007e-09,
"advantage_min": -0.19185489788651466,
"advantage_std": 0.15452855033800006,
"completion_length": 2871.562545776367,
"epoch": 0.037714285714285714,
"grad_norm": 0.0032696141861379147,
"kl": 0.00010472536087036133,
"learning_rate": 6.4e-07,
"loss": 0.0087,
"reward": 0.1361775571713224,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1545285526663065,
"rewards/cosine_scaled_reward": 0.12209459021687508,
"rewards/format_reward": 0.5625000074505806,
"step": 33
},
{
"advantage_max": 0.12458384316414595,
"advantage_mean": -7.140139868244688e-09,
"advantage_min": -0.15718477871268988,
"advantage_std": 0.12218879768624902,
"completion_length": 1836.4583587646484,
"epoch": 0.038857142857142854,
"grad_norm": 0.001347140409052372,
"kl": 0.00014571286737918854,
"learning_rate": 6.6e-07,
"loss": -0.0048,
"reward": 0.2071411805227399,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1221888018772006,
"rewards/cosine_scaled_reward": 0.25896920543164015,
"rewards/format_reward": 0.7083333358168602,
"step": 34
},
{
"advantage_max": 0.1953197568655014,
"advantage_mean": -5.587935614226325e-09,
"advantage_min": -0.22316306363791227,
"advantage_std": 0.1690281443297863,
"completion_length": 2355.6042289733887,
"epoch": 0.04,
"grad_norm": 0.0035231634974479675,
"kl": 0.0001258254051208496,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0114,
"reward": 0.1693128461483866,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16902815271168947,
"rewards/cosine_scaled_reward": 0.1864055972546339,
"rewards/format_reward": 0.625000013038516,
"step": 35
},
{
"advantage_max": 0.16095423232764006,
"advantage_mean": -9.313226440044176e-10,
"advantage_min": -0.16731015034019947,
"advantage_std": 0.1404950194992125,
"completion_length": 3038.5208892822266,
"epoch": 0.04114285714285714,
"grad_norm": 0.003486273344606161,
"kl": 0.00018405914306640625,
"learning_rate": 7e-07,
"loss": 0.0089,
"reward": 0.04707640549167991,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14049502136185765,
"rewards/cosine_scaled_reward": -0.10159374866634607,
"rewards/format_reward": 0.4791666753590107,
"step": 36
},
{
"advantage_max": 0.14632703876122832,
"advantage_mean": -2.0954758830904474e-09,
"advantage_min": -0.1389367524534464,
"advantage_std": 0.11993890162557364,
"completion_length": 2859.4166717529297,
"epoch": 0.04228571428571429,
"grad_norm": 0.0022138648200780153,
"kl": 0.00013276375830173492,
"learning_rate": 7.2e-07,
"loss": 0.0039,
"reward": 0.04992359317839146,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11993890441954136,
"rewards/cosine_scaled_reward": -0.03023509867489338,
"rewards/format_reward": 0.35416667349636555,
"step": 37
},
{
"advantage_max": 0.12173652416095138,
"advantage_mean": 4.579002788052122e-09,
"advantage_min": -0.14573213178664446,
"advantage_std": 0.10948428139090538,
"completion_length": 3242.8958435058594,
"epoch": 0.04342857142857143,
"grad_norm": 0.0019941311329603195,
"kl": 0.00017582625150680542,
"learning_rate": 7.4e-07,
"loss": -0.0007,
"reward": -0.012153132352977991,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10948428325355053,
"rewards/cosine_scaled_reward": -0.10800831019878387,
"rewards/format_reward": 0.1458333395421505,
"step": 38
},
{
"advantage_max": 0.11650802264921367,
"advantage_mean": -3.2790316486369653e-09,
"advantage_min": -0.1185021202545613,
"advantage_std": 0.09036520542576909,
"completion_length": 2290.125072479248,
"epoch": 0.044571428571428574,
"grad_norm": 0.00172609428409487,
"kl": 0.00010192953050136566,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0069,
"reward": 0.09569389157695696,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09036520589143038,
"rewards/cosine_scaled_reward": -0.028405792079865932,
"rewards/format_reward": 0.6250000111758709,
"step": 39
},
{
"advantage_max": 0.13623419683426619,
"advantage_mean": -5.89837637066104e-09,
"advantage_min": -0.10135996155440807,
"advantage_std": 0.09501322568394244,
"completion_length": 2165.000068664551,
"epoch": 0.045714285714285714,
"grad_norm": 0.0017249195370823145,
"kl": 0.00013747811317443848,
"learning_rate": 7.799999999999999e-07,
"loss": 0.005,
"reward": 0.08313871989957988,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0950132254511118,
"rewards/cosine_scaled_reward": -0.06722887419164181,
"rewards/format_reward": 0.6250000093132257,
"step": 40
},
{
"advantage_max": 0.12406869698315859,
"advantage_mean": -1.2650465452956894e-08,
"advantage_min": -0.20387937780469656,
"advantage_std": 0.12945719296112657,
"completion_length": 2695.000045776367,
"epoch": 0.046857142857142854,
"grad_norm": 0.0025204960256814957,
"kl": 0.00013300776481628418,
"learning_rate": 8e-07,
"loss": 0.0037,
"reward": 0.18478128965944052,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12945719715207815,
"rewards/cosine_scaled_reward": 0.22261435391556006,
"rewards/format_reward": 0.6458333432674408,
"step": 41
},
{
"advantage_max": 0.15024259313941002,
"advantage_mean": 2.638747415018017e-09,
"advantage_min": -0.09568950766697526,
"advantage_std": 0.10328615305479616,
"completion_length": 2578.604169845581,
"epoch": 0.048,
"grad_norm": 0.002379926387220621,
"kl": 0.00016836822032928467,
"learning_rate": 8.199999999999999e-07,
"loss": 0.003,
"reward": -0.006026094313710928,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10328615212347358,
"rewards/cosine_scaled_reward": -0.236235816963017,
"rewards/format_reward": 0.43750000186264515,
"step": 42
},
{
"advantage_max": 0.19424660969525576,
"advantage_mean": -2.3283065059276353e-09,
"advantage_min": -0.18449017591774464,
"advantage_std": 0.15552015556022525,
"completion_length": 2532.604232788086,
"epoch": 0.04914285714285714,
"grad_norm": 0.0030434627551585436,
"kl": 0.00013034045696258545,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0078,
"reward": 0.11620450200280175,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15552015462890267,
"rewards/cosine_scaled_reward": 0.05307496804744005,
"rewards/format_reward": 0.5833333469927311,
"step": 43
},
{
"advantage_max": 0.1315040308982134,
"advantage_mean": -7.2177500920478366e-09,
"advantage_min": -0.09994148276746273,
"advantage_std": 0.08571109781041741,
"completion_length": 2133.1458740234375,
"epoch": 0.05028571428571429,
"grad_norm": 0.0013428078964352608,
"kl": 0.000141829252243042,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0031,
"reward": 0.18769649555906653,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08571109967306256,
"rewards/cosine_scaled_reward": 0.197374586481601,
"rewards/format_reward": 0.7083333414047956,
"step": 44
},
{
"advantage_max": 0.19893326330929995,
"advantage_mean": -6.208817363018149e-09,
"advantage_min": -0.18502129800617695,
"advantage_std": 0.16644518170505762,
"completion_length": 3050.1458587646484,
"epoch": 0.05142857142857143,
"grad_norm": 0.0026690622325986624,
"kl": 0.0001322031021118164,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0064,
"reward": 0.09938832372426987,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16644519148394465,
"rewards/cosine_scaled_reward": 0.08417692640796304,
"rewards/format_reward": 0.4166666716337204,
"step": 45
},
{
"advantage_max": 0.13772548642009497,
"advantage_mean": -2.173086030565763e-09,
"advantage_min": -0.13554238621145487,
"advantage_std": 0.10642092488706112,
"completion_length": 2790.6041870117188,
"epoch": 0.052571428571428575,
"grad_norm": 0.002247325610369444,
"kl": 0.0001497715711593628,
"learning_rate": 9e-07,
"loss": 0.0117,
"reward": 0.01905112573876977,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10642092442139983,
"rewards/cosine_scaled_reward": -0.16121318750083447,
"rewards/format_reward": 0.4375000111758709,
"step": 46
},
{
"advantage_max": 0.15123218018561602,
"advantage_mean": -4.656613192266512e-09,
"advantage_min": -0.13677284540608525,
"advantage_std": 0.11217275122180581,
"completion_length": 2225.750015258789,
"epoch": 0.053714285714285714,
"grad_norm": 0.0019338211277499795,
"kl": 0.00010457634925842285,
"learning_rate": 9.2e-07,
"loss": 0.005,
"reward": 0.12341659748926759,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1121727554127574,
"rewards/cosine_scaled_reward": 0.05270408093929291,
"rewards/format_reward": 0.6250000037252903,
"step": 47
},
{
"advantage_max": 0.180091115180403,
"advantage_mean": -1.1098261233633e-08,
"advantage_min": -0.20166349643841386,
"advantage_std": 0.1485751592554152,
"completion_length": 2522.645881652832,
"epoch": 0.054857142857142854,
"grad_norm": 0.00238457671366632,
"kl": 0.00011201947927474976,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0041,
"reward": 0.14124550856649876,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14857516065239906,
"rewards/cosine_scaled_reward": 0.15710615552961826,
"rewards/format_reward": 0.5208333414047956,
"step": 48
},
{
"advantage_max": 0.17702306748833507,
"advantage_mean": -9.410238656012981e-10,
"advantage_min": -0.10869404303957708,
"advantage_std": 0.11685236936318688,
"completion_length": 1774.020851135254,
"epoch": 0.056,
"grad_norm": 0.0023415982723236084,
"kl": 9.121932089328766e-05,
"learning_rate": 9.6e-07,
"loss": 0.0043,
"reward": 0.11408375017344952,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11685237029450946,
"rewards/cosine_scaled_reward": -0.03847436048090458,
"rewards/format_reward": 0.7500000093132257,
"step": 49
},
{
"advantage_max": 0.09399801213294268,
"advantage_mean": -2.638747317873502e-09,
"advantage_min": -0.08518766891211271,
"advantage_std": 0.07225358131108806,
"completion_length": 2767.6041831970215,
"epoch": 0.05714285714285714,
"grad_norm": 0.0011482579866424203,
"kl": 9.690225124359131e-05,
"learning_rate": 9.8e-07,
"loss": 0.0011,
"reward": 0.13219367619603872,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07225357874995098,
"rewards/cosine_scaled_reward": 0.1503643374890089,
"rewards/format_reward": 0.4791666679084301,
"step": 50
},
{
"advantage_max": 0.12806095415726304,
"advantage_mean": -5.452117693080516e-09,
"advantage_min": -0.10972547065466642,
"advantage_std": 0.0959368993062526,
"completion_length": 2330.2291717529297,
"epoch": 0.05828571428571429,
"grad_norm": 0.0021673294249922037,
"kl": 0.0001529306173324585,
"learning_rate": 1e-06,
"loss": -0.0003,
"reward": 0.0236921610776335,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09593690512701869,
"rewards/cosine_scaled_reward": -0.1689696293324232,
"rewards/format_reward": 0.4791666716337204,
"step": 51
},
{
"advantage_max": 0.18533404497429729,
"advantage_mean": -1.1020650517168384e-08,
"advantage_min": -0.20416315738111734,
"advantage_std": 0.16357391513884068,
"completion_length": 2461.3959159851074,
"epoch": 0.05942857142857143,
"grad_norm": 0.0032797979656606913,
"kl": 0.00010143965482711792,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0137,
"reward": 0.18816695609712042,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16357392026111484,
"rewards/cosine_scaled_reward": 0.2337212460115552,
"rewards/format_reward": 0.6458333414047956,
"step": 52
},
{
"advantage_max": 0.1701141782104969,
"advantage_mean": -1.8626452463754717e-09,
"advantage_min": -0.2136994767934084,
"advantage_std": 0.15749749122187495,
"completion_length": 2312.6250381469727,
"epoch": 0.060571428571428575,
"grad_norm": 0.002415906172245741,
"kl": 0.00011346861720085144,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0041,
"reward": 0.15444708871655166,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1574974935501814,
"rewards/cosine_scaled_reward": 0.11371952062472701,
"rewards/format_reward": 0.6875000074505806,
"step": 53
},
{
"advantage_max": 0.17725317552685738,
"advantage_mean": -7.605801322085881e-09,
"advantage_min": -0.2166620921343565,
"advantage_std": 0.1593807926401496,
"completion_length": 1764.9166946411133,
"epoch": 0.061714285714285715,
"grad_norm": 0.002244009170681238,
"kl": 7.051974534988403e-05,
"learning_rate": 9.999013075636804e-07,
"loss": -0.0003,
"reward": 0.23366648191586137,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1593807958997786,
"rewards/cosine_scaled_reward": 0.2675904119387269,
"rewards/format_reward": 0.8333333432674408,
"step": 54
},
{
"advantage_max": 0.18578462721779943,
"advantage_mean": -7.761021547647573e-10,
"advantage_min": -0.2353920480236411,
"advantage_std": 0.17948928149417043,
"completion_length": 2628.7083892822266,
"epoch": 0.06285714285714286,
"grad_norm": 0.00294486409984529,
"kl": 0.00012754648923873901,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0058,
"reward": 0.1491623887559399,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17948928708210588,
"rewards/cosine_scaled_reward": 0.15094960387796164,
"rewards/format_reward": 0.583333345130086,
"step": 55
},
{
"advantage_max": 0.10377925122156739,
"advantage_mean": -4.0357312075522955e-09,
"advantage_min": -0.1440494479611516,
"advantage_std": 0.1021224157884717,
"completion_length": 2625.583366394043,
"epoch": 0.064,
"grad_norm": 0.0012550298124551773,
"kl": 0.00013631582260131836,
"learning_rate": 9.997258721585931e-07,
"loss": -0.0009,
"reward": 0.07884217612445354,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10212242277339101,
"rewards/cosine_scaled_reward": -0.04887582641094923,
"rewards/format_reward": 0.5625000074505806,
"step": 56
},
{
"advantage_max": 0.2065029153600335,
"advantage_mean": 3.10440798068079e-10,
"advantage_min": -0.21144532784819603,
"advantage_std": 0.18026947043836117,
"completion_length": 2979.4167098999023,
"epoch": 0.06514285714285714,
"grad_norm": 0.003122963709756732,
"kl": 0.00011894106864929199,
"learning_rate": 9.996052735444862e-07,
"loss": 0.0089,
"reward": 0.10382829024456441,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1802694769576192,
"rewards/cosine_scaled_reward": 0.04622313613072038,
"rewards/format_reward": 0.5208333469927311,
"step": 57
},
{
"advantage_max": 0.10899663623422384,
"advantage_mean": 1.5522043095295146e-09,
"advantage_min": -0.15494094602763653,
"advantage_std": 0.10533166327513754,
"completion_length": 1586.5625343322754,
"epoch": 0.06628571428571428,
"grad_norm": 0.0014194791438058019,
"kl": 8.42362642288208e-05,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0066,
"reward": 0.18032316933386028,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.10533167002722621,
"rewards/cosine_scaled_reward": 0.13439065031707287,
"rewards/format_reward": 0.7916666716337204,
"step": 58
},
{
"advantage_max": 0.1733364863321185,
"advantage_mean": -6.286427572943509e-09,
"advantage_min": -0.1383139775134623,
"advantage_std": 0.13875256897881627,
"completion_length": 2547.6041679382324,
"epoch": 0.06742857142857143,
"grad_norm": 0.0021414561197161674,
"kl": 9.694695472717285e-05,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0023,
"reward": 0.09846353763714433,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13875256897881627,
"rewards/cosine_scaled_reward": 0.0630117068067193,
"rewards/format_reward": 0.4583333358168602,
"step": 59
},
{
"advantage_max": 0.12070130999200046,
"advantage_mean": -6.0535968737784884e-09,
"advantage_min": -0.18512004520744085,
"advantage_std": 0.12240555556491017,
"completion_length": 2303.979232788086,
"epoch": 0.06857142857142857,
"grad_norm": 0.0023025507107377052,
"kl": 0.00011995434761047363,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0079,
"reward": 0.14335119677707553,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12240555975586176,
"rewards/cosine_scaled_reward": 0.08939424622803926,
"rewards/format_reward": 0.6666666865348816,
"step": 60
},
{
"advantage_max": 0.12060253554955125,
"advantage_mean": 6.208817071584605e-10,
"advantage_min": -0.14025499392300844,
"advantage_std": 0.09711946547031403,
"completion_length": 2346.750030517578,
"epoch": 0.06971428571428571,
"grad_norm": 0.00180693285074085,
"kl": 8.498877286911011e-05,
"learning_rate": 9.989038226169207e-07,
"loss": -0.0049,
"reward": 0.09262806148035452,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09711947059258819,
"rewards/cosine_scaled_reward": -0.00842318870127201,
"rewards/format_reward": 0.5625000149011612,
"step": 61
},
{
"advantage_max": 0.16065927874296904,
"advantage_mean": -4.501392619760125e-09,
"advantage_min": -0.23090818990021944,
"advantage_std": 0.16813216032460332,
"completion_length": 2060.812515258789,
"epoch": 0.07085714285714285,
"grad_norm": 0.0032414915040135384,
"kl": 7.87973403930664e-05,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0045,
"reward": 0.18154411297291517,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16813215287402272,
"rewards/cosine_scaled_reward": 0.184435760602355,
"rewards/format_reward": 0.7083333432674408,
"step": 62
},
{
"advantage_max": 0.10771181527525187,
"advantage_mean": -2.173086099954702e-09,
"advantage_min": -0.09812528779730201,
"advantage_std": 0.08161175763234496,
"completion_length": 1515.2916717529297,
"epoch": 0.072,
"grad_norm": 0.0015955866547301412,
"kl": 8.296966552734375e-05,
"learning_rate": 9.98421786662277e-07,
"loss": -0.0048,
"reward": 0.22865570522844791,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08161176042631269,
"rewards/cosine_scaled_reward": 0.23915747739374638,
"rewards/format_reward": 0.875,
"step": 63
},
{
"advantage_max": 0.14562141429632902,
"advantage_mean": -3.9581210253825105e-09,
"advantage_min": -0.20186926797032356,
"advantage_std": 0.1426104260608554,
"completion_length": 2601.916702270508,
"epoch": 0.07314285714285715,
"grad_norm": 0.0024525534827262163,
"kl": 0.00014842301607131958,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0056,
"reward": 0.10283284028992057,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14261042792350054,
"rewards/cosine_scaled_reward": 0.051970590837299824,
"rewards/format_reward": 0.5000000074505806,
"step": 64
},
{
"advantage_max": 0.12431543041020632,
"advantage_mean": -4.423782326568038e-09,
"advantage_min": -0.12246682308614254,
"advantage_std": 0.10179906385019422,
"completion_length": 2456.2292098999023,
"epoch": 0.07428571428571429,
"grad_norm": 0.0021826298907399178,
"kl": 0.00010146945714950562,
"learning_rate": 9.97852329991824e-07,
"loss": -0.0063,
"reward": 0.08970492146909237,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10179906524717808,
"rewards/cosine_scaled_reward": -0.029155070893466473,
"rewards/format_reward": 0.5833333432674408,
"step": 65
},
{
"advantage_max": 0.11777388351038098,
"advantage_mean": -3.2596293303432944e-09,
"advantage_min": -0.08773828111588955,
"advantage_std": 0.07737651432398707,
"completion_length": 2061.0000038146973,
"epoch": 0.07542857142857143,
"grad_norm": 0.0009109475067816675,
"kl": 0.00010826066136360168,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0001,
"reward": 0.07414777716621757,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0773765177000314,
"rewards/cosine_scaled_reward": -0.03114328160881996,
"rewards/format_reward": 0.5,
"step": 66
},
{
"advantage_max": 0.1809951732866466,
"advantage_mean": 8.537123508123301e-10,
"advantage_min": -0.09886624943464994,
"advantage_std": 0.11706549394875765,
"completion_length": 3042.4583587646484,
"epoch": 0.07657142857142857,
"grad_norm": 0.0019939993508160114,
"kl": 0.00010170042514801025,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0053,
"reward": -0.01826105872169137,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.11706549627706409,
"rewards/cosine_scaled_reward": -0.22059013764373958,
"rewards/format_reward": 0.3333333358168602,
"step": 67
},
{
"advantage_max": 0.10667644999921322,
"advantage_mean": -6.6744789001260685e-09,
"advantage_min": -0.13124415185302496,
"advantage_std": 0.10147338453680277,
"completion_length": 1382.4583473205566,
"epoch": 0.07771428571428571,
"grad_norm": 0.0009440166177228093,
"kl": 7.583759725093842e-05,
"learning_rate": 9.968344786479415e-07,
"loss": -0.0006,
"reward": 0.16119565116241574,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10147338453680277,
"rewards/cosine_scaled_reward": 0.09969675447791815,
"rewards/format_reward": 0.75,
"step": 68
},
{
"advantage_max": 0.19610813772305846,
"advantage_mean": -3.7640954658746395e-09,
"advantage_min": -0.1299685575067997,
"advantage_std": 0.13548616133630276,
"completion_length": 1842.7292098999023,
"epoch": 0.07885714285714286,
"grad_norm": 0.002522306516766548,
"kl": 9.226799011230469e-05,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0051,
"reward": 0.10152785666286945,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1354861636646092,
"rewards/cosine_scaled_reward": -0.05570081574842334,
"rewards/format_reward": 0.7083333358168602,
"step": 69
},
{
"advantage_max": 0.08541852049529552,
"advantage_mean": 5.432715666220389e-10,
"advantage_min": -0.14142528641968966,
"advantage_std": 0.08910802565515041,
"completion_length": 2474.979202270508,
"epoch": 0.08,
"grad_norm": 0.0011430344311520457,
"kl": 9.79304313659668e-05,
"learning_rate": 9.960469931131936e-07,
"loss": -0.0021,
"reward": 0.13445935118943453,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08910802891477942,
"rewards/cosine_scaled_reward": 0.08494364470243454,
"rewards/format_reward": 0.625,
"step": 70
},
{
"advantage_max": 0.10955090029165149,
"advantage_mean": -3.104408646814605e-09,
"advantage_min": -0.12331806868314743,
"advantage_std": 0.09641325660049915,
"completion_length": 2457.1458587646484,
"epoch": 0.08114285714285714,
"grad_norm": 0.0009280177182517946,
"kl": 0.00010801851749420166,
"learning_rate": 9.956206309337066e-07,
"loss": -0.0,
"reward": 0.06796744232997298,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09641325753182173,
"rewards/cosine_scaled_reward": 0.0028737219981849194,
"rewards/format_reward": 0.39583333395421505,
"step": 71
},
{
"advantage_max": 0.1530079017393291,
"advantage_mean": -6.053597047250836e-09,
"advantage_min": -0.13696587504819036,
"advantage_std": 0.11541180987842381,
"completion_length": 2491.187572479248,
"epoch": 0.08228571428571428,
"grad_norm": 0.002064738655462861,
"kl": 0.0001221299171447754,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0071,
"reward": 0.06420155242085457,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11541181430220604,
"rewards/cosine_scaled_reward": -0.10319141205400229,
"rewards/format_reward": 0.5833333414047956,
"step": 72
},
{
"advantage_max": 0.15204641316086054,
"advantage_mean": -5.161079424942372e-09,
"advantage_min": -0.1872421819716692,
"advantage_std": 0.1310334224253893,
"completion_length": 3098.479217529297,
"epoch": 0.08342857142857144,
"grad_norm": 0.0022661956027150154,
"kl": 0.00015395879745483398,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0065,
"reward": 0.07659045979380608,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1310334224253893,
"rewards/cosine_scaled_reward": 0.00787665881216526,
"rewards/format_reward": 0.43750001303851604,
"step": 73
},
{
"advantage_max": 0.17096955608576536,
"advantage_mean": -4.190951724547531e-09,
"advantage_min": -0.17159210238605738,
"advantage_std": 0.13066924829035997,
"completion_length": 2244.520866394043,
"epoch": 0.08457142857142858,
"grad_norm": 0.0017521478002890944,
"kl": 0.00010526180267333984,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0054,
"reward": 0.1286474959924817,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13066925154998899,
"rewards/cosine_scaled_reward": 0.07717459555715322,
"rewards/format_reward": 0.6041666753590107,
"step": 74
},
{
"advantage_max": 0.14245222136378288,
"advantage_mean": -8.343098368418511e-09,
"advantage_min": -0.12452936079353094,
"advantage_std": 0.10217330139130354,
"completion_length": 2732.3958892822266,
"epoch": 0.08571428571428572,
"grad_norm": 0.0021405534353107214,
"kl": 0.00012764334678649902,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0107,
"reward": 0.10671802004799247,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10217330139130354,
"rewards/cosine_scaled_reward": 0.07396111264824867,
"rewards/format_reward": 0.4791666828095913,
"step": 75
},
{
"advantage_max": 0.1363742845132947,
"advantage_mean": -1.9402554285452567e-09,
"advantage_min": -0.1722245216369629,
"advantage_std": 0.1291468944400549,
"completion_length": 2299.4583587646484,
"epoch": 0.08685714285714285,
"grad_norm": 0.002542425412684679,
"kl": 0.00011056661605834961,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0118,
"reward": 0.059120094403624535,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12914689676836133,
"rewards/cosine_scaled_reward": -0.0978600550442934,
"rewards/format_reward": 0.5416666753590107,
"step": 76
},
{
"advantage_max": 0.1297641615383327,
"advantage_mean": 8.537124410179509e-10,
"advantage_min": -0.1266080942004919,
"advantage_std": 0.10432415176182985,
"completion_length": 2606.7708892822266,
"epoch": 0.088,
"grad_norm": 0.0015011918731033802,
"kl": 0.00010880827903747559,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0022,
"reward": 0.0441059676813893,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.10432415641844273,
"rewards/cosine_scaled_reward": -0.09794393740594387,
"rewards/format_reward": 0.45833333395421505,
"step": 77
},
{
"advantage_max": 0.21008419059216976,
"advantage_mean": -9.71445146547012e-17,
"advantage_min": -0.17278954200446606,
"advantage_std": 0.15343208238482475,
"completion_length": 2731.3542098999023,
"epoch": 0.08914285714285715,
"grad_norm": 0.0027688215486705303,
"kl": 0.00013116281479597092,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0028,
"reward": 0.12476505199447274,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15343208704143763,
"rewards/cosine_scaled_reward": 0.09734340105205774,
"rewards/format_reward": 0.5416666772216558,
"step": 78
},
{
"advantage_max": 0.20920497737824917,
"advantage_mean": -5.898376342905465e-09,
"advantage_min": -0.1681775012984872,
"advantage_std": 0.15584641904570162,
"completion_length": 1935.1041984558105,
"epoch": 0.09028571428571429,
"grad_norm": 0.0024462228175252676,
"kl": 8.627399802207947e-05,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0024,
"reward": 0.12909462582319975,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15584642230533063,
"rewards/cosine_scaled_reward": 0.004465287551283836,
"rewards/format_reward": 0.75,
"step": 79
},
{
"advantage_max": 0.16656427085399628,
"advantage_mean": -1.5522044760629683e-10,
"advantage_min": -0.1844524722546339,
"advantage_std": 0.13766634557396173,
"completion_length": 2919.6250610351562,
"epoch": 0.09142857142857143,
"grad_norm": 0.002895612735301256,
"kl": 0.00016814470291137695,
"learning_rate": 9.908088623197048e-07,
"loss": -0.0024,
"reward": 0.08655929937958717,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13766634929925203,
"rewards/cosine_scaled_reward": 0.037981233559548855,
"rewards/format_reward": 0.43750000186264515,
"step": 80
},
{
"advantage_max": 0.17092055454850197,
"advantage_mean": -5.820766389719179e-09,
"advantage_min": -0.17761663650162518,
"advantage_std": 0.1536490712314844,
"completion_length": 2850.3333854675293,
"epoch": 0.09257142857142857,
"grad_norm": 0.0033340235240757465,
"kl": 0.0001818835735321045,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0037,
"reward": 0.08180352626368403,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.153649078682065,
"rewards/cosine_scaled_reward": 0.003097064793109894,
"rewards/format_reward": 0.4791666716337204,
"step": 81
},
{
"advantage_max": 0.12402567837852985,
"advantage_mean": -5.0446641516876944e-09,
"advantage_min": -0.15394007693976164,
"advantage_std": 0.11961714894277975,
"completion_length": 2321.187530517578,
"epoch": 0.09371428571428571,
"grad_norm": 0.0027620706241577864,
"kl": 9.965896606445312e-05,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0116,
"reward": 0.11978777777403593,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11961715843062848,
"rewards/cosine_scaled_reward": 0.08022868749685585,
"rewards/format_reward": 0.541666679084301,
"step": 82
},
{
"advantage_max": 0.22959647234529257,
"advantage_mean": -2.405916771364147e-09,
"advantage_min": -0.17005170974880457,
"advantage_std": 0.1516456357203424,
"completion_length": 2457.958381652832,
"epoch": 0.09485714285714286,
"grad_norm": 0.002318345010280609,
"kl": 0.00013965368270874023,
"learning_rate": 9.888172094375033e-07,
"loss": 0.0055,
"reward": 0.08302936844120268,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.15164564363658428,
"rewards/cosine_scaled_reward": -0.005184752866625786,
"rewards/format_reward": 0.5000000018626451,
"step": 83
},
{
"advantage_max": 0.16241815499961376,
"advantage_mean": -7.722216546768301e-09,
"advantage_min": -0.1314363582059741,
"advantage_std": 0.12282431870698929,
"completion_length": 2358.8125228881836,
"epoch": 0.096,
"grad_norm": 0.0016149451257660985,
"kl": 0.00011220574378967285,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0038,
"reward": 0.11924176779575646,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12282432615756989,
"rewards/cosine_scaled_reward": 0.04767691483721137,
"rewards/format_reward": 0.6041666679084301,
"step": 84
},
{
"advantage_max": 0.251274854876101,
"advantage_mean": -4.19095166903638e-09,
"advantage_min": -0.20053553488105536,
"advantage_std": 0.18136232160031796,
"completion_length": 2729.6250610351562,
"epoch": 0.09714285714285714,
"grad_norm": 0.0030045255552977324,
"kl": 9.201047942042351e-05,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0082,
"reward": 0.10162241314537823,
"reward_advantage_correlation": 1.0,
"reward_std": 0.18136232951655984,
"rewards/cosine_scaled_reward": 0.009953074157238007,
"rewards/format_reward": 0.583333345130086,
"step": 85
},
{
"advantage_max": 0.10346131678670645,
"advantage_mean": 2.7939678903798892e-09,
"advantage_min": -0.11049740668386221,
"advantage_std": 0.08825402474030852,
"completion_length": 2581.270881652832,
"epoch": 0.09828571428571428,
"grad_norm": 0.0011986541794613004,
"kl": 0.00014419853687286377,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0016,
"reward": 0.0768510882044211,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08825402474030852,
"rewards/cosine_scaled_reward": -0.002405572682619095,
"rewards/format_reward": 0.4583333358168602,
"step": 86
},
{
"advantage_max": 0.12127275113016367,
"advantage_mean": -5.1998845854162035e-09,
"advantage_min": -0.16166772227734327,
"advantage_std": 0.10958564793691039,
"completion_length": 2274.125068664551,
"epoch": 0.09942857142857142,
"grad_norm": 0.0028325302992016077,
"kl": 0.00013668090105056763,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0113,
"reward": 0.1180245433570235,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10958564607426524,
"rewards/cosine_scaled_reward": 0.0059294914826750755,
"rewards/format_reward": 0.687500013038516,
"step": 87
},
{
"advantage_max": 0.23650739900767803,
"advantage_mean": -1.746229910670749e-09,
"advantage_min": -0.18534229695796967,
"advantage_std": 0.1711007342673838,
"completion_length": 1488.5833587646484,
"epoch": 0.10057142857142858,
"grad_norm": 0.001865549013018608,
"kl": 8.325278759002686e-05,
"learning_rate": 9.850705248720068e-07,
"loss": 0.009,
"reward": 0.19773138477467,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17110073706135154,
"rewards/cosine_scaled_reward": 0.15710189566016197,
"rewards/format_reward": 0.8541666697710752,
"step": 88
},
{
"advantage_max": 0.17963521927595139,
"advantage_mean": -7.605801405352608e-09,
"advantage_min": -0.19737609662115574,
"advantage_std": 0.16144832829013467,
"completion_length": 2659.791702270508,
"epoch": 0.10171428571428572,
"grad_norm": 0.0037124978844076395,
"kl": 0.00011199712753295898,
"learning_rate": 9.8425742251254e-07,
"loss": 0.0162,
"reward": 0.09529236517846584,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16144833154976368,
"rewards/cosine_scaled_reward": 0.03067113645374775,
"rewards/format_reward": 0.5000000093132257,
"step": 89
},
{
"advantage_max": 0.13574577076360583,
"advantage_mean": -3.8999133436523614e-09,
"advantage_min": -0.1119693242944777,
"advantage_std": 0.10564618976786733,
"completion_length": 2417.0833435058594,
"epoch": 0.10285714285714286,
"grad_norm": 0.001697812112979591,
"kl": 0.00015023350715637207,
"learning_rate": 9.83423155058946e-07,
"loss": 0.003,
"reward": 0.040875127888284624,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10564619302749634,
"rewards/cosine_scaled_reward": -0.14221901632845402,
"rewards/format_reward": 0.5208333395421505,
"step": 90
},
{
"advantage_max": 0.16820846288464963,
"advantage_mean": 3.4924597380747713e-09,
"advantage_min": -0.16563586331903934,
"advantage_std": 0.13759498205035925,
"completion_length": 2622.062515258789,
"epoch": 0.104,
"grad_norm": 0.0026448904536664486,
"kl": 0.00011968612670898438,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0119,
"reward": 0.11700052605010569,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13759497925639153,
"rewards/cosine_scaled_reward": 0.07498859567567706,
"rewards/format_reward": 0.5416666716337204,
"step": 91
},
{
"advantage_max": 0.12286161910742521,
"advantage_mean": -6.286427836621478e-09,
"advantage_min": -0.1205812394618988,
"advantage_std": 0.10014992253854871,
"completion_length": 2147.791717529297,
"epoch": 0.10514285714285715,
"grad_norm": 0.0013253620127215981,
"kl": 8.553266525268555e-05,
"learning_rate": 9.816912885430258e-07,
"loss": -0.0002,
"reward": 0.12493289890699089,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10014992393553257,
"rewards/cosine_scaled_reward": 0.004685905296355486,
"rewards/format_reward": 0.7291666697710752,
"step": 92
},
{
"advantage_max": 0.18725960794836283,
"advantage_mean": 2.3283065753165744e-10,
"advantage_min": -0.09588433895260096,
"advantage_std": 0.10476150875911117,
"completion_length": 3570.2708740234375,
"epoch": 0.10628571428571429,
"grad_norm": 0.0024138952139765024,
"kl": 0.0002326369285583496,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0014,
"reward": -0.049280768260359764,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10476151248440146,
"rewards/cosine_scaled_reward": -0.18744310783222318,
"rewards/format_reward": 0.0833333358168602,
"step": 93
},
{
"advantage_max": 0.13818231970071793,
"advantage_mean": -2.0566707503721915e-09,
"advantage_min": -0.12359800864942372,
"advantage_std": 0.09686838975176215,
"completion_length": 2422.6458740234375,
"epoch": 0.10742857142857143,
"grad_norm": 0.0018462935695424676,
"kl": 0.00012992694973945618,
"learning_rate": 9.798752629550546e-07,
"loss": -0.0004,
"reward": 0.10946622129995376,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09686839301139116,
"rewards/cosine_scaled_reward": 0.03989738831296563,
"rewards/format_reward": 0.5625000055879354,
"step": 94
},
{
"advantage_max": 0.1666426188312471,
"advantage_mean": 1.7074247293802358e-09,
"advantage_min": -0.1642469959333539,
"advantage_std": 0.13464828813448548,
"completion_length": 3139.666732788086,
"epoch": 0.10857142857142857,
"grad_norm": 0.002334951190277934,
"kl": 0.00013262033462524414,
"learning_rate": 9.78935800506826e-07,
"loss": 0.0073,
"reward": 0.05827001016587019,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13464829558506608,
"rewards/cosine_scaled_reward": -0.0356507133692503,
"rewards/format_reward": 0.41666667349636555,
"step": 95
},
{
"advantage_max": 0.2095841746777296,
"advantage_mean": 2.017865666226193e-09,
"advantage_min": -0.17328586243093014,
"advantage_std": 0.14680979307740927,
"completion_length": 2370.6458587646484,
"epoch": 0.10971428571428571,
"grad_norm": 0.0024344930425286293,
"kl": 0.00011432915925979614,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0048,
"reward": 0.1633957652375102,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14680979494005442,
"rewards/cosine_scaled_reward": 0.182056006626226,
"rewards/format_reward": 0.6041666734963655,
"step": 96
},
{
"advantage_max": 0.2188143515959382,
"advantage_mean": -4.423782243301311e-09,
"advantage_min": -0.17432072758674622,
"advantage_std": 0.15836003702133894,
"completion_length": 2806.6458892822266,
"epoch": 0.11085714285714286,
"grad_norm": 0.00301153352484107,
"kl": 0.00013709068298339844,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0127,
"reward": 0.0940831717234687,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1583600454032421,
"rewards/cosine_scaled_reward": 0.039922329247929156,
"rewards/format_reward": 0.4791666716337204,
"step": 97
},
{
"advantage_max": 0.13472749013453722,
"advantage_mean": -4.811833625995021e-09,
"advantage_min": -0.09283868130296469,
"advantage_std": 0.08354483381845057,
"completion_length": 2286.208381652832,
"epoch": 0.112,
"grad_norm": 0.0012522657634690404,
"kl": 8.559972047805786e-05,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0064,
"reward": 0.08262644917704165,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08354483777657151,
"rewards/cosine_scaled_reward": -0.015163867268711329,
"rewards/format_reward": 0.5208333376795053,
"step": 98
},
{
"advantage_max": 0.1714438796043396,
"advantage_mean": -4.268561934472892e-09,
"advantage_min": -0.20534021221101284,
"advantage_std": 0.1506601725704968,
"completion_length": 2804.3333587646484,
"epoch": 0.11314285714285714,
"grad_norm": 0.0023277695290744305,
"kl": 0.0001052170991897583,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0042,
"reward": 0.12414564751088619,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.15066017862409353,
"rewards/cosine_scaled_reward": 0.16006462997756898,
"rewards/format_reward": 0.4166666716337204,
"step": 99
},
{
"advantage_max": 0.13038562145084143,
"advantage_mean": -8.30429328774196e-09,
"advantage_min": -0.11401992756873369,
"advantage_std": 0.10838590876664966,
"completion_length": 2251.979179382324,
"epoch": 0.11428571428571428,
"grad_norm": 0.0022543719969689846,
"kl": 0.00011374056339263916,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0052,
"reward": 0.13824327662587166,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10838590934872627,
"rewards/cosine_scaled_reward": 0.09618911519646645,
"rewards/format_reward": 0.6250000037252903,
"step": 100
},
{
"advantage_max": 0.1658927546814084,
"advantage_mean": -2.6387474566513802e-09,
"advantage_min": -0.10248309839516878,
"advantage_std": 0.10365857649594545,
"completion_length": 2299.0417098999023,
"epoch": 0.11542857142857142,
"grad_norm": 0.0026296344585716724,
"kl": 0.0001501142978668213,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0098,
"reward": 0.1061628689058125,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.10365858301520348,
"rewards/cosine_scaled_reward": 0.03263038757722825,
"rewards/format_reward": 0.5625000055879354,
"step": 101
},
{
"advantage_max": 0.1605947259813547,
"advantage_mean": -3.4924597103191957e-09,
"advantage_min": -0.18639070075005293,
"advantage_std": 0.15262889862060547,
"completion_length": 2045.4375457763672,
"epoch": 0.11657142857142858,
"grad_norm": 0.003396169049665332,
"kl": 0.0001347959041595459,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0105,
"reward": 0.1790522364899516,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15262890281155705,
"rewards/cosine_scaled_reward": 0.15230208821594715,
"rewards/format_reward": 0.7500000074505806,
"step": 102
},
{
"advantage_max": 0.14158586133271456,
"advantage_mean": 4.4237822571790986e-09,
"advantage_min": -0.10933790914714336,
"advantage_std": 0.09761173883453012,
"completion_length": 2373.9792251586914,
"epoch": 0.11771428571428572,
"grad_norm": 0.0012007191544398665,
"kl": 0.00010399753227829933,
"learning_rate": 9.706715543782064e-07,
"loss": -0.0011,
"reward": 0.11079470813274384,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09761174395680428,
"rewards/cosine_scaled_reward": -0.038000524044036865,
"rewards/format_reward": 0.7291666697710752,
"step": 103
},
{
"advantage_max": 0.11513470765203238,
"advantage_mean": -8.925174974083738e-09,
"advantage_min": -0.11279049189761281,
"advantage_std": 0.09129662462510169,
"completion_length": 2454.00004196167,
"epoch": 0.11885714285714286,
"grad_norm": 0.0016338457353413105,
"kl": 0.00012861378490924835,
"learning_rate": 9.695457105469804e-07,
"loss": -0.0011,
"reward": 0.06962736044079065,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.0912966295145452,
"rewards/cosine_scaled_reward": -0.05629314109683037,
"rewards/format_reward": 0.5208333376795053,
"step": 104
},
{
"advantage_max": 0.15599115658551455,
"advantage_mean": -4.346172109703783e-09,
"advantage_min": -0.23021886963397264,
"advantage_std": 0.1603334224782884,
"completion_length": 2441.083366394043,
"epoch": 0.12,
"grad_norm": 0.002161208540201187,
"kl": 0.00013302266597747803,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0091,
"reward": 0.14520665351301432,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16033342946320772,
"rewards/cosine_scaled_reward": 0.178804699331522,
"rewards/format_reward": 0.5000000074505806,
"step": 105
},
{
"advantage_max": 0.10074207372963428,
"advantage_mean": -7.916242286687414e-09,
"advantage_min": -0.16330508375540376,
"advantage_std": 0.10734084341675043,
"completion_length": 1822.1041870117188,
"epoch": 0.12114285714285715,
"grad_norm": 0.001392633537761867,
"kl": 6.573088467121124e-05,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0026,
"reward": 0.21404909482225776,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10734084062278271,
"rewards/cosine_scaled_reward": 0.24547897465527058,
"rewards/format_reward": 0.7708333395421505,
"step": 106
},
{
"advantage_max": 0.12848057132214308,
"advantage_mean": -3.802900563898426e-09,
"advantage_min": -0.18607094045728445,
"advantage_std": 0.12191221117973328,
"completion_length": 2692.791702270508,
"epoch": 0.12228571428571429,
"grad_norm": 0.0020098562818020582,
"kl": 0.0001888573169708252,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0038,
"reward": 0.07842084765434265,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12191221164539456,
"rewards/cosine_scaled_reward": 0.001535113900899887,
"rewards/format_reward": 0.45833334140479565,
"step": 107
},
{
"advantage_max": 0.11962755676358938,
"advantage_mean": 1.552204420551817e-09,
"advantage_min": -0.14954979997128248,
"advantage_std": 0.10876429115887731,
"completion_length": 2763.416702270508,
"epoch": 0.12342857142857143,
"grad_norm": 0.0018431171774864197,
"kl": 0.00015020370483398438,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0066,
"reward": 0.051005338318645954,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10876429441850632,
"rewards/cosine_scaled_reward": -0.038408463820815086,
"rewards/format_reward": 0.3750000037252903,
"step": 108
},
{
"advantage_max": 0.15706831123679876,
"advantage_mean": -2.483526884144993e-09,
"advantage_min": -0.14608103781938553,
"advantage_std": 0.12137589370831847,
"completion_length": 2733.2292098999023,
"epoch": 0.12457142857142857,
"grad_norm": 0.0020496873185038567,
"kl": 0.00012265145778656006,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0055,
"reward": 0.05345132830552757,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12137589510530233,
"rewards/cosine_scaled_reward": -0.05171956028789282,
"rewards/format_reward": 0.4166666716337204,
"step": 109
},
{
"advantage_max": 0.2400534199550748,
"advantage_mean": -5.355104998328031e-09,
"advantage_min": -0.1949408515356481,
"advantage_std": 0.19347044127061963,
"completion_length": 2730.4375228881836,
"epoch": 0.12571428571428572,
"grad_norm": 0.0037184932734817266,
"kl": 0.00014457106590270996,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0093,
"reward": 0.12486143945716321,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.19347044127061963,
"rewards/cosine_scaled_reward": 0.09646196017274633,
"rewards/format_reward": 0.5416666734963655,
"step": 110
},
{
"advantage_max": 0.15602963138371706,
"advantage_mean": -2.9491884628862763e-09,
"advantage_min": -0.1375290732830763,
"advantage_std": 0.11554153729230165,
"completion_length": 2890.562530517578,
"epoch": 0.12685714285714286,
"grad_norm": 0.002402157988399267,
"kl": 0.0001817643642425537,
"learning_rate": 9.610954559391704e-07,
"loss": -0.005,
"reward": 0.08678329293616116,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11554153636097908,
"rewards/cosine_scaled_reward": 0.03825441841036081,
"rewards/format_reward": 0.43750000558793545,
"step": 111
},
{
"advantage_max": 0.18926704861223698,
"advantage_mean": -2.6387472762401387e-09,
"advantage_min": -0.25633513927459717,
"advantage_std": 0.1728609693236649,
"completion_length": 2748.7500762939453,
"epoch": 0.128,
"grad_norm": 0.002987251617014408,
"kl": 0.0001405477523803711,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0127,
"reward": 0.11737876618281007,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17286097328178585,
"rewards/cosine_scaled_reward": 0.054929095320403576,
"rewards/format_reward": 0.5833333563059568,
"step": 112
},
{
"advantage_max": 0.20071829669177532,
"advantage_mean": -3.6476802481311132e-09,
"advantage_min": -0.14141817204654217,
"advantage_std": 0.13287563156336546,
"completion_length": 2007.8334159851074,
"epoch": 0.12914285714285714,
"grad_norm": 0.001896014902740717,
"kl": 0.00015522539615631104,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0049,
"reward": 0.14661651686765254,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13287563901394606,
"rewards/cosine_scaled_reward": 0.03844105708412826,
"rewards/format_reward": 0.7916666697710752,
"step": 113
},
{
"advantage_max": 0.10647185088600963,
"advantage_mean": -2.7551627541921864e-09,
"advantage_min": -0.11569630762096494,
"advantage_std": 0.09618571458850056,
"completion_length": 2201.1042137145996,
"epoch": 0.13028571428571428,
"grad_norm": 0.0018926298944279552,
"kl": 8.186884224414825e-05,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0013,
"reward": 0.07421890611294657,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09618572029285133,
"rewards/cosine_scaled_reward": -0.13425324205309153,
"rewards/format_reward": 0.708333333954215,
"step": 114
},
{
"advantage_max": 0.16031183023005724,
"advantage_mean": -1.552202394394797e-10,
"advantage_min": -0.16533454321324825,
"advantage_std": 0.14068402699194849,
"completion_length": 2915.5625228881836,
"epoch": 0.13142857142857142,
"grad_norm": 0.0025180347729474306,
"kl": 0.00015562772750854492,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0035,
"reward": 0.06694650682038628,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14068402582779527,
"rewards/cosine_scaled_reward": 0.00980508397333324,
"rewards/format_reward": 0.37500000186264515,
"step": 115
},
{
"advantage_max": 0.1184748588129878,
"advantage_mean": -6.208816516473092e-10,
"advantage_min": -0.13819944020360708,
"advantage_std": 0.10394375585019588,
"completion_length": 3137.541679382324,
"epoch": 0.13257142857142856,
"grad_norm": 0.0017691698158159852,
"kl": 0.00019854307174682617,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0012,
"reward": 0.013275583041831851,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10394375957548618,
"rewards/cosine_scaled_reward": -0.09652221202850342,
"rewards/format_reward": 0.27083333395421505,
"step": 116
},
{
"advantage_max": 0.1553569696843624,
"advantage_mean": -1.8626452047421083e-09,
"advantage_min": -0.12070130556821823,
"advantage_std": 0.1120496722869575,
"completion_length": 2608.791732788086,
"epoch": 0.1337142857142857,
"grad_norm": 0.0024786260910332203,
"kl": 0.00016558915376663208,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0038,
"reward": 0.01226228941231966,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1120496797375381,
"rewards/cosine_scaled_reward": -0.21452020248398185,
"rewards/format_reward": 0.5000000074505806,
"step": 117
},
{
"advantage_max": 0.1705867312848568,
"advantage_mean": 2.7755575615628914e-17,
"advantage_min": -0.28973726741969585,
"advantage_std": 0.1876918189227581,
"completion_length": 2918.7708740234375,
"epoch": 0.13485714285714287,
"grad_norm": 0.0038028184790164232,
"kl": 0.0001245737075805664,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0099,
"reward": 0.18796764593571424,
"reward_advantage_correlation": 1.0,
"reward_std": 0.18769182451069355,
"rewards/cosine_scaled_reward": 0.2854941412806511,
"rewards/format_reward": 0.5416666828095913,
"step": 118
},
{
"advantage_max": 0.12709664832800627,
"advantage_mean": -4.656613011855271e-10,
"advantage_min": -0.14629495097324252,
"advantage_std": 0.10539561160840094,
"completion_length": 1824.270866394043,
"epoch": 0.136,
"grad_norm": 0.0015314030461013317,
"kl": 0.00011080782860517502,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0013,
"reward": 0.16728377249091864,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10539561323821545,
"rewards/cosine_scaled_reward": 0.12748285697307438,
"rewards/format_reward": 0.7291666716337204,
"step": 119
},
{
"advantage_max": 0.13794818706810474,
"advantage_mean": -2.2118911702229127e-09,
"advantage_min": -0.13365713064558804,
"advantage_std": 0.10238635609857738,
"completion_length": 2093.2291870117188,
"epoch": 0.13714285714285715,
"grad_norm": 0.002086564665660262,
"kl": 0.00019008666276931763,
"learning_rate": 9.487916106540465e-07,
"loss": -0.0029,
"reward": 0.11550817801617086,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10238635912537575,
"rewards/cosine_scaled_reward": 0.03842526860535145,
"rewards/format_reward": 0.6041666697710752,
"step": 120
},
{
"advantage_max": 0.06518118735402822,
"advantage_mean": -9.041590292441226e-09,
"advantage_min": -0.10872633708640933,
"advantage_std": 0.06719960737973452,
"completion_length": 1744.4791946411133,
"epoch": 0.1382857142857143,
"grad_norm": 0.0014031269820407033,
"kl": 0.00011703372001647949,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0042,
"reward": 0.1042367173358798,
"reward_advantage_correlation": 1.0,
"reward_std": 0.06719961203634739,
"rewards/cosine_scaled_reward": -0.0471935048699379,
"rewards/format_reward": 0.7083333358168602,
"step": 121
},
{
"advantage_max": 0.1840760800987482,
"advantage_mean": -2.0178656245928295e-09,
"advantage_min": -0.13638843223452568,
"advantage_std": 0.12992733856663108,
"completion_length": 2759.375030517578,
"epoch": 0.13942857142857143,
"grad_norm": 0.002573323668912053,
"kl": 0.00018534809350967407,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0003,
"reward": 0.09318617288954556,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12992734229192138,
"rewards/cosine_scaled_reward": -0.0037322649732232094,
"rewards/format_reward": 0.5625000055879354,
"step": 122
},
{
"advantage_max": 0.19431301951408386,
"advantage_mean": -3.026798457705926e-09,
"advantage_min": -0.1697028325870633,
"advantage_std": 0.14136297907680273,
"completion_length": 2471.1667098999023,
"epoch": 0.14057142857142857,
"grad_norm": 0.0024162298068404198,
"kl": 0.00014369189739227295,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0062,
"reward": 0.09077914047520608,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1413629837334156,
"rewards/cosine_scaled_reward": -0.014139397069811821,
"rewards/format_reward": 0.5625000149011612,
"step": 123
},
{
"advantage_max": 0.1764494488015771,
"advantage_mean": -2.1265199989795036e-08,
"advantage_min": -0.19905486050993204,
"advantage_std": 0.14608955709263682,
"completion_length": 2030.208351135254,
"epoch": 0.1417142857142857,
"grad_norm": 0.0023175496608018875,
"kl": 8.096173405647278e-05,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0027,
"reward": 0.20607653993647546,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14608956221491098,
"rewards/cosine_scaled_reward": 0.2967276629060507,
"rewards/format_reward": 0.6250000055879354,
"step": 124
},
{
"advantage_max": 0.17857370153069496,
"advantage_mean": -2.095475917784917e-09,
"advantage_min": -0.14456479204818606,
"advantage_std": 0.12773457053117454,
"completion_length": 2383.2917137145996,
"epoch": 0.14285714285714285,
"grad_norm": 0.002172604901716113,
"kl": 0.00010094791650772095,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0083,
"reward": 0.10760475019924343,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1277345723938197,
"rewards/cosine_scaled_reward": 0.07878711789089721,
"rewards/format_reward": 0.4791666753590107,
"step": 125
},
{
"advantage_max": 0.153425102122128,
"advantage_mean": -1.629814533332663e-09,
"advantage_min": -0.17162158340215683,
"advantage_std": 0.13127819541841745,
"completion_length": 2324.458396911621,
"epoch": 0.144,
"grad_norm": 0.0026990657206624746,
"kl": 0.00010439753532409668,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0046,
"reward": 0.16764532215893269,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1312782042659819,
"rewards/cosine_scaled_reward": 0.1628055665642023,
"rewards/format_reward": 0.666666679084301,
"step": 126
},
{
"advantage_max": 0.11602956661954522,
"advantage_mean": -3.1044090909038147e-10,
"advantage_min": -0.13568047992885113,
"advantage_std": 0.09200059063732624,
"completion_length": 3087.354202270508,
"epoch": 0.14514285714285713,
"grad_norm": 0.0016305146273225546,
"kl": 0.00017446279525756836,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0051,
"reward": 0.007634018547832966,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09200059436261654,
"rewards/cosine_scaled_reward": -0.1423247866332531,
"rewards/format_reward": 0.3333333395421505,
"step": 127
},
{
"advantage_max": 0.13058222271502018,
"advantage_mean": 8.537123716290118e-10,
"advantage_min": -0.15740781952627003,
"advantage_std": 0.121270950185135,
"completion_length": 2198.8542251586914,
"epoch": 0.1462857142857143,
"grad_norm": 0.0022203184198588133,
"kl": 0.00014868378639221191,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0063,
"reward": 0.1890734031330794,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12127095158211887,
"rewards/cosine_scaled_reward": 0.21513771638274193,
"rewards/format_reward": 0.6875,
"step": 128
},
{
"advantage_max": 0.1487028319388628,
"advantage_mean": -1.0865429250772607e-09,
"advantage_min": -0.149984628893435,
"advantage_std": 0.12652261182665825,
"completion_length": 3222.437530517578,
"epoch": 0.14742857142857144,
"grad_norm": 0.002289236057549715,
"kl": 0.00017118453979492188,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0006,
"reward": 0.11886557843536139,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12652262020856142,
"rewards/cosine_scaled_reward": 0.1228477107360959,
"rewards/format_reward": 0.45833333767950535,
"step": 129
},
{
"advantage_max": 0.18156287958845496,
"advantage_mean": -3.104409507237449e-10,
"advantage_min": -0.1829990753903985,
"advantage_std": 0.15616408130154014,
"completion_length": 2944.9375228881836,
"epoch": 0.14857142857142858,
"grad_norm": 0.003384327283129096,
"kl": 0.00020888447761535645,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0086,
"reward": 0.0558637254871428,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15616408409550786,
"rewards/cosine_scaled_reward": -0.001300264149904251,
"rewards/format_reward": 0.3333333395421505,
"step": 130
},
{
"advantage_max": 0.11768224369734526,
"advantage_mean": -2.638747387262441e-09,
"advantage_min": -0.16481691598892212,
"advantage_std": 0.11381826514843851,
"completion_length": 2425.687545776367,
"epoch": 0.14971428571428572,
"grad_norm": 0.002549993572756648,
"kl": 0.0001547001302242279,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0086,
"reward": 0.1262263646349311,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11381827830336988,
"rewards/cosine_scaled_reward": 0.09911833424121141,
"rewards/format_reward": 0.541666679084301,
"step": 131
},
{
"advantage_max": 0.20303583005443215,
"advantage_mean": -1.7850349531833842e-09,
"advantage_min": -0.17521136440336704,
"advantage_std": 0.156132394913584,
"completion_length": 2537.3958435058594,
"epoch": 0.15085714285714286,
"grad_norm": 0.0023349204566329718,
"kl": 0.00015839934349060059,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0022,
"reward": 0.14965266874060035,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15613240003585815,
"rewards/cosine_scaled_reward": 0.19302122248336673,
"rewards/format_reward": 0.5000000018626451,
"step": 132
},
{
"advantage_max": 0.21418469492346048,
"advantage_mean": -1.7074247987691749e-09,
"advantage_min": -0.14465673360973597,
"advantage_std": 0.13837812095880508,
"completion_length": 3043.8750762939453,
"epoch": 0.152,
"grad_norm": 0.003348530502989888,
"kl": 0.00022482872009277344,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0107,
"reward": 0.05317601654678583,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13837812119163573,
"rewards/cosine_scaled_reward": -0.041860264958813787,
"rewards/format_reward": 0.39583333767950535,
"step": 133
},
{
"advantage_max": 0.13740387186408043,
"advantage_mean": -9.313227272711444e-10,
"advantage_min": -0.1500786654651165,
"advantage_std": 0.11633877758868039,
"completion_length": 2266.4375381469727,
"epoch": 0.15314285714285714,
"grad_norm": 0.002200792543590069,
"kl": 0.00014699995517730713,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0056,
"reward": 0.14638759847730398,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11633878224529326,
"rewards/cosine_scaled_reward": 0.11991992685943842,
"rewards/format_reward": 0.6250000055879354,
"step": 134
},
{
"advantage_max": 0.08174204314127564,
"advantage_mean": 3.0267983952558808e-09,
"advantage_min": -0.1401548283174634,
"advantage_std": 0.08930786373093724,
"completion_length": 1240.6041946411133,
"epoch": 0.15428571428571428,
"grad_norm": 0.000816557090729475,
"kl": 6.493180990219116e-05,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0025,
"reward": 0.24731288943439722,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08930786419659853,
"rewards/cosine_scaled_reward": 0.3153181979432702,
"rewards/format_reward": 0.8333333358168602,
"step": 135
},
{
"advantage_max": 0.14499722514301538,
"advantage_mean": -3.2596290250319626e-09,
"advantage_min": -0.2093830332159996,
"advantage_std": 0.1496648290194571,
"completion_length": 2443.604202270508,
"epoch": 0.15542857142857142,
"grad_norm": 0.0018037001136690378,
"kl": 0.00012743473052978516,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0026,
"reward": 0.15657305950298905,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14966483181342483,
"rewards/cosine_scaled_reward": 0.19108737260103226,
"rewards/format_reward": 0.5416666716337204,
"step": 136
},
{
"advantage_max": 0.2110500643029809,
"advantage_mean": 3.5700699757557075e-09,
"advantage_min": -0.16157893557101488,
"advantage_std": 0.14996290765702724,
"completion_length": 2808.6250534057617,
"epoch": 0.15657142857142858,
"grad_norm": 0.003018659772351384,
"kl": 0.00014230981469154358,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0047,
"reward": 0.06567161390557885,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14996290765702724,
"rewards/cosine_scaled_reward": -0.0245496213901788,
"rewards/format_reward": 0.4375000074505806,
"step": 137
},
{
"advantage_max": 0.13625410571694374,
"advantage_mean": -7.081932080696407e-10,
"advantage_min": -0.17112221661955118,
"advantage_std": 0.12045667658094317,
"completion_length": 2566.395881652832,
"epoch": 0.15771428571428572,
"grad_norm": 0.0023109198082238436,
"kl": 0.00012245774269104004,
"learning_rate": 9.195171441101668e-07,
"loss": 0.0024,
"reward": 0.08583869109861553,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12045668461360037,
"rewards/cosine_scaled_reward": -0.058677418157458305,
"rewards/format_reward": 0.6250000149011612,
"step": 138
},
{
"advantage_max": 0.17770376801490784,
"advantage_mean": -6.51925811251397e-09,
"advantage_min": -0.14586754702031612,
"advantage_std": 0.13520123437047005,
"completion_length": 2895.729202270508,
"epoch": 0.15885714285714286,
"grad_norm": 0.002674209652468562,
"kl": 0.00019547343254089355,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0058,
"reward": 0.10707889473997056,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1352012469433248,
"rewards/cosine_scaled_reward": 0.024583726655691862,
"rewards/format_reward": 0.5833333376795053,
"step": 139
},
{
"advantage_max": 0.25041482876986265,
"advantage_mean": 2.4835269119005687e-09,
"advantage_min": -0.16461172699928284,
"advantage_std": 0.16128699900582433,
"completion_length": 2781.854217529297,
"epoch": 0.16,
"grad_norm": 0.0029202878940850496,
"kl": 0.00022208690643310547,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0026,
"reward": 0.11235592421144247,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.16128700133413076,
"rewards/cosine_scaled_reward": 0.07993581797927618,
"rewards/format_reward": 0.5000000093132257,
"step": 140
},
{
"advantage_max": 0.21202841773629189,
"advantage_mean": -1.3877787807814457e-17,
"advantage_min": -0.13598172459751368,
"advantage_std": 0.13371713273227215,
"completion_length": 2812.6458740234375,
"epoch": 0.16114285714285714,
"grad_norm": 0.0021927033085376024,
"kl": 0.00016339123249053955,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0039,
"reward": 0.06701798271387815,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13371713738888502,
"rewards/cosine_scaled_reward": -0.03017127327620983,
"rewards/format_reward": 0.45833334140479565,
"step": 141
},
{
"advantage_max": 0.1576713090762496,
"advantage_mean": -4.928248802105184e-09,
"advantage_min": -0.144107595551759,
"advantage_std": 0.1275632563047111,
"completion_length": 2608.3125610351562,
"epoch": 0.16228571428571428,
"grad_norm": 0.0021633445285260677,
"kl": 0.0001731477677822113,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0054,
"reward": 0.06165223941206932,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.127563264220953,
"rewards/cosine_scaled_reward": -0.0996420830488205,
"rewards/format_reward": 0.5625000074505806,
"step": 142
},
{
"advantage_max": 0.11721245618537068,
"advantage_mean": -6.907309196835243e-09,
"advantage_min": -0.12485382426530123,
"advantage_std": 0.10056370904203504,
"completion_length": 2204.1042098999023,
"epoch": 0.16342857142857142,
"grad_norm": 0.0021462785080075264,
"kl": 0.00016664713621139526,
"learning_rate": 9.103291169269299e-07,
"loss": -0.0011,
"reward": 0.11139208162785508,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.10056372033432126,
"rewards/cosine_scaled_reward": 0.004287723801098764,
"rewards/format_reward": 0.6458333358168602,
"step": 143
},
{
"advantage_max": 0.1503322133794427,
"advantage_mean": -4.967054101356894e-09,
"advantage_min": -0.14401227980852127,
"advantage_std": 0.12609638017602265,
"completion_length": 2357.0416946411133,
"epoch": 0.16457142857142856,
"grad_norm": 0.0021524883341044188,
"kl": 0.0001500248908996582,
"learning_rate": 9.084384631108882e-07,
"loss": -0.0005,
"reward": 0.20968507044017315,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12609639088623226,
"rewards/cosine_scaled_reward": 0.3309060502797365,
"rewards/format_reward": 0.5833333395421505,
"step": 144
},
{
"advantage_max": 0.15520280180498958,
"advantage_mean": -1.3193737144479023e-09,
"advantage_min": -0.14075595536269248,
"advantage_std": 0.12264994671568274,
"completion_length": 1946.333339691162,
"epoch": 0.1657142857142857,
"grad_norm": 0.0021455343812704086,
"kl": 0.00014033913612365723,
"learning_rate": 9.065303395098358e-07,
"loss": 0.005,
"reward": 0.1299537445884198,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12264994950965047,
"rewards/cosine_scaled_reward": 0.06085195206105709,
"rewards/format_reward": 0.6458333358168602,
"step": 145
},
{
"advantage_max": 0.11647297162562609,
"advantage_mean": 1.0865430360995632e-09,
"advantage_min": -0.11792057100683451,
"advantage_std": 0.0969981993548572,
"completion_length": 2206.2500610351562,
"epoch": 0.16685714285714287,
"grad_norm": 0.0013740290887653828,
"kl": 0.0001351572573184967,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0043,
"reward": 0.06711362052010372,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09699820261448622,
"rewards/cosine_scaled_reward": -0.10451544541865587,
"rewards/format_reward": 0.6041666697710752,
"step": 146
},
{
"advantage_max": 0.16654492495581508,
"advantage_mean": -2.0954757998237206e-09,
"advantage_min": -0.15912125445902348,
"advantage_std": 0.1402019909583032,
"completion_length": 3362.9375610351562,
"epoch": 0.168,
"grad_norm": 0.0023271015379577875,
"kl": 0.0002307295799255371,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0032,
"reward": 0.016447328962385654,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14020200027152896,
"rewards/cosine_scaled_reward": -0.0677786897867918,
"rewards/format_reward": 0.22916667349636555,
"step": 147
},
{
"advantage_max": 0.09006885858252645,
"advantage_mean": -2.2506962404911235e-09,
"advantage_min": -0.11421419773250818,
"advantage_std": 0.08783065434545279,
"completion_length": 2373.8750534057617,
"epoch": 0.16914285714285715,
"grad_norm": 0.0019086383981630206,
"kl": 0.00016707181930541992,
"learning_rate": 9.007020842191634e-07,
"loss": 0.0023,
"reward": 0.0981850721873343,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08783066272735596,
"rewards/cosine_scaled_reward": 0.006881414446979761,
"rewards/format_reward": 0.5625000074505806,
"step": 148
},
{
"advantage_max": 0.13361886190250516,
"advantage_mean": -7.528191028893794e-09,
"advantage_min": -0.1478055864572525,
"advantage_std": 0.10741094080731273,
"completion_length": 2408.666732788086,
"epoch": 0.1702857142857143,
"grad_norm": 0.0021227849647402763,
"kl": 0.00010327436029911041,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0048,
"reward": 0.15707366378046572,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10741094499826431,
"rewards/cosine_scaled_reward": 0.1423771446570754,
"rewards/format_reward": 0.6458333395421505,
"step": 149
},
{
"advantage_max": 0.20952890440821648,
"advantage_mean": -1.3969839313121568e-09,
"advantage_min": -0.183829627931118,
"advantage_std": 0.1736277553718537,
"completion_length": 2727.5208892822266,
"epoch": 0.17142857142857143,
"grad_norm": 0.002815204905346036,
"kl": 0.00017786026000976562,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0096,
"reward": 0.0818605124950409,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1736277644522488,
"rewards/cosine_scaled_reward": -0.0079753203317523,
"rewards/format_reward": 0.5000000055879354,
"step": 150
},
{
"advantage_max": 0.20418076124042273,
"advantage_mean": -3.4148497640718034e-09,
"advantage_min": -0.1933022839948535,
"advantage_std": 0.16485009621828794,
"completion_length": 2286.5209045410156,
"epoch": 0.17257142857142857,
"grad_norm": 0.004568588919937611,
"kl": 0.00023168325424194336,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0172,
"reward": 0.2044738749973476,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.16485009947791696,
"rewards/cosine_scaled_reward": 0.2511125993914902,
"rewards/format_reward": 0.7083333414047956,
"step": 151
},
{
"advantage_max": 0.21830918407067657,
"advantage_mean": -2.1730860721991263e-09,
"advantage_min": -0.09182694740593433,
"advantage_std": 0.11862736381590366,
"completion_length": 2849.2291984558105,
"epoch": 0.1737142857142857,
"grad_norm": 0.002062483923509717,
"kl": 0.0002673119306564331,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0052,
"reward": 0.005771389231085777,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11862737126648426,
"rewards/cosine_scaled_reward": -0.13862175261601806,
"rewards/format_reward": 0.31250000186264515,
"step": 152
},
{
"advantage_max": 0.18801967659965158,
"advantage_mean": -6.984919032060333e-10,
"advantage_min": -0.1435602605342865,
"advantage_std": 0.12760146823711693,
"completion_length": 2704.145866394043,
"epoch": 0.17485714285714285,
"grad_norm": 0.0024463790468871593,
"kl": 0.00021246075630187988,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0017,
"reward": 0.05029630567878485,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12760147219523787,
"rewards/cosine_scaled_reward": -0.09206442115828395,
"rewards/format_reward": 0.47916667349636555,
"step": 153
},
{
"advantage_max": 0.14180888701230288,
"advantage_mean": -2.2506962960022747e-09,
"advantage_min": -0.23996069841086864,
"advantage_std": 0.15492864465340972,
"completion_length": 2850.729217529297,
"epoch": 0.176,
"grad_norm": 0.0028651999309659004,
"kl": 0.00016963481903076172,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0099,
"reward": 0.15311535075306892,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1549286488443613,
"rewards/cosine_scaled_reward": 0.1717417575418949,
"rewards/format_reward": 0.562500013038516,
"step": 154
},
{
"advantage_max": 0.173433696385473,
"advantage_mean": -7.528191139916096e-09,
"advantage_min": -0.15252949902787805,
"advantage_std": 0.13626911328174174,
"completion_length": 2346.8333587646484,
"epoch": 0.17714285714285713,
"grad_norm": 0.0024740456137806177,
"kl": 0.00018147937953472137,
"learning_rate": 8.865091407243394e-07,
"loss": 0.005,
"reward": 0.10649993130937219,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13626911351457238,
"rewards/cosine_scaled_reward": 0.05439352709800005,
"rewards/format_reward": 0.5208333376795053,
"step": 155
},
{
"advantage_max": 0.1344355084002018,
"advantage_mean": -9.313225746154785e-10,
"advantage_min": -0.1348655167967081,
"advantage_std": 0.10152879962697625,
"completion_length": 2678.833366394043,
"epoch": 0.1782857142857143,
"grad_norm": 0.0013925960520282388,
"kl": 0.00019456446170806885,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0022,
"reward": 0.07135126600041986,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10152880381792784,
"rewards/cosine_scaled_reward": 0.011746692471206188,
"rewards/format_reward": 0.3958333395421505,
"step": 156
},
{
"advantage_max": 0.11662959074601531,
"advantage_mean": 4.656613428188905e-10,
"advantage_min": -0.13163182232528925,
"advantage_std": 0.10447747865691781,
"completion_length": 2641.729202270508,
"epoch": 0.17942857142857144,
"grad_norm": 0.0014279123861342669,
"kl": 0.00022292137145996094,
"learning_rate": 8.823049032816478e-07,
"loss": 0.002,
"reward": 0.039941683411598206,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10447748005390167,
"rewards/cosine_scaled_reward": -0.09014054387807846,
"rewards/format_reward": 0.4166666679084301,
"step": 157
},
{
"advantage_max": 0.12401013169437647,
"advantage_mean": -2.17308623873258e-09,
"advantage_min": -0.10845753783360124,
"advantage_std": 0.09371542499866337,
"completion_length": 2349.4583587646484,
"epoch": 0.18057142857142858,
"grad_norm": 0.0023050915915519,
"kl": 0.0002039596438407898,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0002,
"reward": 0.1618611067533493,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09371542546432465,
"rewards/cosine_scaled_reward": 0.1614538454450667,
"rewards/format_reward": 0.6250000055879354,
"step": 158
},
{
"advantage_max": 0.22995528485625982,
"advantage_mean": 1.4357891403582457e-09,
"advantage_min": -0.13199414312839508,
"advantage_std": 0.135023855837062,
"completion_length": 3314.125030517578,
"epoch": 0.18171428571428572,
"grad_norm": 0.0022246637381613255,
"kl": 0.0002588033676147461,
"learning_rate": 8.780358823396352e-07,
"loss": 0.0053,
"reward": 0.002544154027418699,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1350238579325378,
"rewards/cosine_scaled_reward": -0.11882635089568794,
"rewards/format_reward": 0.2500000074505806,
"step": 159
},
{
"advantage_max": 0.1304126875475049,
"advantage_mean": -2.483526828633842e-09,
"advantage_min": -0.15111864916980267,
"advantage_std": 0.11367706721648574,
"completion_length": 2916.458366394043,
"epoch": 0.18285714285714286,
"grad_norm": 0.002428490901365876,
"kl": 0.0002749040722846985,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0027,
"reward": 0.10357946204021573,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1136770648881793,
"rewards/cosine_scaled_reward": 0.07447406277060509,
"rewards/format_reward": 0.45833334140479565,
"step": 160
},
{
"advantage_max": 0.22650268021970987,
"advantage_mean": -4.19095166903638e-09,
"advantage_min": -0.21486747544258833,
"advantage_std": 0.1732907984405756,
"completion_length": 2120.5833892822266,
"epoch": 0.184,
"grad_norm": 0.00323474477045238,
"kl": 0.00017549656331539154,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0164,
"reward": 0.16077806614339352,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17329080402851105,
"rewards/cosine_scaled_reward": 0.11763785593211651,
"rewards/format_reward": 0.7083333395421505,
"step": 161
},
{
"advantage_max": 0.24303656117990613,
"advantage_mean": -1.823840096309981e-09,
"advantage_min": -0.20982816815376282,
"advantage_std": 0.1883863634429872,
"completion_length": 3135.729217529297,
"epoch": 0.18514285714285714,
"grad_norm": 0.003710554214194417,
"kl": 0.00029730796813964844,
"learning_rate": 8.715127058347614e-07,
"loss": 0.0127,
"reward": 0.06095714052207768,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1883863634429872,
"rewards/cosine_scaled_reward": -0.007758868858218193,
"rewards/format_reward": 0.3750000111758709,
"step": 162
},
{
"advantage_max": 0.19018926797434688,
"advantage_mean": -2.3283065614387866e-09,
"advantage_min": -0.16702749021351337,
"advantage_std": 0.1346902009099722,
"completion_length": 2321.00008392334,
"epoch": 0.18628571428571428,
"grad_norm": 0.002764482283964753,
"kl": 0.00019381940364837646,
"learning_rate": 8.693068314414344e-07,
"loss": 0.0069,
"reward": 0.15160547848790884,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13469020370393991,
"rewards/cosine_scaled_reward": 0.12644020980224013,
"rewards/format_reward": 0.645833345130086,
"step": 163
},
{
"advantage_max": 0.17234268225729465,
"advantage_mean": -9.313226051466117e-09,
"advantage_min": -0.17475124169141054,
"advantage_std": 0.13838404836133122,
"completion_length": 2284.3541946411133,
"epoch": 0.18742857142857142,
"grad_norm": 0.0023905131965875626,
"kl": 0.00018963217735290527,
"learning_rate": 8.670853944836176e-07,
"loss": 0.0053,
"reward": 0.14360764995217323,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13838405720889568,
"rewards/cosine_scaled_reward": 0.13964655436575413,
"rewards/format_reward": 0.5625000037252903,
"step": 164
},
{
"advantage_max": 0.17663770401850343,
"advantage_mean": 7.761016135310328e-11,
"advantage_min": -0.13784242887049913,
"advantage_std": 0.12715793796814978,
"completion_length": 2959.3958740234375,
"epoch": 0.18857142857142858,
"grad_norm": 0.0023930887691676617,
"kl": 0.00022131530568003654,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0036,
"reward": 0.01792388770263642,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.127157939132303,
"rewards/cosine_scaled_reward": -0.13487120415084064,
"rewards/format_reward": 0.37500000931322575,
"step": 165
},
{
"advantage_max": 0.26935879176016897,
"advantage_mean": -7.411775679311283e-09,
"advantage_min": -0.21388308005407453,
"advantage_std": 0.19788388686720282,
"completion_length": 2474.6458892822266,
"epoch": 0.18971428571428572,
"grad_norm": 0.0030500064603984356,
"kl": 0.00017523393034934998,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0142,
"reward": 0.13143354514613748,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1978838904760778,
"rewards/cosine_scaled_reward": 0.06401193561032414,
"rewards/format_reward": 0.6458333395421505,
"step": 166
},
{
"advantage_max": 0.09043146530166268,
"advantage_mean": -3.725290492750943e-09,
"advantage_min": -0.1378228161484003,
"advantage_std": 0.09450012096203864,
"completion_length": 2120.4166946411133,
"epoch": 0.19085714285714286,
"grad_norm": 0.001775244832970202,
"kl": 0.00016416609287261963,
"learning_rate": 8.603287946810513e-07,
"loss": 0.001,
"reward": 0.13699070224538445,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09450012096203864,
"rewards/cosine_scaled_reward": 0.049559676088392735,
"rewards/format_reward": 0.7083333432674408,
"step": 167
},
{
"advantage_max": 0.16301921661943197,
"advantage_mean": -3.3372394153685647e-09,
"advantage_min": -0.17344017466530204,
"advantage_std": 0.14057715935632586,
"completion_length": 2687.6875534057617,
"epoch": 0.192,
"grad_norm": 0.002732840832322836,
"kl": 0.00020739436149597168,
"learning_rate": 8.580461976679099e-07,
"loss": 0.0038,
"reward": 0.12937976652756333,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.140577157959342,
"rewards/cosine_scaled_reward": 0.0898700375109911,
"rewards/format_reward": 0.5833333469927311,
"step": 168
},
{
"advantage_max": 0.1822348004207015,
"advantage_mean": -1.2417635114614356e-09,
"advantage_min": -0.15554382000118494,
"advantage_std": 0.13149547297507524,
"completion_length": 2024.1875305175781,
"epoch": 0.19314285714285714,
"grad_norm": 0.0024218547623604536,
"kl": 0.00018212199211120605,
"learning_rate": 8.557485869176825e-07,
"loss": 0.0023,
"reward": 0.1839947861735709,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1314954780973494,
"rewards/cosine_scaled_reward": 0.2302683750167489,
"rewards/format_reward": 0.6250000111758709,
"step": 169
},
{
"advantage_max": 0.10904745385050774,
"advantage_mean": 1.5813081122306727e-09,
"advantage_min": -0.06331885978579521,
"advantage_std": 0.06757040356751531,
"completion_length": 2624.2500228881836,
"epoch": 0.19428571428571428,
"grad_norm": 0.0012609402183443308,
"kl": 0.00022426247596740723,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0029,
"reward": 0.044832271145423874,
"reward_advantage_correlation": 1.0,
"reward_std": 0.06757040473166853,
"rewards/cosine_scaled_reward": -0.10952581465244293,
"rewards/format_reward": 0.47916666977107525,
"step": 170
},
{
"advantage_max": 0.18128343019634485,
"advantage_mean": -7.683411469561197e-09,
"advantage_min": -0.1281019225716591,
"advantage_std": 0.12827163795009255,
"completion_length": 2249.0000381469727,
"epoch": 0.19542857142857142,
"grad_norm": 0.00255986419506371,
"kl": 0.0001919977366924286,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0088,
"reward": 0.11522350832819939,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12827164493501186,
"rewards/cosine_scaled_reward": 0.05948738753795624,
"rewards/format_reward": 0.5625000018626451,
"step": 171
},
{
"advantage_max": 0.1466981265693903,
"advantage_mean": 4.656613705744661e-10,
"advantage_min": -0.1400249758735299,
"advantage_std": 0.11561239557340741,
"completion_length": 2715.5833854675293,
"epoch": 0.19657142857142856,
"grad_norm": 0.0022185237612575293,
"kl": 0.0002675652503967285,
"learning_rate": 8.487667956935087e-07,
"loss": -0.0039,
"reward": 0.10251820925623178,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11561239883303642,
"rewards/cosine_scaled_reward": 0.08025055937469006,
"rewards/format_reward": 0.4375000074505806,
"step": 172
},
{
"advantage_max": 0.08615154423750937,
"advantage_mean": -3.531264874262474e-09,
"advantage_min": -0.08250692702131346,
"advantage_std": 0.07523247081553563,
"completion_length": 1911.5208435058594,
"epoch": 0.1977142857142857,
"grad_norm": 0.0018191589042544365,
"kl": 0.00011747702956199646,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0009,
"reward": 0.0827999617322348,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07523247634526342,
"rewards/cosine_scaled_reward": -0.08963774237781763,
"rewards/format_reward": 0.6666666716337204,
"step": 173
},
{
"advantage_max": 0.17214444186538458,
"advantage_mean": -2.793967904257677e-09,
"advantage_min": -0.12071564141660929,
"advantage_std": 0.119583026971668,
"completion_length": 1707.958381652832,
"epoch": 0.19885714285714284,
"grad_norm": 0.0018704604590311646,
"kl": 0.00013531744480133057,
"learning_rate": 8.440392717955475e-07,
"loss": -0.0021,
"reward": 0.11866510892286897,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11958303209394217,
"rewards/cosine_scaled_reward": -0.034700583666563034,
"rewards/format_reward": 0.7708333395421505,
"step": 174
},
{
"advantage_max": 0.0757163786329329,
"advantage_mean": 2.0954757928848267e-09,
"advantage_min": -0.11116283386945724,
"advantage_std": 0.0754783492302522,
"completion_length": 2308.666675567627,
"epoch": 0.2,
"grad_norm": 0.0010337267303839326,
"kl": 0.00017081201076507568,
"learning_rate": 8.416539554784089e-07,
"loss": -0.0001,
"reward": 0.11406097328290343,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.07547835109289736,
"rewards/cosine_scaled_reward": 0.09473255276679993,
"rewards/format_reward": 0.4791666716337204,
"step": 175
},
{
"advantage_max": 0.16840291186235845,
"advantage_mean": -6.131207153092788e-09,
"advantage_min": -0.19575551990419626,
"advantage_std": 0.15237143402919173,
"completion_length": 2512.3542251586914,
"epoch": 0.20114285714285715,
"grad_norm": 0.002759363502264023,
"kl": 0.00021241046488285065,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0106,
"reward": 0.16955551970750093,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.15237143402919173,
"rewards/cosine_scaled_reward": 0.19831032841466367,
"rewards/format_reward": 0.6041666753590107,
"step": 176
},
{
"advantage_max": 0.15896850870922208,
"advantage_mean": 8.537123022400728e-10,
"advantage_min": -0.16080441791564226,
"advantage_std": 0.1259885341860354,
"completion_length": 2636.500045776367,
"epoch": 0.2022857142857143,
"grad_norm": 0.0029087516013532877,
"kl": 0.00024193525314331055,
"learning_rate": 8.368407953869103e-07,
"loss": 0.0062,
"reward": 0.06026533106341958,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12598853604868054,
"rewards/cosine_scaled_reward": -0.08354691602289677,
"rewards/format_reward": 0.5208333507180214,
"step": 177
},
{
"advantage_max": 0.08082104474306107,
"advantage_mean": -2.6387474427735924e-09,
"advantage_min": -0.12136463588103652,
"advantage_std": 0.08366369269788265,
"completion_length": 2332.0000610351562,
"epoch": 0.20342857142857143,
"grad_norm": 0.001465087989345193,
"kl": 0.00019755959510803223,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0088,
"reward": 0.10637267166748643,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08366369549185038,
"rewards/cosine_scaled_reward": 0.032770249992609024,
"rewards/format_reward": 0.5625,
"step": 178
},
{
"advantage_max": 0.16390905156731606,
"advantage_mean": 4.656613011855271e-10,
"advantage_min": -0.1330079366452992,
"advantage_std": 0.1229178715730086,
"completion_length": 2670.416679382324,
"epoch": 0.20457142857142857,
"grad_norm": 0.005331814754754305,
"kl": 0.00022807717323303223,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0051,
"reward": 0.049615125404670835,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12291787634603679,
"rewards/cosine_scaled_reward": -0.0732691722587333,
"rewards/format_reward": 0.43750000558793545,
"step": 179
},
{
"advantage_max": 0.06850773748010397,
"advantage_mean": -9.778887456735053e-09,
"advantage_min": -0.0560016599483788,
"advantage_std": 0.05324950837530196,
"completion_length": 2127.2500343322754,
"epoch": 0.2057142857142857,
"grad_norm": 0.0010600673267617822,
"kl": 0.00023946166038513184,
"learning_rate": 8.295165011252396e-07,
"loss": -0.0008,
"reward": 0.1334767653606832,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.05324950924841687,
"rewards/cosine_scaled_reward": 0.09442592412233353,
"rewards/format_reward": 0.6041666716337204,
"step": 180
},
{
"advantage_max": 0.177369711920619,
"advantage_mean": -2.949188115941581e-09,
"advantage_min": -0.14135410264134407,
"advantage_std": 0.14470088807865977,
"completion_length": 2986.770866394043,
"epoch": 0.20685714285714285,
"grad_norm": 0.002808406949043274,
"kl": 0.0002924799919128418,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0055,
"reward": 0.08883980172686279,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.14470090251415968,
"rewards/cosine_scaled_reward": 0.044522762298583984,
"rewards/format_reward": 0.4375000037252903,
"step": 181
},
{
"advantage_max": 0.16322546359151602,
"advantage_mean": -4.190951738425319e-09,
"advantage_min": -0.10783215472474694,
"advantage_std": 0.10909037687815726,
"completion_length": 1764.2916946411133,
"epoch": 0.208,
"grad_norm": 0.0014723712811246514,
"kl": 0.00010727345943450928,
"learning_rate": 8.245653237555705e-07,
"loss": -0.0045,
"reward": 0.09074319200590253,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10909038223326206,
"rewards/cosine_scaled_reward": -0.10806664638221264,
"rewards/format_reward": 0.75,
"step": 182
},
{
"advantage_max": 0.1708065690472722,
"advantage_mean": -7.605801183308003e-09,
"advantage_min": -0.15987203177064657,
"advantage_std": 0.13489929027855396,
"completion_length": 1637.4792022705078,
"epoch": 0.20914285714285713,
"grad_norm": 0.0019362150924280286,
"kl": 0.0001348257064819336,
"learning_rate": 8.220696016880687e-07,
"loss": -0.0,
"reward": 0.17411585431545973,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13489929679781199,
"rewards/cosine_scaled_reward": 0.08437284221872687,
"rewards/format_reward": 0.8541666716337204,
"step": 183
},
{
"advantage_max": 0.11368166282773018,
"advantage_mean": 2.910383149756779e-09,
"advantage_min": -0.06791064376011491,
"advantage_std": 0.07019675150513649,
"completion_length": 2715.479200363159,
"epoch": 0.2102857142857143,
"grad_norm": 0.0016745430184528232,
"kl": 0.00021871179342269897,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0007,
"reward": 0.022002333775162697,
"reward_advantage_correlation": 1.0,
"reward_std": 0.070196753134951,
"rewards/cosine_scaled_reward": -0.13184033427387476,
"rewards/format_reward": 0.39583333395421505,
"step": 184
},
{
"advantage_max": 0.12346031097695231,
"advantage_mean": -3.2790315879216436e-09,
"advantage_min": -0.11551401333417743,
"advantage_std": 0.1026953593827784,
"completion_length": 2431.7083625793457,
"epoch": 0.21142857142857144,
"grad_norm": 0.00237295706756413,
"kl": 0.00017070770263671875,
"learning_rate": 8.170384989716657e-07,
"loss": 0.0061,
"reward": 0.07951303326990455,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10269536171108484,
"rewards/cosine_scaled_reward": -0.01576326903887093,
"rewards/format_reward": 0.5000000037252903,
"step": 185
},
{
"advantage_max": 0.11272265436127782,
"advantage_mean": 4.6566127342995145e-10,
"advantage_min": -0.07673908583819866,
"advantage_std": 0.07678107637912035,
"completion_length": 2727.812515258789,
"epoch": 0.21257142857142858,
"grad_norm": 0.0013483620714396238,
"kl": 0.00023304671049118042,
"learning_rate": 8.145033635316128e-07,
"loss": -0.0035,
"reward": 0.05358618497848511,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.07678107777610421,
"rewards/cosine_scaled_reward": -0.02927381545305252,
"rewards/format_reward": 0.375,
"step": 186
},
{
"advantage_max": 0.1563552524894476,
"advantage_mean": -2.716357583310014e-09,
"advantage_min": -0.13328236620873213,
"advantage_std": 0.12376850796863437,
"completion_length": 2533.9166870117188,
"epoch": 0.21371428571428572,
"grad_norm": 0.0021735529880970716,
"kl": 0.0002467595040798187,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0065,
"reward": 0.04993397952057421,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12376851122826338,
"rewards/cosine_scaled_reward": -0.12346055079251528,
"rewards/format_reward": 0.5416666734963655,
"step": 187
},
{
"advantage_max": 0.08845503395423293,
"advantage_mean": 3.8805124391583234e-10,
"advantage_min": -0.1204057689756155,
"advantage_std": 0.08113091951236129,
"completion_length": 3432.812530517578,
"epoch": 0.21485714285714286,
"grad_norm": 0.0014322166098281741,
"kl": 0.00029206275939941406,
"learning_rate": 8.093945422764069e-07,
"loss": 0.0022,
"reward": 0.045054638059809804,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.08113092044368386,
"rewards/cosine_scaled_reward": -0.022663846611976624,
"rewards/format_reward": 0.3125000074505806,
"step": 188
},
{
"advantage_max": 0.17914783209562302,
"advantage_mean": -3.5700699896334953e-09,
"advantage_min": -0.11173915676772594,
"advantage_std": 0.11490088887512684,
"completion_length": 1846.0000114440918,
"epoch": 0.216,
"grad_norm": 0.002113899914547801,
"kl": 0.0001726001501083374,
"learning_rate": 8.068211054579943e-07,
"loss": 0.0092,
"reward": 0.11756939408951439,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11490088701248169,
"rewards/cosine_scaled_reward": -0.01818160153925419,
"rewards/format_reward": 0.7291666734963655,
"step": 189
},
{
"advantage_max": 0.1964530674740672,
"advantage_mean": -1.5522044760629683e-10,
"advantage_min": -0.12849188223481178,
"advantage_std": 0.1331510180607438,
"completion_length": 2651.8750381469727,
"epoch": 0.21714285714285714,
"grad_norm": 0.002494914224371314,
"kl": 0.00020803511142730713,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0084,
"reward": 0.06791404378600419,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.13315102504566312,
"rewards/cosine_scaled_reward": 0.0009567160159349442,
"rewards/format_reward": 0.39583333767950535,
"step": 190
},
{
"advantage_max": 0.0955268326215446,
"advantage_mean": -6.208817349140361e-10,
"advantage_min": -0.14831526763737202,
"advantage_std": 0.10267449170351028,
"completion_length": 2117.208351135254,
"epoch": 0.21828571428571428,
"grad_norm": 0.00150469527579844,
"kl": 0.00021576881408691406,
"learning_rate": 8.01636806561836e-07,
"loss": -0.0023,
"reward": 0.15802897419780493,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10267449403181672,
"rewards/cosine_scaled_reward": 0.15254965890198946,
"rewards/format_reward": 0.625,
"step": 191
},
{
"advantage_max": 0.12947352742776275,
"advantage_mean": -1.785035036450111e-09,
"advantage_min": -0.1430590646341443,
"advantage_std": 0.11898831464350224,
"completion_length": 3050.4584045410156,
"epoch": 0.21942857142857142,
"grad_norm": 0.0025042772758752108,
"kl": 0.0002841353416442871,
"learning_rate": 7.990261971595048e-07,
"loss": 0.01,
"reward": 0.03274457482621074,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11898831464350224,
"rewards/cosine_scaled_reward": -0.11102783679962158,
"rewards/format_reward": 0.4166666828095913,
"step": 192
},
{
"advantage_max": 0.18321886658668518,
"advantage_mean": -5.587935447692871e-09,
"advantage_min": -0.21362949814647436,
"advantage_std": 0.16429298697039485,
"completion_length": 2551.6042098999023,
"epoch": 0.22057142857142858,
"grad_norm": 0.0033616998698562384,
"kl": 0.00018714368343353271,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0094,
"reward": 0.10076185502111912,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16429299302399158,
"rewards/cosine_scaled_reward": 0.03686658607330173,
"rewards/format_reward": 0.5208333432674408,
"step": 193
},
{
"advantage_max": 0.1511156321503222,
"advantage_mean": -1.3814618921026423e-08,
"advantage_min": -0.16178389079868793,
"advantage_std": 0.1266572391614318,
"completion_length": 2812.541717529297,
"epoch": 0.22171428571428572,
"grad_norm": 0.002837719861418009,
"kl": 0.00023984909057617188,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0145,
"reward": 0.2117105281795375,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12665724474936724,
"rewards/cosine_scaled_reward": 0.3458312964066863,
"rewards/format_reward": 0.5625000055879354,
"step": 194
},
{
"advantage_max": 0.2311068344861269,
"advantage_mean": -1.2417635669725868e-09,
"advantage_min": -0.18620420899242163,
"advantage_std": 0.17218404030427337,
"completion_length": 2531.1666870117188,
"epoch": 0.22285714285714286,
"grad_norm": 0.002786431461572647,
"kl": 0.0001908913254737854,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0001,
"reward": 0.042947592213749886,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17218404030427337,
"rewards/cosine_scaled_reward": -0.10107360588153824,
"rewards/format_reward": 0.4583333469927311,
"step": 195
},
{
"advantage_max": 0.14308508206158876,
"advantage_mean": 1.552204059729334e-10,
"advantage_min": -0.12311006104573607,
"advantage_std": 0.10863854410126805,
"completion_length": 3364.9375610351562,
"epoch": 0.224,
"grad_norm": 0.002163918921723962,
"kl": 0.00028574466705322266,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0002,
"reward": 0.03790745767764747,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10863854829221964,
"rewards/cosine_scaled_reward": -0.09509009215980768,
"rewards/format_reward": 0.41666667349636555,
"step": 196
},
{
"advantage_max": 0.2604234963655472,
"advantage_mean": -1.1331091689936734e-08,
"advantage_min": -0.21268348023295403,
"advantage_std": 0.20125100389122963,
"completion_length": 2484.062568664551,
"epoch": 0.22514285714285714,
"grad_norm": 0.004311454016715288,
"kl": 0.00030350685119628906,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0081,
"reward": 0.21867160964757204,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.2012510122731328,
"rewards/cosine_scaled_reward": 0.3315436402335763,
"rewards/format_reward": 0.6250000055879354,
"step": 197
},
{
"advantage_max": 0.1826078612357378,
"advantage_mean": -7.916242161787324e-09,
"advantage_min": -0.18366167414933443,
"advantage_std": 0.15232555009424686,
"completion_length": 2441.958381652832,
"epoch": 0.22628571428571428,
"grad_norm": 0.00223523355089128,
"kl": 0.00019724667072296143,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0078,
"reward": 0.13793383864685893,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15232555102556944,
"rewards/cosine_scaled_reward": 0.12416904792189598,
"rewards/format_reward": 0.5625000037252903,
"step": 198
},
{
"advantage_max": 0.17881701048463583,
"advantage_mean": 1.5522043234073024e-09,
"advantage_min": -0.14401802979409695,
"advantage_std": 0.12912732851691544,
"completion_length": 3530.9166870117188,
"epoch": 0.22742857142857142,
"grad_norm": 0.0025338924024254084,
"kl": 0.00028127431869506836,
"learning_rate": 7.804192891917571e-07,
"loss": 0.002,
"reward": -0.015164745040237904,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12912732968106866,
"rewards/cosine_scaled_reward": -0.1283108638599515,
"rewards/format_reward": 0.16666666977107525,
"step": 199
},
{
"advantage_max": 0.15909895114600658,
"advantage_mean": -5.355104984450243e-09,
"advantage_min": -0.22837194707244635,
"advantage_std": 0.15889625437557697,
"completion_length": 1886.9375457763672,
"epoch": 0.22857142857142856,
"grad_norm": 0.002250520745292306,
"kl": 0.00016075372695922852,
"learning_rate": 7.777151938545235e-07,
"loss": -0.003,
"reward": 0.1841872469522059,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15889626182615757,
"rewards/cosine_scaled_reward": 0.13623794727027416,
"rewards/format_reward": 0.812500013038516,
"step": 200
},
{
"advantage_max": 0.17722219973802567,
"advantage_mean": 3.1044086745701804e-10,
"advantage_min": -0.1915279608219862,
"advantage_std": 0.13609004858881235,
"completion_length": 2347.166732788086,
"epoch": 0.2297142857142857,
"grad_norm": 0.002754747634753585,
"kl": 0.00020887888967990875,
"learning_rate": 7.75e-07,
"loss": 0.0038,
"reward": 0.28291497589088976,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13609005440957844,
"rewards/cosine_scaled_reward": 0.4535187867586501,
"rewards/format_reward": 0.770833345130086,
"step": 201
},
{
"advantage_max": 0.08778555504977703,
"advantage_mean": -4.346172172153828e-09,
"advantage_min": -0.08903190679848194,
"advantage_std": 0.06955086882226169,
"completion_length": 2056.6875228881836,
"epoch": 0.23085714285714284,
"grad_norm": 0.0014487183652818203,
"kl": 0.00016849488019943237,
"learning_rate": 7.72273839962904e-07,
"loss": 0.005,
"reward": 0.20257333759218454,
"reward_advantage_correlation": 1.0,
"reward_std": 0.06955087138339877,
"rewards/cosine_scaled_reward": 0.3097168318927288,
"rewards/format_reward": 0.5833333358168602,
"step": 202
},
{
"advantage_max": 0.13725088443607092,
"advantage_mean": 5.781961086998022e-09,
"advantage_min": -0.1490999348461628,
"advantage_std": 0.11772086331620812,
"completion_length": 3153.8333740234375,
"epoch": 0.232,
"grad_norm": 0.0027381619438529015,
"kl": 0.00029969215393066406,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0089,
"reward": 0.05879632290452719,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11772086471319199,
"rewards/cosine_scaled_reward": 0.016048375517129898,
"rewards/format_reward": 0.3125000037252903,
"step": 203
},
{
"advantage_max": 0.1252095801755786,
"advantage_mean": -3.2596291082986895e-09,
"advantage_min": -0.11681158654391766,
"advantage_std": 0.09182075597345829,
"completion_length": 1734.6042251586914,
"epoch": 0.23314285714285715,
"grad_norm": 0.001834864029660821,
"kl": 0.00022155791521072388,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0018,
"reward": 0.18190127734851558,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09182075783610344,
"rewards/cosine_scaled_reward": 0.15297963470220566,
"rewards/format_reward": 0.7708333395421505,
"step": 204
},
{
"advantage_max": 0.1340260272845626,
"advantage_mean": -6.984919392882816e-09,
"advantage_min": -0.30524480529129505,
"advantage_std": 0.1720650801435113,
"completion_length": 2263.625015258789,
"epoch": 0.2342857142857143,
"grad_norm": 0.0025743981823325157,
"kl": 0.00026476383209228516,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0058,
"reward": 0.25796742155216634,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1720650834031403,
"rewards/cosine_scaled_reward": 0.3982603717595339,
"rewards/format_reward": 0.7291666865348816,
"step": 205
},
{
"advantage_max": 0.13997728214599192,
"advantage_mean": -1.5522043372850902e-09,
"advantage_min": -0.1517253816127777,
"advantage_std": 0.11635040352120996,
"completion_length": 2658.7291870117188,
"epoch": 0.23542857142857143,
"grad_norm": 0.0019164991099387407,
"kl": 0.0002377629280090332,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0035,
"reward": 0.050769580993801355,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1163504053838551,
"rewards/cosine_scaled_reward": -0.07947664987295866,
"rewards/format_reward": 0.45833334140479565,
"step": 206
},
{
"advantage_max": 0.11476221471093595,
"advantage_mean": -6.519258070880607e-09,
"advantage_min": -0.132195595651865,
"advantage_std": 0.0947426650673151,
"completion_length": 2846.520866394043,
"epoch": 0.23657142857142857,
"grad_norm": 0.0019112242152914405,
"kl": 0.00027942657470703125,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0027,
"reward": 0.10473369807004929,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09474266786128283,
"rewards/cosine_scaled_reward": 0.028989043086767197,
"rewards/format_reward": 0.5625000149011612,
"step": 207
},
{
"advantage_max": 0.12081033829599619,
"advantage_mean": -2.716357666576741e-09,
"advantage_min": -0.1374441795051098,
"advantage_std": 0.11835672007873654,
"completion_length": 2765.4791946411133,
"epoch": 0.2377142857142857,
"grad_norm": 0.0027541995514184237,
"kl": 0.00021903729066252708,
"learning_rate": 7.556940671764124e-07,
"loss": 0.0065,
"reward": 0.08172197639942169,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11835672124288976,
"rewards/cosine_scaled_reward": 0.04407403990626335,
"rewards/format_reward": 0.3958333358168602,
"step": 208
},
{
"advantage_max": 0.1953953867778182,
"advantage_mean": -2.8715779060162205e-09,
"advantage_min": -0.1776261981576681,
"advantage_std": 0.1543159680441022,
"completion_length": 2137.229221343994,
"epoch": 0.23885714285714285,
"grad_norm": 0.002014850964769721,
"kl": 0.0002369508147239685,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0048,
"reward": 0.121353481663391,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15431597619317472,
"rewards/cosine_scaled_reward": 0.05646992567926645,
"rewards/format_reward": 0.6041666753590107,
"step": 209
},
{
"advantage_max": 0.12167689856141806,
"advantage_mean": -4.113341334210929e-09,
"advantage_min": -0.1051677679643035,
"advantage_std": 0.09225608897395432,
"completion_length": 2772.7708740234375,
"epoch": 0.24,
"grad_norm": 0.00219483720138669,
"kl": 0.00022499263286590576,
"learning_rate": 7.500858306332172e-07,
"loss": 0.001,
"reward": 0.11764410836622119,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.0922560899052769,
"rewards/cosine_scaled_reward": 0.0887430626899004,
"rewards/format_reward": 0.5208333358168602,
"step": 210
},
{
"advantage_max": 0.11049975454807281,
"advantage_mean": 1.862645232497684e-09,
"advantage_min": -0.11649919580668211,
"advantage_std": 0.09677038621157408,
"completion_length": 2156.375045776367,
"epoch": 0.24114285714285713,
"grad_norm": 0.002211064798757434,
"kl": 0.0002203192561864853,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0023,
"reward": 0.11667975131422281,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09677039366215467,
"rewards/cosine_scaled_reward": 0.045283637940883636,
"rewards/format_reward": 0.6041666753590107,
"step": 211
},
{
"advantage_max": 0.17078783456236124,
"advantage_mean": -5.2774949826916995e-09,
"advantage_min": -0.13496317621320486,
"advantage_std": 0.12754983035847545,
"completion_length": 1858.0625381469727,
"epoch": 0.2422857142857143,
"grad_norm": 0.0025052272249013186,
"kl": 0.0001669749617576599,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0025,
"reward": 0.1585842336062342,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12754983338527381,
"rewards/cosine_scaled_reward": 0.13444282207638025,
"rewards/format_reward": 0.6666666697710752,
"step": 212
},
{
"advantage_max": 0.14249009639024734,
"advantage_mean": 2.173086086076914e-09,
"advantage_min": -0.15096384286880493,
"advantage_std": 0.12109754607081413,
"completion_length": 2033.2083625793457,
"epoch": 0.24342857142857144,
"grad_norm": 0.00203386670909822,
"kl": 0.000254213809967041,
"learning_rate": 7.416006812042827e-07,
"loss": -0.0005,
"reward": 0.16817454434931278,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12109755538403988,
"rewards/cosine_scaled_reward": 0.15286272019147873,
"rewards/format_reward": 0.6875,
"step": 213
},
{
"advantage_max": 0.09318645251914859,
"advantage_mean": -3.647680116292129e-09,
"advantage_min": -0.11485875491052866,
"advantage_std": 0.08119010645896196,
"completion_length": 2441.00004196167,
"epoch": 0.24457142857142858,
"grad_norm": 0.0015373064670711756,
"kl": 0.0002943165600299835,
"learning_rate": 7.387534371007797e-07,
"loss": -0.0005,
"reward": 0.07720327051356435,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08119010925292969,
"rewards/cosine_scaled_reward": -0.05541713163256645,
"rewards/format_reward": 0.5625000074505806,
"step": 214
},
{
"advantage_max": 0.18169015739113092,
"advantage_mean": -1.7850349531833842e-09,
"advantage_min": -0.10681417491286993,
"advantage_std": 0.11135896015912294,
"completion_length": 1973.0417098999023,
"epoch": 0.24571428571428572,
"grad_norm": 0.0019926519598811865,
"kl": 0.0001901760697364807,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0096,
"reward": 0.0591527302749455,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.11135896574705839,
"rewards/cosine_scaled_reward": -0.16166850773151964,
"rewards/format_reward": 0.6666666679084301,
"step": 215
},
{
"advantage_max": 0.13990403385832906,
"advantage_mean": -1.0089328178475299e-08,
"advantage_min": -0.1367074535228312,
"advantage_std": 0.1053521609865129,
"completion_length": 1759.458396911621,
"epoch": 0.24685714285714286,
"grad_norm": 0.002091531176120043,
"kl": 0.00021722912788391113,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0054,
"reward": 0.17575624957680702,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10535216005519032,
"rewards/cosine_scaled_reward": 0.15388290956616402,
"rewards/format_reward": 0.7291666772216558,
"step": 216
},
{
"advantage_max": 0.23972546868026257,
"advantage_mean": -2.793967973646616e-09,
"advantage_min": -0.19562211446464062,
"advantage_std": 0.1821348867379129,
"completion_length": 2473.8542098999023,
"epoch": 0.248,
"grad_norm": 0.0031409154180437326,
"kl": 0.00028133392333984375,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0158,
"reward": 0.16255547618493438,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1821348937228322,
"rewards/cosine_scaled_reward": 0.1796769928187132,
"rewards/format_reward": 0.6041666716337204,
"step": 217
},
{
"advantage_max": 0.17459031008183956,
"advantage_mean": -2.3283064087831207e-09,
"advantage_min": -0.1884671887382865,
"advantage_std": 0.14956898847594857,
"completion_length": 2652.125030517578,
"epoch": 0.24914285714285714,
"grad_norm": 0.0028272378258407116,
"kl": 0.00023734569549560547,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0063,
"reward": 0.1610828833654523,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14956899639219046,
"rewards/cosine_scaled_reward": 0.1954907262697816,
"rewards/format_reward": 0.562500013038516,
"step": 218
},
{
"advantage_max": 0.19564510649070144,
"advantage_mean": -3.104408646814605e-09,
"advantage_min": -0.17972843209281564,
"advantage_std": 0.15118937706574798,
"completion_length": 2349.2917098999023,
"epoch": 0.2502857142857143,
"grad_norm": 0.0022383173927664757,
"kl": 0.00031820498406887054,
"learning_rate": 7.243820139034464e-07,
"loss": 0.0066,
"reward": 0.10900084767490625,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15118937892839313,
"rewards/cosine_scaled_reward": 0.06142610125243664,
"rewards/format_reward": 0.5208333432674408,
"step": 219
},
{
"advantage_max": 0.09967101691290736,
"advantage_mean": 6.596868329378225e-10,
"advantage_min": -0.09928332921117544,
"advantage_std": 0.08199074282310903,
"completion_length": 2653.062511444092,
"epoch": 0.25142857142857145,
"grad_norm": 0.001260359538719058,
"kl": 0.00023132562637329102,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0026,
"reward": 0.004753962974064052,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.08199074515141547,
"rewards/cosine_scaled_reward": -0.1849276451393962,
"rewards/format_reward": 0.3958333395421505,
"step": 220
},
{
"advantage_max": 0.10617540590465069,
"advantage_mean": -1.0865430360995632e-09,
"advantage_min": -0.1566908685490489,
"advantage_std": 0.10643785918364301,
"completion_length": 1836.958366394043,
"epoch": 0.25257142857142856,
"grad_norm": 0.0013525058748200536,
"kl": 0.00019219331443309784,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0024,
"reward": 0.14161380444420502,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10643785947468132,
"rewards/cosine_scaled_reward": 0.0743873082101345,
"rewards/format_reward": 0.6875,
"step": 221
},
{
"advantage_max": 0.10208693100139499,
"advantage_mean": -1.2572854896086838e-08,
"advantage_min": -0.14001461677253246,
"advantage_std": 0.1002459516748786,
"completion_length": 2023.0625381469727,
"epoch": 0.2537142857142857,
"grad_norm": 0.001963146962225437,
"kl": 0.0002463310956954956,
"learning_rate": 7.156560487081051e-07,
"loss": -0.0009,
"reward": 0.20748503901995718,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10024595074355602,
"rewards/cosine_scaled_reward": 0.23565726913511753,
"rewards/format_reward": 0.7500000037252903,
"step": 222
},
{
"advantage_max": 0.11967136012390256,
"advantage_mean": 2.0954757928848267e-09,
"advantage_min": -0.1763377906754613,
"advantage_std": 0.12263105483725667,
"completion_length": 1949.6250534057617,
"epoch": 0.25485714285714284,
"grad_norm": 0.0019644785206764936,
"kl": 0.00023895502090454102,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0067,
"reward": 0.12921895319595933,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12263105623424053,
"rewards/cosine_scaled_reward": 0.036362769082188606,
"rewards/format_reward": 0.6875000074505806,
"step": 223
},
{
"advantage_max": 0.22225173842161894,
"advantage_mean": -4.967053990334591e-09,
"advantage_min": -0.20200780779123306,
"advantage_std": 0.1708352784626186,
"completion_length": 2851.104217529297,
"epoch": 0.256,
"grad_norm": 0.003322938922792673,
"kl": 0.00026100873947143555,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0085,
"reward": 0.10233315639197826,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17083528079092503,
"rewards/cosine_scaled_reward": 0.020202322863042355,
"rewards/format_reward": 0.5625000093132257,
"step": 224
},
{
"advantage_max": 0.13286243984475732,
"advantage_mean": -4.811833334561477e-09,
"advantage_min": -0.19514462095685303,
"advantage_std": 0.1269650950562209,
"completion_length": 2449.312515258789,
"epoch": 0.2571428571428571,
"grad_norm": 0.002462556352838874,
"kl": 0.00022039934992790222,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0073,
"reward": 0.07976344670169055,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12696509901434183,
"rewards/cosine_scaled_reward": -0.04560824343934655,
"rewards/format_reward": 0.5625000149011612,
"step": 225
},
{
"advantage_max": 0.14535690797492862,
"advantage_mean": -8.692344163896415e-09,
"advantage_min": -0.17608788143843412,
"advantage_std": 0.1196846547536552,
"completion_length": 2130.791690826416,
"epoch": 0.2582857142857143,
"grad_norm": 0.0015196395106613636,
"kl": 0.0002590194344520569,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0011,
"reward": 0.14173566875979304,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11968465615063906,
"rewards/cosine_scaled_reward": 0.10786362644284964,
"rewards/format_reward": 0.6250000055879354,
"step": 226
},
{
"advantage_max": 0.18973981589078903,
"advantage_mean": -4.579002760296547e-09,
"advantage_min": -0.2287753401324153,
"advantage_std": 0.17609892785549164,
"completion_length": 1823.8125228881836,
"epoch": 0.25942857142857145,
"grad_norm": 0.0025245463475584984,
"kl": 0.00020713824778795242,
"learning_rate": 7.009532063876148e-07,
"loss": 0.006,
"reward": 0.15795880928635597,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17609893204644322,
"rewards/cosine_scaled_reward": 0.10740425251424313,
"rewards/format_reward": 0.7083333432674408,
"step": 227
},
{
"advantage_max": 0.08450271608307958,
"advantage_mean": -9.041590240399522e-09,
"advantage_min": -0.07050300342962146,
"advantage_std": 0.06420902267564088,
"completion_length": 2023.0833549499512,
"epoch": 0.26057142857142856,
"grad_norm": 0.0015921753365546465,
"kl": 0.00022837892174720764,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0008,
"reward": 0.16322663193568587,
"reward_advantage_correlation": 1.0,
"reward_std": 0.06420902314130217,
"rewards/cosine_scaled_reward": 0.18748834542930126,
"rewards/format_reward": 0.583333333954215,
"step": 228
},
{
"advantage_max": 0.13741022581234574,
"advantage_mean": -3.1044086745701804e-10,
"advantage_min": -0.12077388912439346,
"advantage_std": 0.11520560039207339,
"completion_length": 3129.5000228881836,
"epoch": 0.26171428571428573,
"grad_norm": 0.0022859734017401934,
"kl": 0.0004267692565917969,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0073,
"reward": 0.05897674150764942,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1152056036517024,
"rewards/cosine_scaled_reward": 0.060396708548069,
"rewards/format_reward": 0.2291666679084301,
"step": 229
},
{
"advantage_max": 0.15972233191132545,
"advantage_mean": -2.483526828633842e-09,
"advantage_min": -0.14318527560681105,
"advantage_std": 0.11432101391255856,
"completion_length": 2700.354217529297,
"epoch": 0.26285714285714284,
"grad_norm": 0.00235546356998384,
"kl": 0.00027105212211608887,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0114,
"reward": 0.04795671720057726,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11432101810351014,
"rewards/cosine_scaled_reward": -0.06557212490588427,
"rewards/format_reward": 0.41666667349636555,
"step": 230
},
{
"advantage_max": 0.18535670265555382,
"advantage_mean": -4.81183344558378e-09,
"advantage_min": -0.16397515125572681,
"advantage_std": 0.14254886470735073,
"completion_length": 2433.5416946411133,
"epoch": 0.264,
"grad_norm": 0.0023599222768098116,
"kl": 0.00023667514324188232,
"learning_rate": 6.890576474687263e-07,
"loss": 0.005,
"reward": 0.08141399221494794,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14254886843264103,
"rewards/cosine_scaled_reward": -0.03232991881668568,
"rewards/format_reward": 0.5416666679084301,
"step": 231
},
{
"advantage_max": 0.19113029213622212,
"advantage_mean": -6.596868273867074e-09,
"advantage_min": -0.16376679064705968,
"advantage_std": 0.1487897140905261,
"completion_length": 2886.979217529297,
"epoch": 0.2651428571428571,
"grad_norm": 0.0029496531933546066,
"kl": 0.0004017353057861328,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0057,
"reward": 0.09052334149600938,
"reward_advantage_correlation": 0.9999999999999997,
"reward_std": 0.14878972386941314,
"rewards/cosine_scaled_reward": -0.04579928144812584,
"rewards/format_reward": 0.6250000093132257,
"step": 232
},
{
"advantage_max": 0.15348306251689792,
"advantage_mean": -4.0357312769412346e-09,
"advantage_min": -0.18462875578552485,
"advantage_std": 0.14140585623681545,
"completion_length": 1905.6042022705078,
"epoch": 0.2662857142857143,
"grad_norm": 0.001814075163565576,
"kl": 0.00020284950733184814,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0081,
"reward": 0.14864619029685855,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1414058580994606,
"rewards/cosine_scaled_reward": 0.10695588774979115,
"rewards/format_reward": 0.666666679084301,
"step": 233
},
{
"advantage_max": 0.10880297655239701,
"advantage_mean": 5.432714833553121e-10,
"advantage_min": -0.13644719682633877,
"advantage_std": 0.11064268089830875,
"completion_length": 2632.7916870117188,
"epoch": 0.2674285714285714,
"grad_norm": 0.002043582499027252,
"kl": 0.00025378167629241943,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0032,
"reward": 0.02333975490182638,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11064268462359905,
"rewards/cosine_scaled_reward": -0.14015551283955574,
"rewards/format_reward": 0.4166666716337204,
"step": 234
},
{
"advantage_max": 0.13019301602616906,
"advantage_mean": 6.984919448393967e-10,
"advantage_min": -0.10806654393672943,
"advantage_std": 0.09198249317705631,
"completion_length": 2275.854179382324,
"epoch": 0.26857142857142857,
"grad_norm": 0.0018223219085484743,
"kl": 0.00024427380412817,
"learning_rate": 6.770536555792944e-07,
"loss": -0.0012,
"reward": 0.0993248739396222,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09198249503970146,
"rewards/cosine_scaled_reward": 0.024362626485526562,
"rewards/format_reward": 0.5416666679084301,
"step": 235
},
{
"advantage_max": 0.16375997196882963,
"advantage_mean": -2.328306325516394e-09,
"advantage_min": -0.2442130297422409,
"advantage_std": 0.1719423239119351,
"completion_length": 2600.6667098999023,
"epoch": 0.26971428571428574,
"grad_norm": 0.002591744065284729,
"kl": 0.00027217157185077667,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0014,
"reward": 0.14142214879393578,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17194233275949955,
"rewards/cosine_scaled_reward": 0.13721440639346838,
"rewards/format_reward": 0.5625000149011612,
"step": 236
},
{
"advantage_max": 0.17325403855647892,
"advantage_mean": 2.0372681319713593e-09,
"advantage_min": -0.1193100816453807,
"advantage_std": 0.11586322111543268,
"completion_length": 2441.2917137145996,
"epoch": 0.27085714285714285,
"grad_norm": 0.0020941535476595163,
"kl": 0.00025866925716400146,
"learning_rate": 6.710139192768694e-07,
"loss": 0.0041,
"reward": 0.08818818477448076,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1158632239094004,
"rewards/cosine_scaled_reward": 0.03216279484331608,
"rewards/format_reward": 0.4583333358168602,
"step": 237
},
{
"advantage_max": 0.18302309326827526,
"advantage_mean": -1.071020997583938e-08,
"advantage_min": -0.2006698572076857,
"advantage_std": 0.14664061879739165,
"completion_length": 2554.229202270508,
"epoch": 0.272,
"grad_norm": 0.002649980830028653,
"kl": 0.00035972893238067627,
"learning_rate": 6.679851303883891e-07,
"loss": 0.0081,
"reward": 0.1962011584546417,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14664062252268195,
"rewards/cosine_scaled_reward": 0.2361844995757565,
"rewards/format_reward": 0.6875000111758709,
"step": 238
},
{
"advantage_max": 0.0755655961111188,
"advantage_mean": -2.7163575277988627e-09,
"advantage_min": -0.14539830526337028,
"advantage_std": 0.09070903505198658,
"completion_length": 1671.4167175292969,
"epoch": 0.27314285714285713,
"grad_norm": 0.0017475574277341366,
"kl": 0.00013406574726104736,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0024,
"reward": 0.26976251835003495,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09070903807878494,
"rewards/cosine_scaled_reward": 0.37741944566369057,
"rewards/format_reward": 0.8333333358168602,
"step": 239
},
{
"advantage_max": 0.11013911385089159,
"advantage_mean": -3.8805109126016646e-10,
"advantage_min": -0.11023932602256536,
"advantage_std": 0.09071210492402315,
"completion_length": 3068.5208435058594,
"epoch": 0.2742857142857143,
"grad_norm": 0.002247036434710026,
"kl": 0.0004382133483886719,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0039,
"reward": -0.00012199021875858307,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09071210911497474,
"rewards/cosine_scaled_reward": -0.1677103042602539,
"rewards/format_reward": 0.3333333358168602,
"step": 240
},
{
"advantage_max": 0.12328928150236607,
"advantage_mean": 1.6298145749660264e-09,
"advantage_min": -0.13954242039471865,
"advantage_std": 0.11165554029867053,
"completion_length": 2856.5208587646484,
"epoch": 0.2754285714285714,
"grad_norm": 0.002248368225991726,
"kl": 0.00041091442108154297,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0093,
"reward": 0.020226968685165048,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11165554216131568,
"rewards/cosine_scaled_reward": -0.15902700275182724,
"rewards/format_reward": 0.4375000074505806,
"step": 241
},
{
"advantage_max": 0.11202648957259953,
"advantage_mean": -4.6178079728120824e-09,
"advantage_min": -0.17740579205565155,
"advantage_std": 0.10806333494838327,
"completion_length": 1915.7708587646484,
"epoch": 0.2765714285714286,
"grad_norm": 0.0017519504763185978,
"kl": 0.00040813907980918884,
"learning_rate": 6.558139508961654e-07,
"loss": 0.0026,
"reward": 0.09660546365194023,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.108063337742351,
"rewards/cosine_scaled_reward": -0.04916233662515879,
"rewards/format_reward": 0.666666679084301,
"step": 242
},
{
"advantage_max": 0.2148361522704363,
"advantage_mean": -1.2417634698280722e-09,
"advantage_min": -0.1866682404652238,
"advantage_std": 0.15860576275736094,
"completion_length": 2627.2708587646484,
"epoch": 0.2777142857142857,
"grad_norm": 0.0025715562514960766,
"kl": 0.00027126073837280273,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0012,
"reward": 0.15241722203791142,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15860575903207064,
"rewards/cosine_scaled_reward": 0.1767411855980754,
"rewards/format_reward": 0.5416666753590107,
"step": 243
},
{
"advantage_max": 0.14424416236579418,
"advantage_mean": -1.3969838619232178e-09,
"advantage_min": -0.08643147628754377,
"advantage_std": 0.09387495345436037,
"completion_length": 2834.2916984558105,
"epoch": 0.27885714285714286,
"grad_norm": 0.0018725207773968577,
"kl": 0.0003355741500854492,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0001,
"reward": 0.18463631451595575,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09387495927512646,
"rewards/cosine_scaled_reward": 0.26374871004372835,
"rewards/format_reward": 0.5625000018626451,
"step": 244
},
{
"advantage_max": 0.1822696654126048,
"advantage_mean": -7.605800961263398e-09,
"advantage_min": -0.2544402740895748,
"advantage_std": 0.17538686329498887,
"completion_length": 2421.520866394043,
"epoch": 0.28,
"grad_norm": 0.003280170261859894,
"kl": 0.0003395378589630127,
"learning_rate": 6.466308972251785e-07,
"loss": 0.0128,
"reward": 0.17269339971244335,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1753868730738759,
"rewards/cosine_scaled_reward": 0.18853969313204288,
"rewards/format_reward": 0.6458333507180214,
"step": 245
},
{
"advantage_max": 0.1803152672946453,
"advantage_mean": 1.0865430083439875e-09,
"advantage_min": -0.19117824081331491,
"advantage_std": 0.1531025180593133,
"completion_length": 2599.7084045410156,
"epoch": 0.28114285714285714,
"grad_norm": 0.0027648189570754766,
"kl": 0.0003256797790527344,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0135,
"reward": 0.17906012770254165,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15310251899063587,
"rewards/cosine_scaled_reward": 0.19450474623590708,
"rewards/format_reward": 0.6666666753590107,
"step": 246
},
{
"advantage_max": 0.09243609569966793,
"advantage_mean": -1.0865430222217753e-09,
"advantage_min": -0.10062496736645699,
"advantage_std": 0.07642483478412032,
"completion_length": 2902.8333587646484,
"epoch": 0.2822857142857143,
"grad_norm": 0.0014275149442255497,
"kl": 0.0002752244472503662,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0059,
"reward": 0.019425339065492153,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07642483757808805,
"rewards/cosine_scaled_reward": -0.09852191805839539,
"rewards/format_reward": 0.3125000074505806,
"step": 247
},
{
"advantage_max": 0.11183958873152733,
"advantage_mean": -1.3038516377683607e-08,
"advantage_min": -0.13401627726852894,
"advantage_std": 0.10416306741535664,
"completion_length": 2012.2500381469727,
"epoch": 0.2834285714285714,
"grad_norm": 0.002708690706640482,
"kl": 0.00023761391639709473,
"learning_rate": 6.374054580489873e-07,
"loss": 0.0037,
"reward": 0.20697915088385344,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10416307393461466,
"rewards/cosine_scaled_reward": 0.2683180356398225,
"rewards/format_reward": 0.6875000037252903,
"step": 248
},
{
"advantage_max": 0.16352112963795662,
"advantage_mean": 1.5522041985072121e-10,
"advantage_min": -0.1368715576827526,
"advantage_std": 0.12225319631397724,
"completion_length": 1839.5000114440918,
"epoch": 0.2845714285714286,
"grad_norm": 0.001457585021853447,
"kl": 0.00012803077697753906,
"learning_rate": 6.343215915635761e-07,
"loss": -0.0022,
"reward": 0.13103453570511192,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12225319864228368,
"rewards/cosine_scaled_reward": 0.07532077515497804,
"rewards/format_reward": 0.6250000055879354,
"step": 249
},
{
"advantage_max": 0.18821298703551292,
"advantage_mean": -1.6298144778215118e-09,
"advantage_min": -0.16967704251874238,
"advantage_std": 0.14901624876074493,
"completion_length": 2353.687545776367,
"epoch": 0.2857142857142857,
"grad_norm": 0.0034939555916935205,
"kl": 0.0003637373447418213,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0112,
"reward": 0.056937860790640116,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14901624782942235,
"rewards/cosine_scaled_reward": -0.114087900146842,
"rewards/format_reward": 0.5625,
"step": 250
},
{
"advantage_max": 0.1813768669962883,
"advantage_mean": -1.1408702066395549e-08,
"advantage_min": -0.21314455661922693,
"advantage_std": 0.16639887960627675,
"completion_length": 1951.5000610351562,
"epoch": 0.28685714285714287,
"grad_norm": 0.003103352850303054,
"kl": 0.00021637976169586182,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0052,
"reward": 0.17971518449485302,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16639888333156705,
"rewards/cosine_scaled_reward": 0.1355207832530141,
"rewards/format_reward": 0.7916666697710752,
"step": 251
},
{
"advantage_max": 0.11473383381962776,
"advantage_mean": 2.7939677932353746e-09,
"advantage_min": -0.09810259565711021,
"advantage_std": 0.0875001561944373,
"completion_length": 2449.1458587646484,
"epoch": 0.288,
"grad_norm": 0.0016089569544419646,
"kl": 0.0003833770751953125,
"learning_rate": 6.25045936022246e-07,
"loss": 0.0088,
"reward": 0.09802764293272048,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.08750016003614292,
"rewards/cosine_scaled_reward": 0.019266456365585327,
"rewards/format_reward": 0.5416666697710752,
"step": 252
},
{
"advantage_max": 0.19016839936375618,
"advantage_mean": 5.432715388664633e-10,
"advantage_min": -0.12732800282537937,
"advantage_std": 0.12401359155774117,
"completion_length": 2734.625026702881,
"epoch": 0.28914285714285715,
"grad_norm": 0.002614011289551854,
"kl": 0.00036776065826416016,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0097,
"reward": 0.05949016893282533,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12401359574869275,
"rewards/cosine_scaled_reward": -0.04276910796761513,
"rewards/format_reward": 0.4375000074505806,
"step": 253
},
{
"advantage_max": 0.12035822123289108,
"advantage_mean": -4.423782354323613e-09,
"advantage_min": -0.12394018657505512,
"advantage_std": 0.09628990339115262,
"completion_length": 2267.2916870117188,
"epoch": 0.29028571428571426,
"grad_norm": 0.0013566212728619576,
"kl": 0.00023874640464782715,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0028,
"reward": 0.10561956372112036,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09628990711644292,
"rewards/cosine_scaled_reward": 0.020613186061382294,
"rewards/format_reward": 0.5833333358168602,
"step": 254
},
{
"advantage_max": 0.1810228805989027,
"advantage_mean": -1.396983917434369e-09,
"advantage_min": -0.1731141395866871,
"advantage_std": 0.16093693696893752,
"completion_length": 2984.1041984558105,
"epoch": 0.2914285714285714,
"grad_norm": 0.002712044632062316,
"kl": 0.00037491321563720703,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0101,
"reward": 0.05883750435896218,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.16093694604933262,
"rewards/cosine_scaled_reward": -0.024436630308628082,
"rewards/format_reward": 0.3958333358168602,
"step": 255
},
{
"advantage_max": 0.21645413525402546,
"advantage_mean": -1.241763414316921e-09,
"advantage_min": -0.21925134025514126,
"advantage_std": 0.1742071988992393,
"completion_length": 2541.562545776367,
"epoch": 0.2925714285714286,
"grad_norm": 0.0028157387860119343,
"kl": 0.00041604042053222656,
"learning_rate": 6.126278954320294e-07,
"loss": -0.0019,
"reward": 0.1251123258844018,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1742072026245296,
"rewards/cosine_scaled_reward": 0.08667805790901184,
"rewards/format_reward": 0.562500013038516,
"step": 256
},
{
"advantage_max": 0.12831580359488726,
"advantage_mean": -1.4745941502580795e-08,
"advantage_min": -0.2287419093772769,
"advantage_std": 0.13877611607313156,
"completion_length": 2855.9167098999023,
"epoch": 0.2937142857142857,
"grad_norm": 0.0024661049246788025,
"kl": 0.0003330707550048828,
"learning_rate": 6.095153756157051e-07,
"loss": 0.0093,
"reward": 0.20216338173486292,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13877612398937345,
"rewards/cosine_scaled_reward": 0.3188007604330778,
"rewards/format_reward": 0.562500013038516,
"step": 257
},
{
"advantage_max": 0.24111134372651577,
"advantage_mean": -4.0357312769412346e-09,
"advantage_min": -0.18471561698243022,
"advantage_std": 0.18118810467422009,
"completion_length": 3009.2291870117188,
"epoch": 0.2948571428571429,
"grad_norm": 0.003488308284431696,
"kl": 0.0004291534423828125,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0158,
"reward": 0.06386373564600945,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.18118811072781682,
"rewards/cosine_scaled_reward": -0.018720313906669617,
"rewards/format_reward": 0.4166666716337204,
"step": 258
},
{
"advantage_max": 0.20813544653356075,
"advantage_mean": -2.5999424002609572e-09,
"advantage_min": -0.19101551175117493,
"advantage_std": 0.16957074729725718,
"completion_length": 2645.8542251586914,
"epoch": 0.296,
"grad_norm": 0.0034065325744450092,
"kl": 0.0004336535930633545,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0119,
"reward": 0.11577623779885471,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.16957074729725718,
"rewards/cosine_scaled_reward": 0.10282952804118395,
"rewards/format_reward": 0.47916666977107525,
"step": 259
},
{
"advantage_max": 0.10018521640449762,
"advantage_mean": -5.393910020023984e-09,
"advantage_min": -0.11331065790727735,
"advantage_std": 0.08604301093146205,
"completion_length": 1918.2916831970215,
"epoch": 0.29714285714285715,
"grad_norm": 0.0016338881105184555,
"kl": 0.00025529414415359497,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0048,
"reward": 0.20562266194610856,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.0860430154716596,
"rewards/cosine_scaled_reward": 0.29520507203415036,
"rewards/format_reward": 0.6250000055879354,
"step": 260
},
{
"advantage_max": 0.1361211899202317,
"advantage_mean": 5.626740046116296e-10,
"advantage_min": -0.14850289840251207,
"advantage_std": 0.11766799632459879,
"completion_length": 2685.2708892822266,
"epoch": 0.29828571428571427,
"grad_norm": 0.002024485031142831,
"kl": 0.0002980828285217285,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0036,
"reward": 0.0964075651136227,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1176679995842278,
"rewards/cosine_scaled_reward": 0.014010767918080091,
"rewards/format_reward": 0.5416666697710752,
"step": 261
},
{
"advantage_max": 0.09950929321348667,
"advantage_mean": 2.3283065753165744e-10,
"advantage_min": -0.10815745778381824,
"advantage_std": 0.07817226415500045,
"completion_length": 2924.062530517578,
"epoch": 0.29942857142857143,
"grad_norm": 0.0013728238409385085,
"kl": 0.0004365351051092148,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0032,
"reward": 0.004250659607350826,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.07817226415500045,
"rewards/cosine_scaled_reward": -0.1638176329433918,
"rewards/format_reward": 0.35416667722165585,
"step": 262
},
{
"advantage_max": 0.11774280667304993,
"advantage_mean": 2.4835269396561444e-09,
"advantage_min": -0.08933224296197295,
"advantage_std": 0.08471674006432295,
"completion_length": 2697.458354949951,
"epoch": 0.30057142857142854,
"grad_norm": 0.0009717949433252215,
"kl": 0.00025212764739990234,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0048,
"reward": -0.008404992360738106,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.08471674332395196,
"rewards/cosine_scaled_reward": -0.18136566318571568,
"rewards/format_reward": 0.31250000186264515,
"step": 263
},
{
"advantage_max": 0.13900742027908564,
"advantage_mean": -2.6387472762401387e-09,
"advantage_min": -0.0985984280705452,
"advantage_std": 0.09653732646256685,
"completion_length": 2763.479202270508,
"epoch": 0.3017142857142857,
"grad_norm": 0.0014484527055174112,
"kl": 0.0003957897424697876,
"learning_rate": 5.87655029499542e-07,
"loss": 0.0013,
"reward": 0.06449721958779264,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0965373320505023,
"rewards/cosine_scaled_reward": -0.039240069687366486,
"rewards/format_reward": 0.4583333395421505,
"step": 264
},
{
"advantage_max": 0.12348126340657473,
"advantage_mean": -3.2596290944209017e-09,
"advantage_min": -0.1179004842415452,
"advantage_std": 0.0924111008644104,
"completion_length": 1786.0208587646484,
"epoch": 0.3028571428571429,
"grad_norm": 0.0013441459741443396,
"kl": 0.00021648406982421875,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0007,
"reward": 0.17772597214207053,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09241109946742654,
"rewards/cosine_scaled_reward": 0.15900977700948715,
"rewards/format_reward": 0.7291666772216558,
"step": 265
},
{
"advantage_max": 0.21225751377642155,
"advantage_mean": -4.2685618928395286e-10,
"advantage_min": -0.11814463697373867,
"advantage_std": 0.13529152376577258,
"completion_length": 3109.5208587646484,
"epoch": 0.304,
"grad_norm": 0.00254819099791348,
"kl": 0.00038570165634155273,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0041,
"reward": -0.01641088235192001,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13529152469709516,
"rewards/cosine_scaled_reward": -0.20564116793684661,
"rewards/format_reward": 0.31250000558793545,
"step": 266
},
{
"advantage_max": 0.12912985170260072,
"advantage_mean": -5.083469145628072e-09,
"advantage_min": -0.16517041064798832,
"advantage_std": 0.12337575666606426,
"completion_length": 2857.4791717529297,
"epoch": 0.30514285714285716,
"grad_norm": 0.0021744819823652506,
"kl": 0.0003535747528076172,
"learning_rate": 5.78255733788191e-07,
"loss": -0.0004,
"reward": 0.09505193377844989,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12337576039135456,
"rewards/cosine_scaled_reward": 0.08310575038194656,
"rewards/format_reward": 0.3958333395421505,
"step": 267
},
{
"advantage_max": 0.2863444034010172,
"advantage_mean": -3.725290464995368e-09,
"advantage_min": -0.21928295260295272,
"advantage_std": 0.21429739147424698,
"completion_length": 2655.7292289733887,
"epoch": 0.3062857142857143,
"grad_norm": 0.003988311160355806,
"kl": 0.0004082322120666504,
"learning_rate": 5.751196772469237e-07,
"loss": 0.0232,
"reward": 0.11551153496839106,
"reward_advantage_correlation": 1.0,
"reward_std": 0.21429740265011787,
"rewards/cosine_scaled_reward": 0.0790593889541924,
"rewards/format_reward": 0.5208333488553762,
"step": 268
},
{
"advantage_max": 0.08062019851058722,
"advantage_mean": -9.235615966440847e-09,
"advantage_min": -0.12000223528593779,
"advantage_std": 0.07978324650321156,
"completion_length": 2586.8125610351562,
"epoch": 0.30742857142857144,
"grad_norm": 0.00132320960983634,
"kl": 0.00033305585384368896,
"learning_rate": 5.71982396408026e-07,
"loss": -0.0053,
"reward": 0.16532681556418538,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.07978325278963894,
"rewards/cosine_scaled_reward": 0.19731012731790543,
"rewards/format_reward": 0.5833333432674408,
"step": 269
},
{
"advantage_max": 0.21284050540998578,
"advantage_mean": -1.319373665875645e-09,
"advantage_min": -0.21470065601170063,
"advantage_std": 0.1724660824984312,
"completion_length": 2639.0208587646484,
"epoch": 0.30857142857142855,
"grad_norm": 0.0030540579464286566,
"kl": 0.00039489567279815674,
"learning_rate": 5.688440441781398e-07,
"loss": 0.008,
"reward": 0.15868494706228375,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1724660899490118,
"rewards/cosine_scaled_reward": 0.16503717796877027,
"rewards/format_reward": 0.6041666828095913,
"step": 270
},
{
"advantage_max": 0.11745514534413815,
"advantage_mean": -1.4280279847511679e-08,
"advantage_min": -0.12490505632013083,
"advantage_std": 0.09618132305331528,
"completion_length": 1718.4166946411133,
"epoch": 0.3097142857142857,
"grad_norm": 0.0017048126319423318,
"kl": 0.0001882612705230713,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0016,
"reward": 0.24240022152662277,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0961813268950209,
"rewards/cosine_scaled_reward": 0.2979451888240874,
"rewards/format_reward": 0.8333333358168602,
"step": 271
},
{
"advantage_max": 0.20761525630950928,
"advantage_mean": -3.259629080543114e-09,
"advantage_min": -0.22688710037618876,
"advantage_std": 0.1721926424652338,
"completion_length": 2622.7291870117188,
"epoch": 0.31085714285714283,
"grad_norm": 0.003913143649697304,
"kl": 0.0003896951675415039,
"learning_rate": 5.625647374256061e-07,
"loss": 0.0139,
"reward": 0.111437275307253,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17219264013692737,
"rewards/cosine_scaled_reward": 0.05980448704212904,
"rewards/format_reward": 0.5416666809469461,
"step": 272
},
{
"advantage_max": 0.17210129369050264,
"advantage_mean": -1.0865430291606692e-08,
"advantage_min": -0.147225983440876,
"advantage_std": 0.1311533278785646,
"completion_length": 2594.666717529297,
"epoch": 0.312,
"grad_norm": 0.0020003090612590313,
"kl": 0.00030806660652160645,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0103,
"reward": 0.0801794994622469,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1311533316038549,
"rewards/cosine_scaled_reward": 0.027412916533648968,
"rewards/format_reward": 0.4166666679084301,
"step": 273
},
{
"advantage_max": 0.12541158869862556,
"advantage_mean": -1.1020650489412809e-08,
"advantage_min": -0.1271469658240676,
"advantage_std": 0.10741374921053648,
"completion_length": 1575.7291717529297,
"epoch": 0.31314285714285717,
"grad_norm": 0.0014687005896121264,
"kl": 0.00015874579548835754,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0037,
"reward": 0.2069433918222785,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10741375316865742,
"rewards/cosine_scaled_reward": 0.19209350552409887,
"rewards/format_reward": 0.8333333358168602,
"step": 274
},
{
"advantage_max": 0.11719317454844713,
"advantage_mean": -1.3969838202898543e-09,
"advantage_min": -0.12048850674182177,
"advantage_std": 0.09731898817699403,
"completion_length": 2238.8958435058594,
"epoch": 0.3142857142857143,
"grad_norm": 0.0018332276958972216,
"kl": 0.00023164600133895874,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0027,
"reward": 0.2168528651818633,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09731898899190128,
"rewards/cosine_scaled_reward": 0.3284746464341879,
"rewards/format_reward": 0.6250000055879354,
"step": 275
},
{
"advantage_max": 0.10887271910905838,
"advantage_mean": -5.044664352915618e-09,
"advantage_min": -0.16490559931844473,
"advantage_std": 0.11456224136054516,
"completion_length": 2457.4792251586914,
"epoch": 0.31542857142857145,
"grad_norm": 0.001995307393372059,
"kl": 0.0003548562526702881,
"learning_rate": 5.5e-07,
"loss": 0.0063,
"reward": 0.17041749227792025,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1145622415933758,
"rewards/cosine_scaled_reward": 0.22119942121207714,
"rewards/format_reward": 0.5625000149011612,
"step": 276
},
{
"advantage_max": 0.288712446577847,
"advantage_mean": -8.692344219407566e-09,
"advantage_min": -0.20587524212896824,
"advantage_std": 0.2012051260098815,
"completion_length": 2489.979217529297,
"epoch": 0.31657142857142856,
"grad_norm": 0.004219182766973972,
"kl": 0.0003845691680908203,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0198,
"reward": 0.13046906306408346,
"reward_advantage_correlation": 0.9999999999999997,
"reward_std": 0.2012051409110427,
"rewards/cosine_scaled_reward": 0.10410384787246585,
"rewards/format_reward": 0.5625000093132257,
"step": 277
},
{
"advantage_max": 0.10111749917268753,
"advantage_mean": -2.832772877381373e-09,
"advantage_min": -0.13670605374500155,
"advantage_std": 0.09576660464517772,
"completion_length": 1926.6458740234375,
"epoch": 0.3177142857142857,
"grad_norm": 0.001758676953613758,
"kl": 0.00029021501541137695,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0003,
"reward": 0.22084622830152512,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.0957666093017906,
"rewards/cosine_scaled_reward": 0.20693974336609244,
"rewards/format_reward": 0.8750000111758709,
"step": 278
},
{
"advantage_max": 0.13892194349318743,
"advantage_mean": -2.949188213086096e-09,
"advantage_min": -0.10274117905646563,
"advantage_std": 0.10129120200872421,
"completion_length": 3169.729179382324,
"epoch": 0.31885714285714284,
"grad_norm": 0.0020529532339423895,
"kl": 0.00042885541915893555,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0019,
"reward": 0.004306883085519075,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10129120014607906,
"rewards/cosine_scaled_reward": -0.1018618680536747,
"rewards/format_reward": 0.22916666977107525,
"step": 279
},
{
"advantage_max": 0.22641071490943432,
"advantage_mean": -2.638747387262441e-09,
"advantage_min": -0.15394274424761534,
"advantage_std": 0.15549280680716038,
"completion_length": 1992.2291831970215,
"epoch": 0.32,
"grad_norm": 0.002533955965191126,
"kl": 0.00034427642822265625,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0115,
"reward": 0.16187963518314064,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15549280843697488,
"rewards/cosine_scaled_reward": 0.13246607966721058,
"rewards/format_reward": 0.6875000111758709,
"step": 280
},
{
"advantage_max": 0.20242772391065955,
"advantage_mean": -2.2506962960022747e-09,
"advantage_min": -0.13997359201312065,
"advantage_std": 0.13463286077603698,
"completion_length": 3309.4583740234375,
"epoch": 0.3211428571428571,
"grad_norm": 0.0027612389530986547,
"kl": 0.0004780292510986328,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0086,
"reward": 0.004778874106705189,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1346328603103757,
"rewards/cosine_scaled_reward": -0.14123893855139613,
"rewards/format_reward": 0.31250000931322575,
"step": 281
},
{
"advantage_max": 0.12792781926691532,
"advantage_mean": -4.6566130951219975e-09,
"advantage_min": -0.14909182582050562,
"advantage_std": 0.11060419026762247,
"completion_length": 2272.062568664551,
"epoch": 0.3222857142857143,
"grad_norm": 0.0016208424931392074,
"kl": 0.0003352165222167969,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0001,
"reward": 0.1578272543847561,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11060419538989663,
"rewards/cosine_scaled_reward": 0.15399221889674664,
"rewards/format_reward": 0.6250000149011612,
"step": 282
},
{
"advantage_max": 0.13227641116827726,
"advantage_mean": -7.140139812733537e-09,
"advantage_min": -0.14622067473828793,
"advantage_std": 0.10864142281934619,
"completion_length": 2319.791717529297,
"epoch": 0.32342857142857145,
"grad_norm": 0.0018907062476500869,
"kl": 0.0003216862678527832,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0046,
"reward": 0.23123158095404506,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10864142375066876,
"rewards/cosine_scaled_reward": 0.33133327309042215,
"rewards/format_reward": 0.7083333432674408,
"step": 283
},
{
"advantage_max": 0.13931749551557004,
"advantage_mean": -6.131207229420621e-09,
"advantage_min": -0.17834768863394856,
"advantage_std": 0.11884868424385786,
"completion_length": 1949.2917175292969,
"epoch": 0.32457142857142857,
"grad_norm": 0.0016096375184133649,
"kl": 0.0001983940601348877,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0042,
"reward": 0.20003115246072412,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11884868587367237,
"rewards/cosine_scaled_reward": 0.2159505933523178,
"rewards/format_reward": 0.7500000055879354,
"step": 284
},
{
"advantage_max": 0.10915980814024806,
"advantage_mean": -2.949188275536141e-09,
"advantage_min": -0.07292798534035683,
"advantage_std": 0.07839016616344452,
"completion_length": 1869.0625495910645,
"epoch": 0.32571428571428573,
"grad_norm": 0.0018345932476222515,
"kl": 0.00028606876730918884,
"learning_rate": 5.21744266211809e-07,
"loss": 0.001,
"reward": 0.10105163743719459,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.0783901670947671,
"rewards/cosine_scaled_reward": -0.0772322453558445,
"rewards/format_reward": 0.7500000111758709,
"step": 285
},
{
"advantage_max": 0.12339612538926303,
"advantage_mean": -7.1013350096127414e-09,
"advantage_min": -0.15304887667298317,
"advantage_std": 0.10974670597352087,
"completion_length": 2313.625015258789,
"epoch": 0.32685714285714285,
"grad_norm": 0.0016531402943655849,
"kl": 0.00037553906440734863,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0086,
"reward": 0.16036886721849442,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1097467066720128,
"rewards/cosine_scaled_reward": 0.17942035384476185,
"rewards/format_reward": 0.5833333358168602,
"step": 286
},
{
"advantage_max": 0.090579554438591,
"advantage_mean": -2.0178656315317234e-09,
"advantage_min": -0.12944842409342527,
"advantage_std": 0.09058350510895252,
"completion_length": 1710.7083435058594,
"epoch": 0.328,
"grad_norm": 0.001025793026201427,
"kl": 0.00019846856594085693,
"learning_rate": 5.154764373429315e-07,
"loss": -0.0008,
"reward": 0.13440879015251994,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.09058351023122668,
"rewards/cosine_scaled_reward": 0.10385258868336678,
"rewards/format_reward": 0.5833333432674408,
"step": 287
},
{
"advantage_max": 0.19185153394937515,
"advantage_mean": 1.241763414316921e-09,
"advantage_min": -0.1199220959097147,
"advantage_std": 0.1265643904916942,
"completion_length": 2892.2708435058594,
"epoch": 0.3291428571428571,
"grad_norm": 0.0027143056504428387,
"kl": 0.0004878044128417969,
"learning_rate": 5.123449705004581e-07,
"loss": 0.0028,
"reward": 0.05917328954092227,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12656439328566194,
"rewards/cosine_scaled_reward": -0.010442662052810192,
"rewards/format_reward": 0.3750000037252903,
"step": 288
},
{
"advantage_max": 0.10830804985016584,
"advantage_mean": -3.8805109958683914e-09,
"advantage_min": -0.12791373440995812,
"advantage_std": 0.09628125256858766,
"completion_length": 2157.4791984558105,
"epoch": 0.3302857142857143,
"grad_norm": 0.001589475548826158,
"kl": 0.00038611888885498047,
"learning_rate": 5.09215338910999e-07,
"loss": 0.0004,
"reward": 0.10157534619793296,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09628125361632556,
"rewards/cosine_scaled_reward": 0.007496317848563194,
"rewards/format_reward": 0.5833333414047956,
"step": 289
},
{
"advantage_max": 0.1997305415570736,
"advantage_mean": -7.295360621162317e-09,
"advantage_min": -0.1688457289710641,
"advantage_std": 0.14652833994477987,
"completion_length": 1426.9792022705078,
"epoch": 0.3314285714285714,
"grad_norm": 0.0017456391360610723,
"kl": 0.00028890371322631836,
"learning_rate": 5.060876951083828e-07,
"loss": 0.0068,
"reward": 0.17761991049337666,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14652834553271532,
"rewards/cosine_scaled_reward": 0.08774526475463063,
"rewards/format_reward": 0.8750000074505806,
"step": 290
},
{
"advantage_max": 0.15469386614859104,
"advantage_mean": -1.5522043650406658e-09,
"advantage_min": -0.10938695259392262,
"advantage_std": 0.10772312432527542,
"completion_length": 2121.895851135254,
"epoch": 0.3325714285714286,
"grad_norm": 0.001207710593007505,
"kl": 0.00032941997051239014,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0004,
"reward": 0.16409676615148783,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1077231322415173,
"rewards/cosine_scaled_reward": 0.1403282443061471,
"rewards/format_reward": 0.6875000018626451,
"step": 291
},
{
"advantage_max": 0.17690535634756088,
"advantage_mean": -2.3283067140944524e-10,
"advantage_min": -0.1479609040543437,
"advantage_std": 0.14120537089183927,
"completion_length": 2716.2500228881836,
"epoch": 0.33371428571428574,
"grad_norm": 0.0023947900626808405,
"kl": 0.0004105567932128906,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0028,
"reward": 0.04709340166300535,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14120537508279085,
"rewards/cosine_scaled_reward": -0.08947830833494663,
"rewards/format_reward": 0.4583333395421505,
"step": 292
},
{
"advantage_max": 0.15446675289422274,
"advantage_mean": -4.579002739479865e-09,
"advantage_min": -0.15014554280787706,
"advantage_std": 0.1075568669475615,
"completion_length": 1999.520851135254,
"epoch": 0.33485714285714285,
"grad_norm": 0.001623099553398788,
"kl": 0.0004197433590888977,
"learning_rate": 4.967182142620745e-07,
"loss": -0.0022,
"reward": 0.12098718318156898,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10755686787888408,
"rewards/cosine_scaled_reward": 0.0010581477545201778,
"rewards/format_reward": 0.7083333432674408,
"step": 293
},
{
"advantage_max": 0.2192140589468181,
"advantage_mean": 4.656613428188905e-10,
"advantage_min": -0.1265512192621827,
"advantage_std": 0.13496136059984565,
"completion_length": 3130.3958740234375,
"epoch": 0.336,
"grad_norm": 0.002399858320131898,
"kl": 0.0006003789603710175,
"learning_rate": 4.93600044896063e-07,
"loss": 0.0063,
"reward": 0.03184024168876931,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13496137037873268,
"rewards/cosine_scaled_reward": -0.04238404519855976,
"rewards/format_reward": 0.27083333767950535,
"step": 294
},
{
"advantage_max": 0.16546836122870445,
"advantage_mean": -2.483527036800659e-09,
"advantage_min": -0.14341549389064312,
"advantage_std": 0.12139872647821903,
"completion_length": 2891.7291717529297,
"epoch": 0.33714285714285713,
"grad_norm": 0.0022274511866271496,
"kl": 0.00046318769454956055,
"learning_rate": 4.904846243842949e-07,
"loss": -0.002,
"reward": 0.055140421725809574,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12139872647821903,
"rewards/cosine_scaled_reward": -0.026432855054736137,
"rewards/format_reward": 0.37500000558793545,
"step": 295
},
{
"advantage_max": 0.09151355037465692,
"advantage_mean": -5.587935683615264e-09,
"advantage_min": -0.10618274100124836,
"advantage_std": 0.07723336713388562,
"completion_length": 2818.041702270508,
"epoch": 0.3382857142857143,
"grad_norm": 0.0014323138166218996,
"kl": 0.00042188167572021484,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0041,
"reward": 0.06925072055310011,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.0772333717904985,
"rewards/cosine_scaled_reward": -0.025438087061047554,
"rewards/format_reward": 0.4583333432674408,
"step": 296
},
{
"advantage_max": 0.21857766713947058,
"advantage_mean": 3.182018898373329e-09,
"advantage_min": -0.12679255288094282,
"advantage_std": 0.14345951098948717,
"completion_length": 3366.729217529297,
"epoch": 0.3394285714285714,
"grad_norm": 0.0024829749017953873,
"kl": 0.0004658699035644531,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0039,
"reward": 0.00389005895704031,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14345951285213232,
"rewards/cosine_scaled_reward": -0.09262831043452024,
"rewards/format_reward": 0.2083333358168602,
"step": 297
},
{
"advantage_max": 0.2189861796796322,
"advantage_mean": -3.72529045111758e-09,
"advantage_min": -0.1656702347099781,
"advantage_std": 0.15777601953595877,
"completion_length": 2752.520851135254,
"epoch": 0.3405714285714286,
"grad_norm": 0.002729513682425022,
"kl": 0.00036172568798065186,
"learning_rate": 4.811563736721829e-07,
"loss": 0.008,
"reward": 0.08497396449092776,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1577760260552168,
"rewards/cosine_scaled_reward": 0.030580737628042698,
"rewards/format_reward": 0.43750000186264515,
"step": 298
},
{
"advantage_max": 0.11331136804074049,
"advantage_mean": -1.6298146165993899e-09,
"advantage_min": -0.16553817968815565,
"advantage_std": 0.1081388727761805,
"completion_length": 3006.666717529297,
"epoch": 0.3417142857142857,
"grad_norm": 0.0022456180304288864,
"kl": 0.0003933906555175781,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0021,
"reward": 0.124224784784019,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10813887417316437,
"rewards/cosine_scaled_reward": 0.1376944463700056,
"rewards/format_reward": 0.45833334140479565,
"step": 299
},
{
"advantage_max": 0.13589740544557571,
"advantage_mean": 1.3581787534910905e-09,
"advantage_min": -0.09810729883611202,
"advantage_std": 0.0930751352570951,
"completion_length": 3335.3125610351562,
"epoch": 0.34285714285714286,
"grad_norm": 0.0022788026835769415,
"kl": 0.000484466552734375,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0047,
"reward": -0.013445806922391057,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09307514037936926,
"rewards/cosine_scaled_reward": -0.1955646127462387,
"rewards/format_reward": 0.3125000037252903,
"step": 300
},
{
"advantage_max": 0.1622638087719679,
"advantage_mean": -3.958121053138086e-09,
"advantage_min": -0.1490333159454167,
"advantage_std": 0.12028014613315463,
"completion_length": 2307.145866394043,
"epoch": 0.344,
"grad_norm": 0.0024759138468652964,
"kl": 0.00041925907135009766,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0037,
"reward": 0.09233828741707839,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12028014985844493,
"rewards/cosine_scaled_reward": -0.09192248748149723,
"rewards/format_reward": 0.729166679084301,
"step": 301
},
{
"advantage_max": 0.1092590931802988,
"advantage_mean": -7.838631854717448e-09,
"advantage_min": -0.11798757687211037,
"advantage_std": 0.08439132361672819,
"completion_length": 2324.6042251586914,
"epoch": 0.34514285714285714,
"grad_norm": 0.0013441102346405387,
"kl": 0.00034530647099018097,
"learning_rate": 4.68766384637248e-07,
"loss": -0.0028,
"reward": 0.16111605032347143,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08439132594503462,
"rewards/cosine_scaled_reward": 0.132624052464962,
"rewards/format_reward": 0.6875,
"step": 302
},
{
"advantage_max": 0.14658861979842186,
"advantage_mean": -8.071462456737954e-09,
"advantage_min": -0.17685645446181297,
"advantage_std": 0.14615025278180838,
"completion_length": 2429.604232788086,
"epoch": 0.3462857142857143,
"grad_norm": 0.002118069212883711,
"kl": 0.0004744231700897217,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0035,
"reward": 0.1084291534498334,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14615025976672769,
"rewards/cosine_scaled_reward": 0.027401255443692207,
"rewards/format_reward": 0.5833333395421505,
"step": 303
},
{
"advantage_max": 0.2133752703666687,
"advantage_mean": -7.761021464380846e-09,
"advantage_min": -0.13373715244233608,
"advantage_std": 0.13641660660505295,
"completion_length": 2490.8125610351562,
"epoch": 0.3474285714285714,
"grad_norm": 0.002510175807401538,
"kl": 0.00044634193181991577,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.005,
"reward": 0.11259253711614292,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1364166084676981,
"rewards/cosine_scaled_reward": 0.05044803116470575,
"rewards/format_reward": 0.5625000037252903,
"step": 304
},
{
"advantage_max": 0.18745366763323545,
"advantage_mean": -2.3283065059276353e-09,
"advantage_min": -0.15571925230324268,
"advantage_std": 0.13849145593121648,
"completion_length": 2695.916732788086,
"epoch": 0.3485714285714286,
"grad_norm": 0.002323645632714033,
"kl": 0.00039571523666381836,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0109,
"reward": 0.0692746420390904,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13849145593121648,
"rewards/cosine_scaled_reward": -0.05563849490135908,
"rewards/format_reward": 0.5208333432674408,
"step": 305
},
{
"advantage_max": 0.13452134793624282,
"advantage_mean": -4.3073669526993985e-09,
"advantage_min": -0.1587929087691009,
"advantage_std": 0.13013424794189632,
"completion_length": 2271.166690826416,
"epoch": 0.3497142857142857,
"grad_norm": 0.002338060177862644,
"kl": 0.0003482997417449951,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0155,
"reward": 0.12165643041953444,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.13013425352983177,
"rewards/cosine_scaled_reward": 0.07424017786979675,
"rewards/format_reward": 0.5625000037252903,
"step": 306
},
{
"advantage_max": 0.19390251953154802,
"advantage_mean": -3.1820189122511167e-09,
"advantage_min": -0.13394111022353172,
"advantage_std": 0.13033632142469287,
"completion_length": 2285.5625038146973,
"epoch": 0.35085714285714287,
"grad_norm": 0.0023057800717651844,
"kl": 0.00042870640754699707,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0001,
"reward": 0.11600270541384816,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13033632514998317,
"rewards/cosine_scaled_reward": 0.050759092438966036,
"rewards/format_reward": 0.5833333358168602,
"step": 307
},
{
"advantage_max": 0.20064522698521614,
"advantage_mean": 3.880514659604373e-11,
"advantage_min": -0.15068083815276623,
"advantage_std": 0.14386159926652908,
"completion_length": 3076.5209045410156,
"epoch": 0.352,
"grad_norm": 0.0026027862913906574,
"kl": 0.00039158761501312256,
"learning_rate": 4.503031760712397e-07,
"loss": 0.0015,
"reward": 0.04175692540593445,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.14386159414425492,
"rewards/cosine_scaled_reward": -0.0642844419926405,
"rewards/format_reward": 0.37500000558793545,
"step": 308
},
{
"advantage_max": 0.1422612089663744,
"advantage_mean": -7.450581041013038e-09,
"advantage_min": -0.17655340489000082,
"advantage_std": 0.12518154783174396,
"completion_length": 2657.562530517578,
"epoch": 0.35314285714285715,
"grad_norm": 0.0022041448391973972,
"kl": 0.0003338456153869629,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.0006,
"reward": 0.1618395473342389,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.125181557610631,
"rewards/cosine_scaled_reward": 0.1995255146175623,
"rewards/format_reward": 0.5625000055879354,
"step": 309
},
{
"advantage_max": 0.1463564890436828,
"advantage_mean": 5.122274313040798e-09,
"advantage_min": -0.14603949431329966,
"advantage_std": 0.11013461998663843,
"completion_length": 2038.0625076293945,
"epoch": 0.35428571428571426,
"grad_norm": 0.0016478110337629914,
"kl": 0.0003361701965332031,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0006,
"reward": 0.1367599029908888,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11013462860137224,
"rewards/cosine_scaled_reward": 0.08016027277335525,
"rewards/format_reward": 0.645833333954215,
"step": 310
},
{
"advantage_max": 0.10873270966112614,
"advantage_mean": 2.3283066447055134e-10,
"advantage_min": -0.15578097198158503,
"advantage_std": 0.10270903469063342,
"completion_length": 2404.8541946411133,
"epoch": 0.3554285714285714,
"grad_norm": 0.0017932639457285404,
"kl": 0.00038820505142211914,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0072,
"reward": 0.17740142671391368,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10270903422497213,
"rewards/cosine_scaled_reward": 0.21936094062402844,
"rewards/format_reward": 0.6041666772216558,
"step": 311
},
{
"advantage_max": 0.14976314548403025,
"advantage_mean": 1.241763553094799e-09,
"advantage_min": -0.1350155808031559,
"advantage_std": 0.11650370946153998,
"completion_length": 2204.1250076293945,
"epoch": 0.3565714285714286,
"grad_norm": 0.0020335179287940264,
"kl": 0.0004989905282855034,
"learning_rate": 4.3808955077581546e-07,
"loss": -0.0003,
"reward": 0.1353573240339756,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11650371551513672,
"rewards/cosine_scaled_reward": 0.1568785011768341,
"rewards/format_reward": 0.47916667722165585,
"step": 312
},
{
"advantage_max": 0.10151751572266221,
"advantage_mean": -5.2774946981970494e-09,
"advantage_min": -0.13054312393069267,
"advantage_std": 0.09196106740273535,
"completion_length": 2606.562530517578,
"epoch": 0.3577142857142857,
"grad_norm": 0.0014636931009590626,
"kl": 0.0004368424415588379,
"learning_rate": 4.350494089288943e-07,
"loss": 0.0038,
"reward": 0.07864985754713416,
"reward_advantage_correlation": 1.0,
"reward_std": 0.091961067635566,
"rewards/cosine_scaled_reward": 0.012477612122893333,
"rewards/format_reward": 0.4375000074505806,
"step": 313
},
{
"advantage_max": 0.083519974257797,
"advantage_mean": -2.79396782099095e-09,
"advantage_min": -0.0853169858455658,
"advantage_std": 0.06732387357624248,
"completion_length": 2245.500015258789,
"epoch": 0.3588571428571429,
"grad_norm": 0.0009150411933660507,
"kl": 0.00030519068241119385,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0013,
"reward": 0.10207456815987825,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.06732387689407915,
"rewards/cosine_scaled_reward": 0.03957906365394592,
"rewards/format_reward": 0.520833333954215,
"step": 314
},
{
"advantage_max": 0.09536193497478962,
"advantage_mean": 2.3283069916502086e-10,
"advantage_min": -0.17288233432918787,
"advantage_std": 0.0984175750054419,
"completion_length": 2790.8750228881836,
"epoch": 0.36,
"grad_norm": 0.0022720075212419033,
"kl": 0.000449448823928833,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0093,
"reward": 0.10023409640416503,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09841757919639349,
"rewards/cosine_scaled_reward": 0.08823532424867153,
"rewards/format_reward": 0.416666679084301,
"step": 315
},
{
"advantage_max": 0.11639646254479885,
"advantage_mean": -3.4148496252939253e-09,
"advantage_min": -0.12197889108210802,
"advantage_std": 0.09412986086681485,
"completion_length": 3332.479217529297,
"epoch": 0.36114285714285715,
"grad_norm": 0.0020200808066874743,
"kl": 0.000614166259765625,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0068,
"reward": 0.023105132393538952,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09412986552342772,
"rewards/cosine_scaled_reward": -0.06621896661818027,
"rewards/format_reward": 0.2708333395421505,
"step": 316
},
{
"advantage_max": 0.22891795448958874,
"advantage_mean": 1.7074248265247505e-09,
"advantage_min": -0.16849522665143013,
"advantage_std": 0.1543532907962799,
"completion_length": 2679.187545776367,
"epoch": 0.36228571428571427,
"grad_norm": 0.0023186705075204372,
"kl": 0.0004814229905605316,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0081,
"reward": 0.04336748970672488,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15435329405590892,
"rewards/cosine_scaled_reward": -0.07002291013486683,
"rewards/format_reward": 0.39583333767950535,
"step": 317
},
{
"advantage_max": 0.13123337179422379,
"advantage_mean": -8.498318770921998e-09,
"advantage_min": -0.12308492953889072,
"advantage_std": 0.10156573518179357,
"completion_length": 1147.7916870117188,
"epoch": 0.36342857142857143,
"grad_norm": 0.001232507056556642,
"kl": 0.00012452714145183563,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0005,
"reward": 0.17573032714426517,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10156573657877743,
"rewards/cosine_scaled_reward": 0.07146302983164787,
"rewards/format_reward": 0.895833333954215,
"step": 318
},
{
"advantage_max": 0.17452884558588266,
"advantage_mean": -2.6387474566513802e-09,
"advantage_min": -0.12432311568409204,
"advantage_std": 0.11903674202039838,
"completion_length": 2531.812515258789,
"epoch": 0.36457142857142855,
"grad_norm": 0.0020693091209977865,
"kl": 0.00045609474182128906,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.0077,
"reward": 0.024991515558212996,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11903674621134996,
"rewards/cosine_scaled_reward": -0.14697773708030581,
"rewards/format_reward": 0.43750000558793545,
"step": 319
},
{
"advantage_max": 0.09645581245422363,
"advantage_mean": -1.474593960826276e-09,
"advantage_min": -0.11469449661672115,
"advantage_std": 0.08255432173609734,
"completion_length": 1697.708351135254,
"epoch": 0.3657142857142857,
"grad_norm": 0.001524286693893373,
"kl": 0.0003675222396850586,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0017,
"reward": 0.1566449678502977,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08255432453006506,
"rewards/cosine_scaled_reward": 0.06804804364219308,
"rewards/format_reward": 0.7916666679084301,
"step": 320
},
{
"advantage_max": 0.10896373726427555,
"advantage_mean": -5.122274382429737e-09,
"advantage_min": -0.1531135831028223,
"advantage_std": 0.10200135898776352,
"completion_length": 1350.6041793823242,
"epoch": 0.3668571428571429,
"grad_norm": 0.0013724055606871843,
"kl": 0.00025459565222263336,
"learning_rate": 4.1094235253127374e-07,
"loss": -0.0041,
"reward": 0.24308783560991287,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10200135805644095,
"rewards/cosine_scaled_reward": 0.2970201913267374,
"rewards/format_reward": 0.8333333432674408,
"step": 321
},
{
"advantage_max": 0.15791441453620791,
"advantage_mean": -3.104407841902912e-10,
"advantage_min": -0.15344735700637102,
"advantage_std": 0.13148926093708724,
"completion_length": 2771.4166946411133,
"epoch": 0.368,
"grad_norm": 0.0031580019276589155,
"kl": 0.0005682110786437988,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0113,
"reward": 0.04155511595308781,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13148926885332912,
"rewards/cosine_scaled_reward": -0.06613871618174016,
"rewards/format_reward": 0.3750000111758709,
"step": 322
},
{
"advantage_max": 0.11292254738509655,
"advantage_mean": -5.044664325160042e-09,
"advantage_min": -0.12851296365261078,
"advantage_std": 0.09898313414305449,
"completion_length": 2861.937530517578,
"epoch": 0.36914285714285716,
"grad_norm": 0.0017387475818395615,
"kl": 0.0004943609237670898,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0023,
"reward": 0.08781374106183648,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.09898313879966736,
"rewards/cosine_scaled_reward": 0.020912078442052007,
"rewards/format_reward": 0.4791666753590107,
"step": 323
},
{
"advantage_max": 0.17608331004157662,
"advantage_mean": -2.173086106893596e-09,
"advantage_min": -0.12420807220041752,
"advantage_std": 0.11417877301573753,
"completion_length": 2464.000045776367,
"epoch": 0.3702857142857143,
"grad_norm": 0.002251465106382966,
"kl": 0.0004966855049133301,
"learning_rate": 4.020100089676376e-07,
"loss": 0.001,
"reward": 0.08936168113723397,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11417877511121333,
"rewards/cosine_scaled_reward": -0.007908736355602741,
"rewards/format_reward": 0.5416666734963655,
"step": 324
},
{
"advantage_max": 0.20293579250574112,
"advantage_mean": -6.67447851154801e-09,
"advantage_min": -0.20778802502900362,
"advantage_std": 0.1762455804273486,
"completion_length": 2415.187545776367,
"epoch": 0.37142857142857144,
"grad_norm": 0.003066692966967821,
"kl": 0.0004252195358276367,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0136,
"reward": 0.19227174390107393,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17624558601528406,
"rewards/cosine_scaled_reward": 0.22559675807133317,
"rewards/format_reward": 0.6875000111758709,
"step": 325
},
{
"advantage_max": 0.11844811588525772,
"advantage_mean": -4.268561761000544e-09,
"advantage_min": -0.13017160259187222,
"advantage_std": 0.09890975127927959,
"completion_length": 2067.6667137145996,
"epoch": 0.37257142857142855,
"grad_norm": 0.0022512541618198156,
"kl": 0.00033466145396232605,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0074,
"reward": 0.12452584411948919,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09890975733287632,
"rewards/cosine_scaled_reward": 0.0662637110799551,
"rewards/format_reward": 0.6041666679084301,
"step": 326
},
{
"advantage_max": 0.17955493042245507,
"advantage_mean": -3.725290298461914e-09,
"advantage_min": -0.16282919980585575,
"advantage_std": 0.1376048857346177,
"completion_length": 2571.145835876465,
"epoch": 0.3737142857142857,
"grad_norm": 0.0025611212477087975,
"kl": 0.00043479073792696,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0029,
"reward": 0.13137949211522937,
"reward_advantage_correlation": 1.0,
"reward_std": 0.137604889459908,
"rewards/cosine_scaled_reward": 0.12837360659614205,
"rewards/format_reward": 0.5208333376795053,
"step": 327
},
{
"advantage_max": 0.18027144204825163,
"advantage_mean": -4.11334142441655e-09,
"advantage_min": -0.12120586633682251,
"advantage_std": 0.12321445951238275,
"completion_length": 3299.437530517578,
"epoch": 0.37485714285714283,
"grad_norm": 0.002698281779885292,
"kl": 0.0005347728729248047,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0104,
"reward": -0.015781979076564312,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12321445951238275,
"rewards/cosine_scaled_reward": -0.19141755625605583,
"rewards/format_reward": 0.2916666716337204,
"step": 328
},
{
"advantage_max": 0.11980468919500709,
"advantage_mean": -3.1044086051812414e-09,
"advantage_min": -0.15748842991888523,
"advantage_std": 0.10885418800171465,
"completion_length": 1634.270839691162,
"epoch": 0.376,
"grad_norm": 0.0012264562537893653,
"kl": 0.00037041306495666504,
"learning_rate": 3.872689434630585e-07,
"loss": -0.0013,
"reward": 0.1498857717961073,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10885419067926705,
"rewards/cosine_scaled_reward": 0.09750008094124496,
"rewards/format_reward": 0.6875000074505806,
"step": 329
},
{
"advantage_max": 0.18279112502932549,
"advantage_mean": -9.7788870612181e-09,
"advantage_min": -0.14392871782183647,
"advantage_std": 0.13134588208049536,
"completion_length": 2363.208351135254,
"epoch": 0.37714285714285717,
"grad_norm": 0.002868218347430229,
"kl": 0.0007173418998718262,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0023,
"reward": 0.07145864237099886,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1313458839431405,
"rewards/cosine_scaled_reward": -0.0505201262421906,
"rewards/format_reward": 0.5208333395421505,
"step": 330
},
{
"advantage_max": 0.18987913988530636,
"advantage_mean": -5.743155867543592e-09,
"advantage_min": -0.1331134121865034,
"advantage_std": 0.12819427531212568,
"completion_length": 2273.166679382324,
"epoch": 0.3782857142857143,
"grad_norm": 0.0021069832146167755,
"kl": 0.0004393383860588074,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0078,
"reward": 0.04232434229925275,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12819427764043212,
"rewards/cosine_scaled_reward": -0.10524179972708225,
"rewards/format_reward": 0.45833334140479565,
"step": 331
},
{
"advantage_max": 0.20149581134319305,
"advantage_mean": -7.916241953620506e-09,
"advantage_min": -0.14159746747463942,
"advantage_std": 0.1383329126983881,
"completion_length": 2499.6458587646484,
"epoch": 0.37942857142857145,
"grad_norm": 0.002372437622398138,
"kl": 0.0004247426986694336,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0009,
"reward": 0.12115885165985674,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13833291921764612,
"rewards/cosine_scaled_reward": 0.05477744806557894,
"rewards/format_reward": 0.6041666809469461,
"step": 332
},
{
"advantage_max": 0.13082721852697432,
"advantage_mean": -6.635673649446616e-09,
"advantage_min": -0.18306226492859423,
"advantage_std": 0.12717127020005137,
"completion_length": 2043.333366394043,
"epoch": 0.38057142857142856,
"grad_norm": 0.0020115238148719072,
"kl": 0.00046539306640625,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0014,
"reward": 0.1636866086628288,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1271712731104344,
"rewards/cosine_scaled_reward": 0.10506386123597622,
"rewards/format_reward": 0.75,
"step": 333
},
{
"advantage_max": 0.12227540975436568,
"advantage_mean": -1.6298146374160716e-09,
"advantage_min": -0.12114514503628016,
"advantage_std": 0.09305540984496474,
"completion_length": 2965.1875610351562,
"epoch": 0.38171428571428573,
"grad_norm": 0.0016468079993501306,
"kl": 0.000513911247253418,
"learning_rate": 3.72726140684072e-07,
"loss": 0.0028,
"reward": 0.00035559339448809624,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09305540984496474,
"rewards/cosine_scaled_reward": -0.18864441104233265,
"rewards/format_reward": 0.3750000111758709,
"step": 334
},
{
"advantage_max": 0.17929319106042385,
"advantage_mean": -1.3426567503638243e-08,
"advantage_min": -0.24936186987906694,
"advantage_std": 0.17879032995551825,
"completion_length": 2200.229202270508,
"epoch": 0.38285714285714284,
"grad_norm": 0.0025713045615702868,
"kl": 0.00046062469482421875,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0018,
"reward": 0.21270178398117423,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17879033274948597,
"rewards/cosine_scaled_reward": 0.2734536435455084,
"rewards/format_reward": 0.7083333395421505,
"step": 335
},
{
"advantage_max": 0.13005139166489244,
"advantage_mean": -1.8626451769865326e-09,
"advantage_min": -0.15239684004336596,
"advantage_std": 0.10806715162470937,
"completion_length": 2616.2916946411133,
"epoch": 0.384,
"grad_norm": 0.0015858953120186925,
"kl": 0.0004538893699645996,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0022,
"reward": 0.09085274318931624,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10806715488433838,
"rewards/cosine_scaled_reward": 0.01898368075489998,
"rewards/format_reward": 0.5000000018626451,
"step": 336
},
{
"advantage_max": 0.14148719515651464,
"advantage_mean": -1.2417634698280722e-09,
"advantage_min": -0.1784699847921729,
"advantage_std": 0.13605968561023474,
"completion_length": 2599.1250381469727,
"epoch": 0.3851428571428571,
"grad_norm": 0.0025568308774381876,
"kl": 0.00046902894973754883,
"learning_rate": 3.641030065789562e-07,
"loss": 0.011,
"reward": 0.0828043669462204,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13605968886986375,
"rewards/cosine_scaled_reward": -0.0166609063744545,
"rewards/format_reward": 0.5208333432674408,
"step": 337
},
{
"advantage_max": 0.16837644949555397,
"advantage_mean": -8.692344150018627e-09,
"advantage_min": -0.17715413495898247,
"advantage_std": 0.1369879769627005,
"completion_length": 1891.7292098999023,
"epoch": 0.3862857142857143,
"grad_norm": 0.0030996922869235277,
"kl": 0.0004836171865463257,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0105,
"reward": 0.17488606134429574,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13698797672986984,
"rewards/cosine_scaled_reward": 0.17192834429442883,
"rewards/format_reward": 0.687500013038516,
"step": 338
},
{
"advantage_max": 0.10435130982659757,
"advantage_mean": -2.4447217861212067e-09,
"advantage_min": -0.10210681799799204,
"advantage_std": 0.07953963615000248,
"completion_length": 2805.750030517578,
"epoch": 0.38742857142857146,
"grad_norm": 0.0016661995323374867,
"kl": 0.00048545002937316895,
"learning_rate": 3.5839931879571725e-07,
"loss": -0.0011,
"reward": 0.06721553253009915,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07953964336775243,
"rewards/cosine_scaled_reward": -0.009967771358788013,
"rewards/format_reward": 0.41666667349636555,
"step": 339
},
{
"advantage_max": 0.14381090085953474,
"advantage_mean": -5.975986885897733e-09,
"advantage_min": -0.13768143858760595,
"advantage_std": 0.11206724308431149,
"completion_length": 2374.875030517578,
"epoch": 0.38857142857142857,
"grad_norm": 0.001165210036560893,
"kl": 0.0003544166684150696,
"learning_rate": 3.555614130391079e-07,
"loss": -0.0003,
"reward": 0.13663293584249914,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1120672500692308,
"rewards/cosine_scaled_reward": 0.07016189396381378,
"rewards/format_reward": 0.6666666679084301,
"step": 340
},
{
"advantage_max": 0.11593026760965586,
"advantage_mean": -6.053596623978308e-09,
"advantage_min": -0.1835379470139742,
"advantage_std": 0.11309912154683843,
"completion_length": 2168.2917404174805,
"epoch": 0.38971428571428574,
"grad_norm": 0.0020617349073290825,
"kl": 0.00035562366247177124,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0072,
"reward": 0.19327943865209818,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11309912716387771,
"rewards/cosine_scaled_reward": 0.20469791069626808,
"rewards/format_reward": 0.7291666753590107,
"step": 341
},
{
"advantage_max": 0.2113343793898821,
"advantage_mean": -3.880510857090513e-09,
"advantage_min": -0.21403949242085218,
"advantage_std": 0.19739758502691984,
"completion_length": 2652.5416870117188,
"epoch": 0.39085714285714285,
"grad_norm": 0.0037038561422377825,
"kl": 0.00047537684440612793,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0136,
"reward": 0.12849632510915399,
"reward_advantage_correlation": 1.0,
"reward_std": 0.19739759154617786,
"rewards/cosine_scaled_reward": 0.13792963325977325,
"rewards/format_reward": 0.47916666977107525,
"step": 342
},
{
"advantage_max": 0.1500543192960322,
"advantage_mean": -5.665545727007171e-09,
"advantage_min": -0.19857817236334085,
"advantage_std": 0.14241656986996531,
"completion_length": 2764.666717529297,
"epoch": 0.392,
"grad_norm": 0.002483693417161703,
"kl": 0.00038484111428260803,
"learning_rate": 3.471051066897562e-07,
"loss": 0.0049,
"reward": 0.17116584605537355,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14241657592356205,
"rewards/cosine_scaled_reward": 0.21320676431059837,
"rewards/format_reward": 0.5833333432674408,
"step": 343
},
{
"advantage_max": 0.11467262031510472,
"advantage_mean": -8.343098146373906e-09,
"advantage_min": -0.09952630288898945,
"advantage_std": 0.09163640858605504,
"completion_length": 2078.5417098999023,
"epoch": 0.3931428571428571,
"grad_norm": 0.0015788535820320249,
"kl": 0.00035351328551769257,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.0011,
"reward": 0.20408116653561592,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.09163640951737761,
"rewards/cosine_scaled_reward": 0.2693120799958706,
"rewards/format_reward": 0.6666666679084301,
"step": 344
},
{
"advantage_max": 0.13983573578298092,
"advantage_mean": 3.1820189816400557e-09,
"advantage_min": -0.1527677569538355,
"advantage_std": 0.1231717630289495,
"completion_length": 2897.875030517578,
"epoch": 0.3942857142857143,
"grad_norm": 0.002323306631296873,
"kl": 0.0005290508270263672,
"learning_rate": 3.4151678419606233e-07,
"loss": -0.0001,
"reward": 0.05550580471754074,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1231717630289495,
"rewards/cosine_scaled_reward": 0.00512329675257206,
"rewards/format_reward": 0.3125000037252903,
"step": 345
},
{
"advantage_max": 0.22859193058684468,
"advantage_mean": 1.3969839035565812e-09,
"advantage_min": -0.13696812302805483,
"advantage_std": 0.13573638605885208,
"completion_length": 2998.479217529297,
"epoch": 0.3954285714285714,
"grad_norm": 0.002322471234947443,
"kl": 0.0005121231079101562,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0006,
"reward": 0.04644642909988761,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13573638536036015,
"rewards/cosine_scaled_reward": -0.10306344844866544,
"rewards/format_reward": 0.47916666977107525,
"step": 346
},
{
"advantage_max": 0.1349148927256465,
"advantage_mean": -8.381903310317185e-09,
"advantage_min": -0.10294443322345614,
"advantage_std": 0.09874543640762568,
"completion_length": 3018.1458740234375,
"epoch": 0.3965714285714286,
"grad_norm": 0.002016145968809724,
"kl": 0.0004055500030517578,
"learning_rate": 3.359691059183761e-07,
"loss": 0.004,
"reward": 0.033684686524793506,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.0987454392015934,
"rewards/cosine_scaled_reward": -0.11986671015620232,
"rewards/format_reward": 0.4375000037252903,
"step": 347
},
{
"advantage_max": 0.09889293229207397,
"advantage_mean": 2.0954757998237206e-09,
"advantage_min": -0.1360151378903538,
"advantage_std": 0.10096699907444417,
"completion_length": 2521.062530517578,
"epoch": 0.3977142857142857,
"grad_norm": 0.0023632138036191463,
"kl": 0.0003896355628967285,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0126,
"reward": 0.0665872145909816,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10096699860878289,
"rewards/cosine_scaled_reward": -0.06292372569441795,
"rewards/format_reward": 0.5208333358168602,
"step": 348
},
{
"advantage_max": 0.13384045055136085,
"advantage_mean": 1.5522043372850902e-09,
"advantage_min": -0.09279891615733504,
"advantage_std": 0.08720876974985003,
"completion_length": 3144.2291870117188,
"epoch": 0.39885714285714285,
"grad_norm": 0.0023430907167494297,
"kl": 0.0006818771362304688,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0019,
"reward": 0.03250998561270535,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08720877300947905,
"rewards/cosine_scaled_reward": -0.08978106081485748,
"rewards/format_reward": 0.3750000037252903,
"step": 349
},
{
"advantage_max": 0.2537369290366769,
"advantage_mean": -1.0399769351243648e-08,
"advantage_min": -0.2440725015476346,
"advantage_std": 0.2099163606762886,
"completion_length": 2217.2917404174805,
"epoch": 0.4,
"grad_norm": 0.0038065649569034576,
"kl": 0.0004671439528465271,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0138,
"reward": 0.1614842200651765,
"reward_advantage_correlation": 1.0,
"reward_std": 0.20991637371480465,
"rewards/cosine_scaled_reward": 0.16261385567486286,
"rewards/format_reward": 0.6250000093132257,
"step": 350
},
{
"advantage_max": 0.10746736731380224,
"advantage_mean": -4.346172130520465e-09,
"advantage_min": -0.12306162435561419,
"advantage_std": 0.09336511231958866,
"completion_length": 2644.937530517578,
"epoch": 0.40114285714285713,
"grad_norm": 0.0012864568270742893,
"kl": 0.0005724728107452393,
"learning_rate": 3.250000000000001e-07,
"loss": -0.0015,
"reward": 0.07907040324062109,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.0933651146478951,
"rewards/cosine_scaled_reward": -0.024259530007839203,
"rewards/format_reward": 0.5208333395421505,
"step": 351
},
{
"advantage_max": 0.14927358692511916,
"advantage_mean": -1.5522045454519073e-10,
"advantage_min": -0.13300226628780365,
"advantage_std": 0.11855996306985617,
"completion_length": 2226.562545776367,
"epoch": 0.4022857142857143,
"grad_norm": 0.0017897867364808917,
"kl": 0.0004187921294942498,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0041,
"reward": 0.1405172348022461,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11855996120721102,
"rewards/cosine_scaled_reward": 0.09164215251803398,
"rewards/format_reward": 0.6458333358168602,
"step": 352
},
{
"advantage_max": 0.15562463272362947,
"advantage_mean": -3.1044089521259366e-10,
"advantage_min": -0.1712775742635131,
"advantage_std": 0.13150706025771797,
"completion_length": 2046.375020980835,
"epoch": 0.4034285714285714,
"grad_norm": 0.003071481129154563,
"kl": 0.00032861530780792236,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0127,
"reward": 0.1798698278144002,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1315070646815002,
"rewards/cosine_scaled_reward": 0.20370216853916645,
"rewards/format_reward": 0.6458333432674408,
"step": 353
},
{
"advantage_max": 0.14535299316048622,
"advantage_mean": -7.140139923755839e-09,
"advantage_min": -0.1506730942055583,
"advantage_std": 0.132601466961205,
"completion_length": 2020.5625381469727,
"epoch": 0.4045714285714286,
"grad_norm": 0.0018860672134906054,
"kl": 0.00036281999200582504,
"learning_rate": 3.168878457820915e-07,
"loss": 0.0052,
"reward": 0.1788836452178657,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13260147930122912,
"rewards/cosine_scaled_reward": 0.1834542783908546,
"rewards/format_reward": 0.6875000018626451,
"step": 354
},
{
"advantage_max": 0.13799833692610264,
"advantage_mean": -2.2506962266133357e-09,
"advantage_min": -0.11328241974115372,
"advantage_std": 0.10098979715257883,
"completion_length": 2142.145881652832,
"epoch": 0.4057142857142857,
"grad_norm": 0.0024661533534526825,
"kl": 0.00044939108192920685,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0106,
"reward": 0.17950351699255407,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10098980064503849,
"rewards/cosine_scaled_reward": 0.22795349592342973,
"rewards/format_reward": 0.6041666697710752,
"step": 355
},
{
"advantage_max": 0.17581096477806568,
"advantage_mean": -9.934107980669182e-09,
"advantage_min": -0.17233179230242968,
"advantage_std": 0.1403276165947318,
"completion_length": 2595.979202270508,
"epoch": 0.40685714285714286,
"grad_norm": 0.002081536455079913,
"kl": 0.0004221200942993164,
"learning_rate": 3.115363310950578e-07,
"loss": -0.0018,
"reward": 0.12342565413564444,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1403276203200221,
"rewards/cosine_scaled_reward": 0.03239230625331402,
"rewards/format_reward": 0.6666666716337204,
"step": 356
},
{
"advantage_max": 0.1474649435840547,
"advantage_mean": -7.528191278693974e-09,
"advantage_min": -0.11082311253994703,
"advantage_std": 0.10918143065646291,
"completion_length": 2855.250045776367,
"epoch": 0.408,
"grad_norm": 0.002035984070971608,
"kl": 0.00044214725494384766,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.006,
"reward": 0.0833205720409751,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10918143903836608,
"rewards/cosine_scaled_reward": 0.027413238771259785,
"rewards/format_reward": 0.43750000558793545,
"step": 357
},
{
"advantage_max": 0.17143051512539387,
"advantage_mean": -1.6453365836444078e-08,
"advantage_min": -0.17683418467640877,
"advantage_std": 0.14035541797056794,
"completion_length": 2261.770896911621,
"epoch": 0.40914285714285714,
"grad_norm": 0.0024120674934238195,
"kl": 0.0003642141819000244,
"learning_rate": 3.062313053727671e-07,
"loss": 0.006,
"reward": 0.26533588115125895,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14035542216151953,
"rewards/cosine_scaled_reward": 0.36936704453546554,
"rewards/format_reward": 0.8333333395421505,
"step": 358
},
{
"advantage_max": 0.16622212389484048,
"advantage_mean": -3.414849708560652e-09,
"advantage_min": -0.1195356696844101,
"advantage_std": 0.11384606640785933,
"completion_length": 2067.2083587646484,
"epoch": 0.4102857142857143,
"grad_norm": 0.0015450696228072047,
"kl": 0.0002728961408138275,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.003,
"reward": 0.10825431568082422,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11384607246145606,
"rewards/cosine_scaled_reward": 0.0183385512791574,
"rewards/format_reward": 0.6041666772216558,
"step": 359
},
{
"advantage_max": 0.1626054784283042,
"advantage_mean": -1.226241415352991e-08,
"advantage_min": -0.18612738978117704,
"advantage_std": 0.13540870044380426,
"completion_length": 2324.1875610351562,
"epoch": 0.4114285714285714,
"grad_norm": 0.002945123938843608,
"kl": 0.0004897117614746094,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.008,
"reward": 0.22260520420968533,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13540870510041714,
"rewards/cosine_scaled_reward": 0.3045365456491709,
"rewards/format_reward": 0.7083333488553762,
"step": 360
},
{
"advantage_max": 0.15933177806437016,
"advantage_mean": -4.501392453226671e-09,
"advantage_min": -0.17589706368744373,
"advantage_std": 0.14223455544561148,
"completion_length": 2791.5208740234375,
"epoch": 0.4125714285714286,
"grad_norm": 0.0035066171549260616,
"kl": 0.0005368292331695557,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0078,
"reward": 0.14737363997846842,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14223456801846623,
"rewards/cosine_scaled_reward": 0.1759349994827062,
"rewards/format_reward": 0.5208333432674408,
"step": 361
},
{
"advantage_max": 0.10177448485046625,
"advantage_mean": -1.1175871172941498e-08,
"advantage_min": -0.147513457108289,
"advantage_std": 0.09820342680905014,
"completion_length": 1387.0000305175781,
"epoch": 0.4137142857142857,
"grad_norm": 0.0026847573462873697,
"kl": 0.0003243088722229004,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0035,
"reward": 0.2018571854569018,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09820342832244933,
"rewards/cosine_scaled_reward": 0.1488652601838112,
"rewards/format_reward": 0.8958333432674408,
"step": 362
},
{
"advantage_max": 0.13114572037011385,
"advantage_mean": -4.656612970221907e-09,
"advantage_min": -0.1494987541809678,
"advantage_std": 0.11584013933315873,
"completion_length": 1757.8333473205566,
"epoch": 0.41485714285714287,
"grad_norm": 0.0014682364417240024,
"kl": 0.00039479881525039673,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0004,
"reward": 0.18233315646648407,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11584014166146517,
"rewards/cosine_scaled_reward": 0.1738392524421215,
"rewards/format_reward": 0.7291666716337204,
"step": 363
},
{
"advantage_max": 0.14291757624596357,
"advantage_mean": -4.190951891080985e-09,
"advantage_min": -0.11483692191541195,
"advantage_std": 0.1075741620734334,
"completion_length": 2885.9166717529297,
"epoch": 0.416,
"grad_norm": 0.0023508663289248943,
"kl": 0.0006248950958251953,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.0046,
"reward": 0.06019208254292607,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10757415881380439,
"rewards/cosine_scaled_reward": -0.020222272723913193,
"rewards/format_reward": 0.39583333767950535,
"step": 364
},
{
"advantage_max": 0.1367309088818729,
"advantage_mean": 2.7939678071131624e-09,
"advantage_min": -0.17309968266636133,
"advantage_std": 0.125831242185086,
"completion_length": 2852.395854949951,
"epoch": 0.41714285714285715,
"grad_norm": 0.0022203971166163683,
"kl": 0.0005035400390625,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0045,
"reward": 0.05301872221753001,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12583124404773116,
"rewards/cosine_scaled_reward": -0.022073883563280106,
"rewards/format_reward": 0.3541666828095913,
"step": 365
},
{
"advantage_max": 0.08324602991342545,
"advantage_mean": 9.313225052265395e-10,
"advantage_min": -0.15099230967462063,
"advantage_std": 0.0951183415018022,
"completion_length": 1617.1875267028809,
"epoch": 0.41828571428571426,
"grad_norm": 0.0008901763940230012,
"kl": 0.0003096461296081543,
"learning_rate": 2.854966364683872e-07,
"loss": -0.0006,
"reward": 0.23938876390457153,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09511834103614092,
"rewards/cosine_scaled_reward": 0.3382144197821617,
"rewards/format_reward": 0.7291666716337204,
"step": 366
},
{
"advantage_max": 0.2165116430260241,
"advantage_mean": -4.811833376194841e-09,
"advantage_min": -0.23684036545455456,
"advantage_std": 0.17490943847224116,
"completion_length": 2510.854202270508,
"epoch": 0.41942857142857143,
"grad_norm": 0.002953734714537859,
"kl": 0.00034871697425842285,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0067,
"reward": 0.14961381210014224,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17490944173187017,
"rewards/cosine_scaled_reward": 0.12919975304976106,
"rewards/format_reward": 0.6250000093132257,
"step": 367
},
{
"advantage_max": 0.17113121692091227,
"advantage_mean": -1.5522044899407561e-09,
"advantage_min": -0.0952818775549531,
"advantage_std": 0.10084449546411633,
"completion_length": 2847.395854949951,
"epoch": 0.4205714285714286,
"grad_norm": 0.004048696719110012,
"kl": 0.0005242824554443359,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0112,
"reward": 0.06646438379539177,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10084450151771307,
"rewards/cosine_scaled_reward": 0.03680053818970919,
"rewards/format_reward": 0.31250000186264515,
"step": 368
},
{
"advantage_max": 0.16597222117707133,
"advantage_mean": -3.1432137274911565e-09,
"advantage_min": -0.12302078539505601,
"advantage_std": 0.11180994007736444,
"completion_length": 2262.1666831970215,
"epoch": 0.4217142857142857,
"grad_norm": 0.0016202060505747795,
"kl": 0.00044993311166763306,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0003,
"reward": 0.13039299566298723,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1118099435698241,
"rewards/cosine_scaled_reward": 0.07433954149018973,
"rewards/format_reward": 0.6250000074505806,
"step": 369
},
{
"advantage_max": 0.15608058404177427,
"advantage_mean": 2.716357583310014e-09,
"advantage_min": -0.1327444650232792,
"advantage_std": 0.1269805277697742,
"completion_length": 3054.958396911621,
"epoch": 0.4228571428571429,
"grad_norm": 0.0026487144641578197,
"kl": 0.0006656236946582794,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0055,
"reward": 0.11274721287190914,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12698053661733866,
"rewards/cosine_scaled_reward": 0.14613067544996738,
"rewards/format_reward": 0.3750000037252903,
"step": 370
},
{
"advantage_max": 0.10794967133551836,
"advantage_mean": -2.173086016687975e-09,
"advantage_min": -0.05772289913147688,
"advantage_std": 0.06405279028695077,
"completion_length": 1609.895881652832,
"epoch": 0.424,
"grad_norm": 0.0015763145638629794,
"kl": 0.0003489851951599121,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0069,
"reward": 0.13482660567387938,
"reward_advantage_correlation": 1.0,
"reward_std": 0.06405279482714832,
"rewards/cosine_scaled_reward": 0.043605952989310026,
"rewards/format_reward": 0.7083333395421505,
"step": 371
},
{
"advantage_max": 0.17068721819669008,
"advantage_mean": -8.226682959855403e-09,
"advantage_min": -0.1903815222904086,
"advantage_std": 0.1448813541792333,
"completion_length": 2810.4583854675293,
"epoch": 0.42514285714285716,
"grad_norm": 0.0029529884923249483,
"kl": 0.0004370957612991333,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0065,
"reward": 0.1317643583752215,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14488135650753975,
"rewards/cosine_scaled_reward": 0.15046954539138824,
"rewards/format_reward": 0.47916667349636555,
"step": 372
},
{
"advantage_max": 0.12174444226548076,
"advantage_mean": -8.071462387349015e-09,
"advantage_min": -0.0992058515548706,
"advantage_std": 0.08604467427358031,
"completion_length": 1773.1250305175781,
"epoch": 0.42628571428571427,
"grad_norm": 0.001331821666099131,
"kl": 0.00028437376022338867,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.0018,
"reward": 0.13919993431773037,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08604467613622546,
"rewards/cosine_scaled_reward": 0.0785412099212408,
"rewards/format_reward": 0.6666666716337204,
"step": 373
},
{
"advantage_max": 0.13440924789756536,
"advantage_mean": -6.208817238118058e-09,
"advantage_min": -0.19716255459934473,
"advantage_std": 0.13642028719186783,
"completion_length": 1887.979232788086,
"epoch": 0.42742857142857144,
"grad_norm": 0.002169976709410548,
"kl": 0.00032141804695129395,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0044,
"reward": 0.17201172886416316,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1364202918484807,
"rewards/cosine_scaled_reward": 0.1637210724875331,
"rewards/format_reward": 0.6875000186264515,
"step": 374
},
{
"advantage_max": 0.12574104312807322,
"advantage_mean": -1.3193737075090084e-09,
"advantage_min": -0.16399358585476875,
"advantage_std": 0.12628946546465158,
"completion_length": 2755.687530517578,
"epoch": 0.42857142857142855,
"grad_norm": 0.0029920157976448536,
"kl": 0.0005879402160644531,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0073,
"reward": 0.11453350447118282,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12628946779295802,
"rewards/cosine_scaled_reward": 0.14050849340856075,
"rewards/format_reward": 0.3958333358168602,
"step": 375
},
{
"advantage_max": 0.15478361072018743,
"advantage_mean": 3.8805110513795427e-10,
"advantage_min": -0.08757854904979467,
"advantage_std": 0.09614620194770396,
"completion_length": 2132.7084045410156,
"epoch": 0.4297142857142857,
"grad_norm": 0.001617670408450067,
"kl": 0.00033554062247276306,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.001,
"reward": 0.11974846536759287,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09614620392676443,
"rewards/cosine_scaled_reward": 0.0007210008334368467,
"rewards/format_reward": 0.7083333414047956,
"step": 376
},
{
"advantage_max": 0.2501893350854516,
"advantage_mean": -1.552203920951456e-10,
"advantage_min": -0.15648298431187868,
"advantage_std": 0.16026539681479335,
"completion_length": 3177.5834350585938,
"epoch": 0.4308571428571429,
"grad_norm": 0.002923794789239764,
"kl": 0.0006504058837890625,
"learning_rate": 2.583460445215911e-07,
"loss": 0.0023,
"reward": 0.07427594714681618,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16026540519669652,
"rewards/cosine_scaled_reward": -0.039964438416063786,
"rewards/format_reward": 0.5208333395421505,
"step": 377
},
{
"advantage_max": 0.18498766515403986,
"advantage_mean": -1.4435500295117976e-08,
"advantage_min": -0.18669047579169273,
"advantage_std": 0.14398845378309488,
"completion_length": 1991.2083549499512,
"epoch": 0.432,
"grad_norm": 0.0019245728617534041,
"kl": 0.0003337450325489044,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0059,
"reward": 0.14536084234714508,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14398845750838518,
"rewards/cosine_scaled_reward": 0.11685117019806057,
"rewards/format_reward": 0.6250000111758709,
"step": 378
},
{
"advantage_max": 0.2584569351747632,
"advantage_mean": -4.423782500040385e-09,
"advantage_min": -0.13952340185642242,
"advantage_std": 0.1552307652309537,
"completion_length": 2956.687530517578,
"epoch": 0.43314285714285716,
"grad_norm": 0.002495537744835019,
"kl": 0.0005144476890563965,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.0077,
"reward": 0.023960275422723498,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1552307764068246,
"rewards/cosine_scaled_reward": -0.10624610353261232,
"rewards/format_reward": 0.35416667349636555,
"step": 379
},
{
"advantage_max": 0.19467550422996283,
"advantage_mean": -3.880510787701574e-09,
"advantage_min": -0.14030754379928112,
"advantage_std": 0.14547208277508616,
"completion_length": 2132.416748046875,
"epoch": 0.4342857142857143,
"grad_norm": 0.0023121978156268597,
"kl": 0.0005660057067871094,
"learning_rate": 2.512332043064913e-07,
"loss": -0.0004,
"reward": 0.13492502574808896,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14547208370640874,
"rewards/cosine_scaled_reward": 0.034852080047130585,
"rewards/format_reward": 0.7291666772216558,
"step": 380
},
{
"advantage_max": 0.1496616357471794,
"advantage_mean": -3.0267984368892442e-09,
"advantage_min": -0.18097604904323816,
"advantage_std": 0.13761152140796185,
"completion_length": 2775.8541984558105,
"epoch": 0.43542857142857144,
"grad_norm": 0.002602294785901904,
"kl": 0.0006687045097351074,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0076,
"reward": 0.17733385832980275,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13761152466759086,
"rewards/cosine_scaled_reward": 0.233534662052989,
"rewards/format_reward": 0.5833333395421505,
"step": 381
},
{
"advantage_max": 0.11240688525140285,
"advantage_mean": 9.313225607376907e-10,
"advantage_min": -0.13521632878109813,
"advantage_std": 0.10715258843265474,
"completion_length": 1760.1875305175781,
"epoch": 0.43657142857142855,
"grad_norm": 0.0019332170486450195,
"kl": 0.0004322826862335205,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0041,
"reward": 0.12760076066479087,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10715258959680796,
"rewards/cosine_scaled_reward": -0.028475773753598332,
"rewards/format_reward": 0.8125000074505806,
"step": 382
},
{
"advantage_max": 0.21556610194966197,
"advantage_mean": -2.405916715852996e-09,
"advantage_min": -0.14847189001739025,
"advantage_std": 0.14524834416806698,
"completion_length": 2493.6041946411133,
"epoch": 0.4377142857142857,
"grad_norm": 0.0023037714418023825,
"kl": 0.0005925819277763367,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.0052,
"reward": 0.12728559458628297,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14524835254997015,
"rewards/cosine_scaled_reward": 0.11582583468407393,
"rewards/format_reward": 0.5208333376795053,
"step": 383
},
{
"advantage_max": 0.1525868414901197,
"advantage_mean": 5.432715943776145e-10,
"advantage_min": -0.22444791439920664,
"advantage_std": 0.15500889671966434,
"completion_length": 2127.1250381469727,
"epoch": 0.43885714285714283,
"grad_norm": 0.003150203498080373,
"kl": 0.0004943348467350006,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.005,
"reward": 0.27685068640857935,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15500890417024493,
"rewards/cosine_scaled_reward": 0.45512693375349045,
"rewards/format_reward": 0.7291666753590107,
"step": 384
},
{
"advantage_max": 0.2526491954922676,
"advantage_mean": -3.259629136054265e-09,
"advantage_min": -0.16856408398598433,
"advantage_std": 0.17220128513872623,
"completion_length": 2676.3333892822266,
"epoch": 0.44,
"grad_norm": 0.0027297367341816425,
"kl": 0.0005096197128295898,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0084,
"reward": 0.08310869638808072,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17220128886401653,
"rewards/cosine_scaled_reward": -0.046377929858863354,
"rewards/format_reward": 0.583333345130086,
"step": 385
},
{
"advantage_max": 0.077841951046139,
"advantage_mean": -2.1730861277102775e-09,
"advantage_min": -0.13375738728791475,
"advantage_std": 0.08136873878538609,
"completion_length": 2538.1458740234375,
"epoch": 0.44114285714285717,
"grad_norm": 0.0015308266738429666,
"kl": 0.0005868077278137207,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0024,
"reward": 0.08394389343447983,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08136874157935381,
"rewards/cosine_scaled_reward": -0.011836465448141098,
"rewards/format_reward": 0.5208333395421505,
"step": 386
},
{
"advantage_max": 0.16614294797182083,
"advantage_mean": -3.1044086884479682e-09,
"advantage_min": -0.12347709108144045,
"advantage_std": 0.1082666483707726,
"completion_length": 2941.6042098999023,
"epoch": 0.4422857142857143,
"grad_norm": 0.0023464104160666466,
"kl": 0.0006274953484535217,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0088,
"reward": 0.06053544546011835,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10826664976775646,
"rewards/cosine_scaled_reward": -0.009756912477314472,
"rewards/format_reward": 0.3750000037252903,
"step": 387
},
{
"advantage_max": 0.11136521841399372,
"advantage_mean": 4.3461721582760404e-09,
"advantage_min": -0.10910094575956464,
"advantage_std": 0.08894198993220925,
"completion_length": 2469.625015258789,
"epoch": 0.44342857142857145,
"grad_norm": 0.0017324852524325252,
"kl": 0.000505693256855011,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.0039,
"reward": 0.07375043304637074,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08894199575297534,
"rewards/cosine_scaled_reward": 0.010048863710835576,
"rewards/format_reward": 0.4166666716337204,
"step": 388
},
{
"advantage_max": 0.1969348695129156,
"advantage_mean": 3.880510079934396e-10,
"advantage_min": -0.140401273034513,
"advantage_std": 0.13098299829289317,
"completion_length": 2594.0000534057617,
"epoch": 0.44457142857142856,
"grad_norm": 0.002368423156440258,
"kl": 0.0004792213439941406,
"learning_rate": 2.306931685585657e-07,
"loss": 0.0015,
"reward": 0.04259849968366325,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1309830006211996,
"rewards/cosine_scaled_reward": -0.13581918645650148,
"rewards/format_reward": 0.5208333432674408,
"step": 389
},
{
"advantage_max": 0.11635044636204839,
"advantage_mean": -6.5192582443529545e-09,
"advantage_min": -0.16502864565700293,
"advantage_std": 0.1230549905449152,
"completion_length": 2135.9791870117188,
"epoch": 0.44571428571428573,
"grad_norm": 0.0020190367940813303,
"kl": 0.00028385967016220093,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0016,
"reward": 0.1558728562667966,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12305499846115708,
"rewards/cosine_scaled_reward": 0.1262847138568759,
"rewards/format_reward": 0.666666679084301,
"step": 390
},
{
"advantage_max": 0.20053921593353152,
"advantage_mean": -4.579002690907608e-09,
"advantage_min": -0.20975689589977264,
"advantage_std": 0.17516077309846878,
"completion_length": 2303.3959159851074,
"epoch": 0.44685714285714284,
"grad_norm": 0.0026405281387269497,
"kl": 0.0004959553480148315,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0077,
"reward": 0.19714019040111452,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1751607726328075,
"rewards/cosine_scaled_reward": 0.23575006239116192,
"rewards/format_reward": 0.6875000037252903,
"step": 391
},
{
"advantage_max": 0.10962881101295352,
"advantage_mean": -5.743155964688107e-09,
"advantage_min": -0.13240550691261888,
"advantage_std": 0.0982381934300065,
"completion_length": 1689.0208702087402,
"epoch": 0.448,
"grad_norm": 0.001231748261488974,
"kl": 0.00031384825706481934,
"learning_rate": 2.2412266235313973e-07,
"loss": -0.0001,
"reward": 0.16504641436040401,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09823819668963552,
"rewards/cosine_scaled_reward": 0.06864389590919018,
"rewards/format_reward": 0.8333333358168602,
"step": 392
},
{
"advantage_max": 0.21035106386989355,
"advantage_mean": -6.364037963280111e-09,
"advantage_min": -0.136757155880332,
"advantage_std": 0.136241948697716,
"completion_length": 2200.5625381469727,
"epoch": 0.4491428571428571,
"grad_norm": 0.0022684852592647076,
"kl": 0.0004363059997558594,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0052,
"reward": 0.11575184087269008,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13624195754528046,
"rewards/cosine_scaled_reward": 0.007750899763777852,
"rewards/format_reward": 0.6666666679084301,
"step": 393
},
{
"advantage_max": 0.28035250771790743,
"advantage_mean": -1.4745940649096845e-09,
"advantage_min": -0.1694905385375023,
"advantage_std": 0.18372618919238448,
"completion_length": 2991.666717529297,
"epoch": 0.4502857142857143,
"grad_norm": 0.0034687870647758245,
"kl": 0.0006244778633117676,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.0111,
"reward": 0.03293494783429196,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.18372619384899735,
"rewards/cosine_scaled_reward": -0.0796794897178188,
"rewards/format_reward": 0.35416666977107525,
"step": 394
},
{
"advantage_max": 0.1525165536440909,
"advantage_mean": -7.974449859129984e-09,
"advantage_min": -0.09334612678503618,
"advantage_std": 0.0937539076549001,
"completion_length": 1960.2083435058594,
"epoch": 0.4514285714285714,
"grad_norm": 0.001557901268824935,
"kl": 0.0003572404384613037,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0093,
"reward": 0.07403434929437935,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09375390998320654,
"rewards/cosine_scaled_reward": -0.06747329549398273,
"rewards/format_reward": 0.5625000018626451,
"step": 395
},
{
"advantage_max": 0.1718710558488965,
"advantage_mean": -3.1044083970144243e-10,
"advantage_min": -0.20201640482991934,
"advantage_std": 0.14323359774425626,
"completion_length": 2671.5208892822266,
"epoch": 0.45257142857142857,
"grad_norm": 0.0023834805469959974,
"kl": 0.0004636496305465698,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0118,
"reward": 0.1488352312007919,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1432336075231433,
"rewards/cosine_scaled_reward": 0.13565819896757603,
"rewards/format_reward": 0.6041666828095913,
"step": 396
},
{
"advantage_max": 0.1686799516901374,
"advantage_mean": -6.829699236710063e-09,
"advantage_min": -0.1410632198676467,
"advantage_std": 0.1290680062957108,
"completion_length": 2122.1250534057617,
"epoch": 0.45371428571428574,
"grad_norm": 0.002183465054258704,
"kl": 0.00043454766273498535,
"learning_rate": 2.134908592756607e-07,
"loss": 0.0065,
"reward": 0.16060965787619352,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12906800862401724,
"rewards/cosine_scaled_reward": 0.10085129458457232,
"rewards/format_reward": 0.7500000111758709,
"step": 397
},
{
"advantage_max": 0.16326016373932362,
"advantage_mean": -4.656613053488634e-09,
"advantage_min": -0.14716396015137434,
"advantage_std": 0.12772608175873756,
"completion_length": 2271.208381652832,
"epoch": 0.45485714285714285,
"grad_norm": 0.0022896837908774614,
"kl": 0.00048645585775375366,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.0046,
"reward": 0.12706264690496027,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12772608385421336,
"rewards/cosine_scaled_reward": 0.041912979912012815,
"rewards/format_reward": 0.6666666809469461,
"step": 398
},
{
"advantage_max": 0.16285632457584143,
"advantage_mean": -5.355104866489047e-09,
"advantage_min": -0.16017363499850035,
"advantage_std": 0.12993760779500008,
"completion_length": 1922.708396911621,
"epoch": 0.456,
"grad_norm": 0.002509958343580365,
"kl": 0.0004304051399230957,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0073,
"reward": 0.18827908392995596,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1299376110546291,
"rewards/cosine_scaled_reward": 0.18050863035023212,
"rewards/format_reward": 0.7500000074505806,
"step": 399
},
{
"advantage_max": 0.17384717427194118,
"advantage_mean": -1.862645149230957e-09,
"advantage_min": -0.17707005143165588,
"advantage_std": 0.14918375061824918,
"completion_length": 1479.0833930969238,
"epoch": 0.45714285714285713,
"grad_norm": 0.002644736086949706,
"kl": 0.00038166344165802,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0081,
"reward": 0.2606083396822214,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14918375457637012,
"rewards/cosine_scaled_reward": 0.32361013628542423,
"rewards/format_reward": 0.8958333358168602,
"step": 400
},
{
"advantage_max": 0.12657041382044554,
"advantage_mean": -1.901450281949213e-09,
"advantage_min": -0.11912647541612387,
"advantage_std": 0.10337408259510994,
"completion_length": 2812.875030517578,
"epoch": 0.4582857142857143,
"grad_norm": 0.0014676002319902182,
"kl": 0.0006059408187866211,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0035,
"reward": 0.019788147183135152,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10337408352643251,
"rewards/cosine_scaled_reward": -0.10925112292170525,
"rewards/format_reward": 0.3333333358168602,
"step": 401
},
{
"advantage_max": 0.12366076093167067,
"advantage_mean": -2.0178656801039807e-09,
"advantage_min": -0.1171658206731081,
"advantage_std": 0.09674888919107616,
"completion_length": 2147.979179382324,
"epoch": 0.4594285714285714,
"grad_norm": 0.001693824538961053,
"kl": 0.00046128034591674805,
"learning_rate": 2.032690407508949e-07,
"loss": 0.0004,
"reward": 0.09151852503418922,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09674889012239873,
"rewards/cosine_scaled_reward": -0.03256989782676101,
"rewards/format_reward": 0.6041666716337204,
"step": 402
},
{
"advantage_max": 0.08860299317166209,
"advantage_mean": -5.277494774524882e-09,
"advantage_min": -0.13536688731983304,
"advantage_std": 0.08131288702134043,
"completion_length": 1687.0208473205566,
"epoch": 0.4605714285714286,
"grad_norm": 0.001448645954951644,
"kl": 0.0003199884667992592,
"learning_rate": 2.0127498008311922e-07,
"loss": -0.0013,
"reward": 0.17468450870364904,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08131289132870734,
"rewards/cosine_scaled_reward": 0.08974489662796259,
"rewards/format_reward": 0.8541666716337204,
"step": 403
},
{
"advantage_max": 0.1484815194271505,
"advantage_mean": -2.949188129819369e-09,
"advantage_min": -0.14216615236364305,
"advantage_std": 0.11517497175373137,
"completion_length": 2319.9791984558105,
"epoch": 0.4617142857142857,
"grad_norm": 0.002166890539228916,
"kl": 0.00048048049211502075,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.0118,
"reward": 0.09523998136864975,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11517497431486845,
"rewards/cosine_scaled_reward": 0.029446275904774666,
"rewards/format_reward": 0.5000000074505806,
"step": 404
},
{
"advantage_max": 0.18142827786505222,
"advantage_mean": -8.537124618346326e-10,
"advantage_min": -0.14328955672681332,
"advantage_std": 0.12373507604934275,
"completion_length": 2082.166679382324,
"epoch": 0.46285714285714286,
"grad_norm": 0.0020297281444072723,
"kl": 0.0005799531936645508,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.0031,
"reward": 0.1628161850385368,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12373507721349597,
"rewards/cosine_scaled_reward": 0.19531266274861991,
"rewards/format_reward": 0.5625000018626451,
"step": 405
},
{
"advantage_max": 0.18078680709004402,
"advantage_mean": -8.071462574699151e-09,
"advantage_min": -0.18061870522797108,
"advantage_std": 0.14956936822272837,
"completion_length": 1885.7500534057617,
"epoch": 0.464,
"grad_norm": 0.002431582659482956,
"kl": 0.0005524754524230957,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.003,
"reward": 0.1681446279399097,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14956937148235738,
"rewards/cosine_scaled_reward": 0.10558001510798931,
"rewards/format_reward": 0.7708333432674408,
"step": 406
},
{
"advantage_max": 0.13299659185577184,
"advantage_mean": -9.924406620294424e-09,
"advantage_min": -0.1531422910047695,
"advantage_std": 0.12285241187782958,
"completion_length": 2295.9166870117188,
"epoch": 0.46514285714285714,
"grad_norm": 0.0032989894971251488,
"kl": 0.0004259645938873291,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0124,
"reward": 0.12349864930001786,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12285241659265012,
"rewards/cosine_scaled_reward": 0.05015348456799984,
"rewards/format_reward": 0.6250000055879354,
"step": 407
},
{
"advantage_max": 0.16066291369497776,
"advantage_mean": -6.053597137456457e-09,
"advantage_min": -0.15431132726371288,
"advantage_std": 0.13571413152385503,
"completion_length": 2232.2708473205566,
"epoch": 0.4662857142857143,
"grad_norm": 0.002392456866800785,
"kl": 0.0004053637385368347,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0115,
"reward": 0.1337297521531582,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13571413629688323,
"rewards/cosine_scaled_reward": 0.1021328023634851,
"rewards/format_reward": 0.5833333395421505,
"step": 408
},
{
"advantage_max": 0.16096539422869682,
"advantage_mean": -6.984920142283357e-10,
"advantage_min": -0.13622304517775774,
"advantage_std": 0.13016468053683639,
"completion_length": 3345.8333740234375,
"epoch": 0.4674285714285714,
"grad_norm": 0.0023022436071187258,
"kl": 0.0006246566772460938,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0106,
"reward": 0.05926420073956251,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1301646833308041,
"rewards/cosine_scaled_reward": 0.00888601504266262,
"rewards/format_reward": 0.3333333395421505,
"step": 409
},
{
"advantage_max": 0.16869010031223297,
"advantage_mean": -1.2029583600081661e-08,
"advantage_min": -0.14472126122564077,
"advantage_std": 0.1374293458648026,
"completion_length": 2234.250057220459,
"epoch": 0.4685714285714286,
"grad_norm": 0.0026337350718677044,
"kl": 0.00045359134674072266,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.0022,
"reward": 0.15104275988414884,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13742934702895582,
"rewards/cosine_scaled_reward": 0.11242910474538803,
"rewards/format_reward": 0.6666666716337204,
"step": 410
},
{
"advantage_max": 0.14711505104787648,
"advantage_mean": 2.890980579928204e-09,
"advantage_min": -0.1517161400988698,
"advantage_std": 0.12216609879396856,
"completion_length": 2739.750026702881,
"epoch": 0.4697142857142857,
"grad_norm": 0.0020477049984037876,
"kl": 0.0005456209182739258,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0069,
"reward": 0.0763370256536291,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12216610834002495,
"rewards/cosine_scaled_reward": 0.004631456453353167,
"rewards/format_reward": 0.4375000037252903,
"step": 411
},
{
"advantage_max": 0.1390043180435896,
"advantage_mean": -7.450580596923828e-09,
"advantage_min": -0.16988389380276203,
"advantage_std": 0.12418814655393362,
"completion_length": 2551.3125534057617,
"epoch": 0.47085714285714286,
"grad_norm": 0.002181727671995759,
"kl": 0.0006371736526489258,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0008,
"reward": 0.1608741357922554,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12418815260753036,
"rewards/cosine_scaled_reward": 0.1822793409228325,
"rewards/format_reward": 0.5833333432674408,
"step": 412
},
{
"advantage_max": 0.18585596792399883,
"advantage_mean": 3.2596291082986895e-09,
"advantage_min": -0.20336279086768627,
"advantage_std": 0.15997914131730795,
"completion_length": 2291.4375534057617,
"epoch": 0.472,
"grad_norm": 0.0022269051987677813,
"kl": 0.0003896765410900116,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0019,
"reward": 0.13341048173606396,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15997914737090468,
"rewards/cosine_scaled_reward": 0.10117785283364356,
"rewards/format_reward": 0.5833333395421505,
"step": 413
},
{
"advantage_max": 0.23772612866014242,
"advantage_mean": -2.79396782099095e-09,
"advantage_min": -0.20109373703598976,
"advantage_std": 0.18670698534697294,
"completion_length": 2825.8750610351562,
"epoch": 0.47314285714285714,
"grad_norm": 0.0031252303160727024,
"kl": 0.00036650896072387695,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0021,
"reward": 0.09702186938375235,
"reward_advantage_correlation": 1.0,
"reward_std": 0.18670698441565037,
"rewards/cosine_scaled_reward": 0.015147236525081098,
"rewards/format_reward": 0.5416666753590107,
"step": 414
},
{
"advantage_max": 0.1667441390454769,
"advantage_mean": 1.3969839451899446e-09,
"advantage_min": -0.10517374519258738,
"advantage_std": 0.11341422703117132,
"completion_length": 3224.6458435058594,
"epoch": 0.4742857142857143,
"grad_norm": 0.00234918761998415,
"kl": 0.0006085634231567383,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0023,
"reward": -0.02207179879769683,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11341423215344548,
"rewards/cosine_scaled_reward": -0.16945138771552593,
"rewards/format_reward": 0.2083333358168602,
"step": 415
},
{
"advantage_max": 0.19994694832712412,
"advantage_mean": -7.217750140620094e-09,
"advantage_min": -0.160794363822788,
"advantage_std": 0.13797596981748939,
"completion_length": 1336.5208473205566,
"epoch": 0.4754285714285714,
"grad_norm": 0.0012100542662665248,
"kl": 0.00020557385869324207,
"learning_rate": 1.7693309235023127e-07,
"loss": -0.0004,
"reward": 0.16311359032988548,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13797597540542483,
"rewards/cosine_scaled_reward": 0.020437828032299876,
"rewards/format_reward": 0.9166666679084301,
"step": 416
},
{
"advantage_max": 0.06850483757443726,
"advantage_mean": -1.8626452047421083e-09,
"advantage_min": -0.09264840371906757,
"advantage_std": 0.06786012463271618,
"completion_length": 2991.604202270508,
"epoch": 0.4765714285714286,
"grad_norm": 0.001323464559391141,
"kl": 0.0007146112620830536,
"learning_rate": 1.7518544168045524e-07,
"loss": -0.0018,
"reward": 0.07695972826331854,
"reward_advantage_correlation": 1.0,
"reward_std": 0.06786012649536133,
"rewards/cosine_scaled_reward": 0.037449623458087444,
"rewards/format_reward": 0.375,
"step": 417
},
{
"advantage_max": 0.08946323348209262,
"advantage_mean": 1.5522044760629683e-10,
"advantage_min": -0.08228053990751505,
"advantage_std": 0.06771091069094837,
"completion_length": 2153.7708587646484,
"epoch": 0.4777142857142857,
"grad_norm": 0.0015419662231579423,
"kl": 0.000576341524720192,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0038,
"reward": 0.12136844790074974,
"reward_advantage_correlation": 1.0,
"reward_std": 0.06771091185510159,
"rewards/cosine_scaled_reward": 0.05540352314710617,
"rewards/format_reward": 0.6041666679084301,
"step": 418
},
{
"advantage_max": 0.09467934165149927,
"advantage_mean": -8.071462193059986e-09,
"advantage_min": -0.13740856852382421,
"advantage_std": 0.09112847782671452,
"completion_length": 2377.7708587646484,
"epoch": 0.47885714285714287,
"grad_norm": 0.0011218616273254156,
"kl": 0.00043688714504241943,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.0027,
"reward": 0.10573733225464821,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09112848108634353,
"rewards/cosine_scaled_reward": 0.051418233662843704,
"rewards/format_reward": 0.520833333954215,
"step": 419
},
{
"advantage_max": 0.13071657810360193,
"advantage_mean": -3.1044085913034536e-09,
"advantage_min": -0.1421688578557223,
"advantage_std": 0.1184294882696122,
"completion_length": 1442.6667213439941,
"epoch": 0.48,
"grad_norm": 0.001580077805556357,
"kl": 0.0002828836441040039,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0055,
"reward": 0.0912556970724836,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11842949106357992,
"rewards/cosine_scaled_reward": -0.1258711563423276,
"rewards/format_reward": 0.7916666772216558,
"step": 420
},
{
"advantage_max": 0.23604473238810897,
"advantage_mean": -3.880510801579362e-09,
"advantage_min": -0.19979441072791815,
"advantage_std": 0.17005935590714216,
"completion_length": 2882.000030517578,
"epoch": 0.48114285714285715,
"grad_norm": 0.0032496661879122257,
"kl": 0.0005581378936767578,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0113,
"reward": 0.06888013612478971,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17005936661735177,
"rewards/cosine_scaled_reward": 0.015190824400633574,
"rewards/format_reward": 0.37500001303851604,
"step": 421
},
{
"advantage_max": 0.13127990905195475,
"advantage_mean": -2.3283065475609988e-09,
"advantage_min": -0.08840413391590118,
"advantage_std": 0.08595326798968017,
"completion_length": 2885.041717529297,
"epoch": 0.48228571428571426,
"grad_norm": 0.001612317399121821,
"kl": 0.0006029903888702393,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0004,
"reward": 0.02790595730766654,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08595326961949468,
"rewards/cosine_scaled_reward": -0.08409846760332584,
"rewards/format_reward": 0.33333334140479565,
"step": 422
},
{
"advantage_max": 0.18713722238317132,
"advantage_mean": -4.423782389018083e-09,
"advantage_min": -0.15640555322170258,
"advantage_std": 0.1455209826817736,
"completion_length": 3003.3958892822266,
"epoch": 0.48342857142857143,
"grad_norm": 0.002777996240183711,
"kl": 0.0006363391876220703,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0054,
"reward": 0.07326357485726476,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14552099234424531,
"rewards/cosine_scaled_reward": 0.03013175167143345,
"rewards/format_reward": 0.37500000931322575,
"step": 423
},
{
"advantage_max": 0.19879988790489733,
"advantage_mean": -1.552203920951456e-10,
"advantage_min": -0.15926985908299685,
"advantage_std": 0.15668703592382371,
"completion_length": 2868.604217529297,
"epoch": 0.4845714285714286,
"grad_norm": 0.0034800267312675714,
"kl": 0.0005161762237548828,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0092,
"reward": 0.057857689214870334,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15668703895062208,
"rewards/cosine_scaled_reward": -0.038518927060067654,
"rewards/format_reward": 0.4166666753590107,
"step": 424
},
{
"advantage_max": 0.16558178514242172,
"advantage_mean": -1.3038516322172455e-08,
"advantage_min": -0.19483254104852676,
"advantage_std": 0.13980256451759487,
"completion_length": 1757.3750305175781,
"epoch": 0.4857142857142857,
"grad_norm": 0.0022031443659216166,
"kl": 0.00030015595257282257,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0071,
"reward": 0.28972805850207806,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1398025705711916,
"rewards/cosine_scaled_reward": 0.43010698398575187,
"rewards/format_reward": 0.8541666753590107,
"step": 425
},
{
"advantage_max": 0.12429485190659761,
"advantage_mean": 2.949188268597247e-09,
"advantage_min": -0.12924165464937687,
"advantage_std": 0.10551139246672392,
"completion_length": 2043.2083625793457,
"epoch": 0.4868571428571429,
"grad_norm": 0.0011740243062376976,
"kl": 0.00042065978050231934,
"learning_rate": 1.6028856829700258e-07,
"loss": -0.0014,
"reward": 0.13536178693175316,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10551139432936907,
"rewards/cosine_scaled_reward": 0.11793831922113895,
"rewards/format_reward": 0.5625000149011612,
"step": 426
},
{
"advantage_max": 0.09893368650227785,
"advantage_mean": -6.674478740531509e-09,
"advantage_min": -0.11052933987230062,
"advantage_std": 0.08068146975710988,
"completion_length": 3011.9583435058594,
"epoch": 0.488,
"grad_norm": 0.0013815592974424362,
"kl": 0.00067901611328125,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.001,
"reward": 0.04093674477189779,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08068147208541632,
"rewards/cosine_scaled_reward": -0.035097976215183735,
"rewards/format_reward": 0.31250000186264515,
"step": 427
},
{
"advantage_max": 0.22580352891236544,
"advantage_mean": -3.531264877731921e-09,
"advantage_min": -0.23063123784959316,
"advantage_std": 0.18543015886098146,
"completion_length": 2163.0625381469727,
"epoch": 0.48914285714285716,
"grad_norm": 0.0028671131003648043,
"kl": 0.0004477202892303467,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0143,
"reward": 0.1508345203474164,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1854301649145782,
"rewards/cosine_scaled_reward": 0.12118915654718876,
"rewards/format_reward": 0.6458333414047956,
"step": 428
},
{
"advantage_max": 0.11988749261945486,
"advantage_mean": -2.7939678903798892e-09,
"advantage_min": -0.14586800010874867,
"advantage_std": 0.11273562628775835,
"completion_length": 1969.0833625793457,
"epoch": 0.49028571428571427,
"grad_norm": 0.0015224118251353502,
"kl": 0.000494047999382019,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0048,
"reward": 0.10409643454477191,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1127356318756938,
"rewards/cosine_scaled_reward": -0.03846575319766998,
"rewards/format_reward": 0.6875000074505806,
"step": 429
},
{
"advantage_max": 0.1451888126321137,
"advantage_mean": -4.423782340445825e-09,
"advantage_min": -0.1432971404865384,
"advantage_std": 0.10751870181411505,
"completion_length": 2194.375030517578,
"epoch": 0.49142857142857144,
"grad_norm": 0.0018724583787843585,
"kl": 0.0004582032561302185,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0069,
"reward": 0.13761766906827688,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10751870553940535,
"rewards/cosine_scaled_reward": 0.09085702430456877,
"rewards/format_reward": 0.6250000055879354,
"step": 430
},
{
"advantage_max": 0.15991243068128824,
"advantage_mean": -2.949188282475035e-09,
"advantage_min": -0.15987397450953722,
"advantage_std": 0.126675630453974,
"completion_length": 2091.020851135254,
"epoch": 0.49257142857142855,
"grad_norm": 0.0018920974107459188,
"kl": 0.0006821155548095703,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0019,
"reward": 0.08629386406391859,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12667563511058688,
"rewards/cosine_scaled_reward": -0.03934409748762846,
"rewards/format_reward": 0.5833333414047956,
"step": 431
},
{
"advantage_max": 0.22554702498018742,
"advantage_mean": -4.3461720750093136e-09,
"advantage_min": -0.2308659916743636,
"advantage_std": 0.18240982363931835,
"completion_length": 2816.7083892822266,
"epoch": 0.4937142857142857,
"grad_norm": 0.003146486124023795,
"kl": 0.00060272216796875,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.0022,
"reward": 0.1152828261256218,
"reward_advantage_correlation": 1.0,
"reward_std": 0.18240981781855226,
"rewards/cosine_scaled_reward": 0.09906804747879505,
"rewards/format_reward": 0.47916668094694614,
"step": 432
},
{
"advantage_max": 0.12062860745936632,
"advantage_mean": -5.452117637569365e-09,
"advantage_min": -0.1215288108214736,
"advantage_std": 0.09663968626409769,
"completion_length": 2868.7916679382324,
"epoch": 0.4948571428571429,
"grad_norm": 0.0015883512096479535,
"kl": 0.0004881918430328369,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0015,
"reward": 0.06807709392160177,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09663968672975898,
"rewards/cosine_scaled_reward": 0.023668689653277397,
"rewards/format_reward": 0.3541666753590107,
"step": 433
},
{
"advantage_max": 0.14515302376821637,
"advantage_mean": -2.0954758137015084e-09,
"advantage_min": -0.09941379074007273,
"advantage_std": 0.0925137703306973,
"completion_length": 2770.3542098999023,
"epoch": 0.496,
"grad_norm": 0.0017540472326800227,
"kl": 0.000606052577495575,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0031,
"reward": -0.016186986584216356,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09251376986503601,
"rewards/cosine_scaled_reward": -0.22496083891019225,
"rewards/format_reward": 0.35416667349636555,
"step": 434
},
{
"advantage_max": 0.11189865134656429,
"advantage_mean": -4.03573130469681e-09,
"advantage_min": -0.08333402825519443,
"advantage_std": 0.076182265591342,
"completion_length": 2093.437530517578,
"epoch": 0.49714285714285716,
"grad_norm": 0.0017215419793501496,
"kl": 0.0007574558258056641,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0039,
"reward": 0.052155050449073315,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.07618226658087224,
"rewards/cosine_scaled_reward": -0.11770580988377333,
"rewards/format_reward": 0.5416666716337204,
"step": 435
},
{
"advantage_max": 0.17040221765637398,
"advantage_mean": -1.552203920951456e-10,
"advantage_min": -0.15949644800275564,
"advantage_std": 0.12988637061789632,
"completion_length": 1972.1458778381348,
"epoch": 0.4982857142857143,
"grad_norm": 0.0017543588764965534,
"kl": 0.0003309226594865322,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.002,
"reward": 0.14922187570482492,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12988637015223503,
"rewards/cosine_scaled_reward": 0.1184953460469842,
"rewards/format_reward": 0.6458333358168602,
"step": 436
},
{
"advantage_max": 0.06821496458724141,
"advantage_mean": 5.820767028097418e-10,
"advantage_min": -0.08236566046252847,
"advantage_std": 0.05960237327963114,
"completion_length": 2729.250015258789,
"epoch": 0.49942857142857144,
"grad_norm": 0.0011862348765134811,
"kl": 0.0005130767822265625,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.0057,
"reward": 0.05111697223037481,
"reward_advantage_correlation": 1.0,
"reward_std": 0.059602373745292425,
"rewards/cosine_scaled_reward": -0.08793694153428078,
"rewards/format_reward": 0.4791666716337204,
"step": 437
},
{
"advantage_max": 0.17163235694169998,
"advantage_mean": -3.104409923571083e-10,
"advantage_min": -0.21905045211315155,
"advantage_std": 0.15708328178152442,
"completion_length": 2668.937545776367,
"epoch": 0.5005714285714286,
"grad_norm": 0.0025618094950914383,
"kl": 0.0005084872245788574,
"learning_rate": 1.4282782639029128e-07,
"loss": -0.003,
"reward": 0.08607154805213213,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15708328364416957,
"rewards/cosine_scaled_reward": -0.00721331313252449,
"rewards/format_reward": 0.5208333488553762,
"step": 438
},
{
"advantage_max": 0.1785668469965458,
"advantage_mean": -3.4148494171271082e-09,
"advantage_min": -0.18488147668540478,
"advantage_std": 0.1533331573009491,
"completion_length": 2452.5208740234375,
"epoch": 0.5017142857142857,
"grad_norm": 0.0026862856466323137,
"kl": 0.0005292594432830811,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.0009,
"reward": 0.07428326783701777,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15333315962925553,
"rewards/cosine_scaled_reward": -0.029012007638812065,
"rewards/format_reward": 0.5000000167638063,
"step": 439
},
{
"advantage_max": 0.14857979817315936,
"advantage_mean": 6.208816794028849e-10,
"advantage_min": -0.09883663896471262,
"advantage_std": 0.09249451011419296,
"completion_length": 2944.875030517578,
"epoch": 0.5028571428571429,
"grad_norm": 0.001710064709186554,
"kl": 0.0007870197296142578,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.0032,
"reward": -0.016045190238628493,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09249451290816069,
"rewards/cosine_scaled_reward": -0.17368111293762922,
"rewards/format_reward": 0.2500000037252903,
"step": 440
},
{
"advantage_max": 0.1465267646126449,
"advantage_mean": 2.3283067834833915e-10,
"advantage_min": -0.12699715700000525,
"advantage_std": 0.11076962715014815,
"completion_length": 2901.958354949951,
"epoch": 0.504,
"grad_norm": 0.0018061235314235091,
"kl": 0.0007097125053405762,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0036,
"reward": 0.06491660978645086,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11076962575316429,
"rewards/cosine_scaled_reward": 0.0063933562487363815,
"rewards/format_reward": 0.3750000037252903,
"step": 441
},
{
"advantage_max": 0.17049752548336983,
"advantage_mean": 3.8805109126016646e-09,
"advantage_min": -0.11352705024182796,
"advantage_std": 0.118473204318434,
"completion_length": 2633.6250076293945,
"epoch": 0.5051428571428571,
"grad_norm": 0.001908893813379109,
"kl": 0.0005713105201721191,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0013,
"reward": 0.07302698490093462,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1184732080437243,
"rewards/cosine_scaled_reward": -0.023679533042013645,
"rewards/format_reward": 0.4791666679084301,
"step": 442
},
{
"advantage_max": 0.15183980716392398,
"advantage_mean": -3.0267986173004857e-09,
"advantage_min": -0.1316932663321495,
"advantage_std": 0.11646312335506082,
"completion_length": 2931.5000534057617,
"epoch": 0.5062857142857143,
"grad_norm": 0.0020145962480455637,
"kl": 0.00047835707664489746,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0068,
"reward": 0.06958311138441786,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11646311962977052,
"rewards/cosine_scaled_reward": -0.002327965572476387,
"rewards/format_reward": 0.4166666753590107,
"step": 443
},
{
"advantage_max": 0.11418756144121289,
"advantage_mean": 3.88050980237864e-11,
"advantage_min": -0.09134344570338726,
"advantage_std": 0.08608054695650935,
"completion_length": 2848.3750076293945,
"epoch": 0.5074285714285715,
"grad_norm": 0.0012016425607725978,
"kl": 0.0005184710025787354,
"learning_rate": 1.351615817851748e-07,
"loss": -0.0004,
"reward": -0.03174476232379675,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08608055114746094,
"rewards/cosine_scaled_reward": -0.21938861906528473,
"rewards/format_reward": 0.25000000558793545,
"step": 444
},
{
"advantage_max": 0.1748210177756846,
"advantage_mean": -2.8715780586718864e-09,
"advantage_min": -0.14708452578634024,
"advantage_std": 0.12430504383519292,
"completion_length": 2911.7917098999023,
"epoch": 0.5085714285714286,
"grad_norm": 0.0020126677118241787,
"kl": 0.0006580352783203125,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0046,
"reward": 0.10539004136808217,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1243050447665155,
"rewards/cosine_scaled_reward": 0.07351269014179707,
"rewards/format_reward": 0.47916666977107525,
"step": 445
},
{
"advantage_max": 0.2272405456751585,
"advantage_mean": -3.1820188706177532e-09,
"advantage_min": -0.20155336987227201,
"advantage_std": 0.17020691372454166,
"completion_length": 2772.3750381469727,
"epoch": 0.5097142857142857,
"grad_norm": 0.0035501238889992237,
"kl": 0.0005533397197723389,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0091,
"reward": 0.06983472930733114,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1702069230377674,
"rewards/cosine_scaled_reward": -0.054740124847739935,
"rewards/format_reward": 0.5208333488553762,
"step": 446
},
{
"advantage_max": 0.1513710436411202,
"advantage_mean": -2.63874733175129e-09,
"advantage_min": -0.11578180687502027,
"advantage_std": 0.12062532501295209,
"completion_length": 1653.7708473205566,
"epoch": 0.5108571428571429,
"grad_norm": 0.002002675784751773,
"kl": 0.00044733285903930664,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0057,
"reward": 0.07071918109431863,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12062532640993595,
"rewards/cosine_scaled_reward": -0.19742719386704266,
"rewards/format_reward": 0.8125000055879354,
"step": 447
},
{
"advantage_max": 0.12252536416053772,
"advantage_mean": -9.002785267275826e-09,
"advantage_min": -0.112825533375144,
"advantage_std": 0.08908456144854426,
"completion_length": 1953.8333549499512,
"epoch": 0.512,
"grad_norm": 0.0019004471832886338,
"kl": 0.000409543514251709,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.005,
"reward": 0.09030471183359623,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08908456075005233,
"rewards/cosine_scaled_reward": -0.035713573917746544,
"rewards/format_reward": 0.6041666716337204,
"step": 448
},
{
"advantage_max": 0.0966623155400157,
"advantage_mean": -2.328305812038245e-10,
"advantage_min": -0.08400777820497751,
"advantage_std": 0.06731040589511395,
"completion_length": 2492.791675567627,
"epoch": 0.5131428571428571,
"grad_norm": 0.0013465734664350748,
"kl": 0.0005087852478027344,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.0014,
"reward": 0.014899131376296282,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.06731040822342038,
"rewards/cosine_scaled_reward": -0.16597000509500504,
"rewards/format_reward": 0.4166666679084301,
"step": 449
},
{
"advantage_max": 0.06399380508810282,
"advantage_mean": 2.250696351513426e-09,
"advantage_min": -0.08506503142416477,
"advantage_std": 0.057404838502407074,
"completion_length": 2388.562515258789,
"epoch": 0.5142857142857142,
"grad_norm": 0.0008139196434058249,
"kl": 0.00040439143776893616,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0015,
"reward": 0.07429015543311834,
"reward_advantage_correlation": 1.0,
"reward_std": 0.05740483803674579,
"rewards/cosine_scaled_reward": 0.00132070854306221,
"rewards/format_reward": 0.4375,
"step": 450
},
{
"advantage_max": 0.07913245167583227,
"advantage_mean": -3.8805106350459084e-09,
"advantage_min": -0.06130435457453132,
"advantage_std": 0.05127688334323466,
"completion_length": 2407.750030517578,
"epoch": 0.5154285714285715,
"grad_norm": 0.0008824392571114004,
"kl": 0.000555187463760376,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0027,
"reward": 0.0942617341352161,
"reward_advantage_correlation": 1.0,
"reward_std": 0.051276884973049164,
"rewards/cosine_scaled_reward": 0.016264691948890686,
"rewards/format_reward": 0.5208333376795053,
"step": 451
},
{
"advantage_max": 0.20604290487244725,
"advantage_mean": -5.122274437940888e-09,
"advantage_min": -0.16741247940808535,
"advantage_std": 0.14271592535078526,
"completion_length": 2762.500045776367,
"epoch": 0.5165714285714286,
"grad_norm": 0.002616358455270529,
"kl": 0.0005500912666320801,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0009,
"reward": 0.07724724570289254,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1427159309387207,
"rewards/cosine_scaled_reward": -0.0017735953442752361,
"rewards/format_reward": 0.4583333469927311,
"step": 452
},
{
"advantage_max": 0.18616215698421001,
"advantage_mean": -7.761021464380846e-09,
"advantage_min": -0.1861566216684878,
"advantage_std": 0.14951407350599766,
"completion_length": 2588.4792098999023,
"epoch": 0.5177142857142857,
"grad_norm": 0.002354600466787815,
"kl": 0.0005657672882080078,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0084,
"reward": 0.10979624767787755,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14951407723128796,
"rewards/cosine_scaled_reward": 0.07057444495148957,
"rewards/format_reward": 0.5000000093132257,
"step": 453
},
{
"advantage_max": 0.08877705689519644,
"advantage_mean": 1.0865430083439875e-09,
"advantage_min": -0.11890319734811783,
"advantage_std": 0.0781020374270156,
"completion_length": 2190.5416870117188,
"epoch": 0.5188571428571429,
"grad_norm": 0.0014429461443796754,
"kl": 0.0005607306957244873,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.0021,
"reward": 0.135482975281775,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.07810203928966075,
"rewards/cosine_scaled_reward": 0.047869518399238586,
"rewards/format_reward": 0.7083333432674408,
"step": 454
},
{
"advantage_max": 0.14416319783776999,
"advantage_mean": -7.761021270091817e-10,
"advantage_min": -0.10820258548483253,
"advantage_std": 0.10255820630118251,
"completion_length": 2773.6042098999023,
"epoch": 0.52,
"grad_norm": 0.002726923208683729,
"kl": 0.0006432235240936279,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0081,
"reward": 0.00850020069628954,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10255820862948895,
"rewards/cosine_scaled_reward": -0.19340587593615055,
"rewards/format_reward": 0.4375000037252903,
"step": 455
},
{
"advantage_max": 0.17035988252609968,
"advantage_mean": 5.044663679842909e-10,
"advantage_min": -0.18033531680703163,
"advantage_std": 0.1579930440057069,
"completion_length": 2987.7292289733887,
"epoch": 0.5211428571428571,
"grad_norm": 0.0037276751827448606,
"kl": 0.0005083084106445312,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0115,
"reward": 0.06464625336229801,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.157993049826473,
"rewards/cosine_scaled_reward": -0.017057785764336586,
"rewards/format_reward": 0.4166666753590107,
"step": 456
},
{
"advantage_max": 0.1135766888037324,
"advantage_mean": -2.6387474427735924e-09,
"advantage_min": -0.08013106137514114,
"advantage_std": 0.08444676687940955,
"completion_length": 3021.229217529297,
"epoch": 0.5222857142857142,
"grad_norm": 0.0019406946375966072,
"kl": 0.0007562637329101562,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0015,
"reward": 0.05151152703911066,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08444677060469985,
"rewards/cosine_scaled_reward": -0.013899954035878181,
"rewards/format_reward": 0.3333333358168602,
"step": 457
},
{
"advantage_max": 0.18146846443414688,
"advantage_mean": -3.259629080543114e-09,
"advantage_min": -0.16966586094349623,
"advantage_std": 0.14272961462847888,
"completion_length": 1946.000015258789,
"epoch": 0.5234285714285715,
"grad_norm": 0.0026240902952849865,
"kl": 0.0003987550735473633,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0066,
"reward": 0.15159178618341684,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14272962091490626,
"rewards/cosine_scaled_reward": 0.11331283859908581,
"rewards/format_reward": 0.666666679084301,
"step": 458
},
{
"advantage_max": 0.10034113470464945,
"advantage_mean": -6.6744786780814636e-09,
"advantage_min": -0.12437078403308988,
"advantage_std": 0.09397356864064932,
"completion_length": 1233.9167022705078,
"epoch": 0.5245714285714286,
"grad_norm": 0.0009185225935652852,
"kl": 0.00022205710411071777,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.0011,
"reward": 0.1982046803459525,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0939735691063106,
"rewards/cosine_scaled_reward": 0.16758823953568935,
"rewards/format_reward": 0.8333333358168602,
"step": 459
},
{
"advantage_max": 0.2020870796404779,
"advantage_mean": -2.793967834868738e-09,
"advantage_min": -0.2301188837736845,
"advantage_std": 0.176668681204319,
"completion_length": 2900.1042098999023,
"epoch": 0.5257142857142857,
"grad_norm": 0.0030042824801057577,
"kl": 0.0005886554718017578,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0058,
"reward": 0.17208986543118954,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17666867841035128,
"rewards/cosine_scaled_reward": 0.196685079485178,
"rewards/format_reward": 0.6250000093132257,
"step": 460
},
{
"advantage_max": 0.22937612980604172,
"advantage_mean": -1.396984056212247e-09,
"advantage_min": -0.20125835668295622,
"advantage_std": 0.1885735362302512,
"completion_length": 3077.375045776367,
"epoch": 0.5268571428571428,
"grad_norm": 0.003480825573205948,
"kl": 0.000635288655757904,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0097,
"reward": 0.08316627237945795,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.18857353436760604,
"rewards/cosine_scaled_reward": 0.027362531051039696,
"rewards/format_reward": 0.4375000037252903,
"step": 461
},
{
"advantage_max": 0.09577207872644067,
"advantage_mean": 5.5879356419819e-09,
"advantage_min": -0.10161272855475545,
"advantage_std": 0.08275535795837641,
"completion_length": 2929.8541870117188,
"epoch": 0.528,
"grad_norm": 0.0019017203012481332,
"kl": 0.0005735903978347778,
"learning_rate": 1.1657684494105386e-07,
"loss": -0.001,
"reward": -0.005534024443477392,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08275536308065057,
"rewards/cosine_scaled_reward": -0.1532220784574747,
"rewards/format_reward": 0.27083333395421505,
"step": 462
},
{
"advantage_max": 0.13797848299145699,
"advantage_mean": 6.984919517782906e-10,
"advantage_min": -0.14528980944305658,
"advantage_std": 0.11408946011215448,
"completion_length": 2641.625045776367,
"epoch": 0.5291428571428571,
"grad_norm": 0.0021766142453998327,
"kl": 0.0004150867462158203,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0123,
"reward": 0.11324432399123907,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11408946430310607,
"rewards/cosine_scaled_reward": 0.09595046006143093,
"rewards/format_reward": 0.4791666753590107,
"step": 463
},
{
"advantage_max": 0.06949485652148724,
"advantage_mean": -2.1730859889323995e-09,
"advantage_min": -0.0649899085983634,
"advantage_std": 0.05675833718851209,
"completion_length": 1714.729190826416,
"epoch": 0.5302857142857142,
"grad_norm": 0.0009724340634420514,
"kl": 0.00033186376094818115,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.0017,
"reward": 0.1716830674558878,
"reward_advantage_correlation": 1.0,
"reward_std": 0.05675833881832659,
"rewards/cosine_scaled_reward": 0.15365481562912464,
"rewards/format_reward": 0.7083333358168602,
"step": 464
},
{
"advantage_max": 0.18007674161344767,
"advantage_mean": -4.501392578126762e-09,
"advantage_min": -0.18894312204793096,
"advantage_std": 0.1457268726080656,
"completion_length": 2496.875068664551,
"epoch": 0.5314285714285715,
"grad_norm": 0.0030991239473223686,
"kl": 0.000622868537902832,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.006,
"reward": 0.16800264199264348,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1457268742378801,
"rewards/cosine_scaled_reward": 0.12976265419274569,
"rewards/format_reward": 0.7291666809469461,
"step": 465
},
{
"advantage_max": 0.1637826063670218,
"advantage_mean": -6.053597040311942e-09,
"advantage_min": -0.17003454267978668,
"advantage_std": 0.13364151399582624,
"completion_length": 3074.7916870117188,
"epoch": 0.5325714285714286,
"grad_norm": 0.003030646126717329,
"kl": 0.0006162524223327637,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0044,
"reward": 0.0841610130155459,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.13364151632413268,
"rewards/cosine_scaled_reward": 0.05105864675715566,
"rewards/format_reward": 0.3958333395421505,
"step": 466
},
{
"advantage_max": 0.1294914805330336,
"advantage_mean": -5.820766091346741e-10,
"advantage_min": -0.11973803536966443,
"advantage_std": 0.0973717300221324,
"completion_length": 2759.3333854675293,
"epoch": 0.5337142857142857,
"grad_norm": 0.0016337429406121373,
"kl": 0.0006120204925537109,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0054,
"reward": 0.021781093149911612,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09737173025496304,
"rewards/cosine_scaled_reward": -0.1436814023181796,
"rewards/format_reward": 0.41666667349636555,
"step": 467
},
{
"advantage_max": 0.19513892801478505,
"advantage_mean": -1.6298145125159813e-09,
"advantage_min": -0.17503061518073082,
"advantage_std": 0.15135146118700504,
"completion_length": 2915.2708740234375,
"epoch": 0.5348571428571428,
"grad_norm": 0.0029594493098556995,
"kl": 0.0007366985082626343,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0098,
"reward": 0.039023627527058125,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15135146211832762,
"rewards/cosine_scaled_reward": -0.07234426774084568,
"rewards/format_reward": 0.3750000074505806,
"step": 468
},
{
"advantage_max": 0.1406808183528483,
"advantage_mean": 3.1820189122511167e-09,
"advantage_min": -0.13096178881824017,
"advantage_std": 0.10715825203806162,
"completion_length": 2790.729190826416,
"epoch": 0.536,
"grad_norm": 0.0020550782792270184,
"kl": 0.0006768703460693359,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0022,
"reward": 0.005044038873165846,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10715826088562608,
"rewards/cosine_scaled_reward": -0.1510836249217391,
"rewards/format_reward": 0.3333333358168602,
"step": 469
},
{
"advantage_max": 0.15261405799537897,
"advantage_mean": -1.144750705686648e-09,
"advantage_min": -0.1515314057469368,
"advantage_std": 0.12770982459187508,
"completion_length": 2569.3125381469727,
"epoch": 0.5371428571428571,
"grad_norm": 0.003103189170360565,
"kl": 0.0005183219909667969,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.0035,
"reward": 0.04346779244951904,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12770982831716537,
"rewards/cosine_scaled_reward": -0.14374101161956787,
"rewards/format_reward": 0.541666679084301,
"step": 470
},
{
"advantage_max": 0.09641966479830444,
"advantage_mean": -6.208817557307178e-10,
"advantage_min": -0.13236830849200487,
"advantage_std": 0.09493508515879512,
"completion_length": 2782.6458435058594,
"epoch": 0.5382857142857143,
"grad_norm": 0.0012270222650840878,
"kl": 0.0005309581756591797,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0019,
"reward": 0.004960605408996344,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09493508981540799,
"rewards/cosine_scaled_reward": -0.1521736173890531,
"rewards/format_reward": 0.3333333432674408,
"step": 471
},
{
"advantage_max": 0.17413780465722084,
"advantage_mean": -3.88050980237864e-10,
"advantage_min": -0.1484542451798916,
"advantage_std": 0.12915962655097246,
"completion_length": 2722.8750228881836,
"epoch": 0.5394285714285715,
"grad_norm": 0.0021392928902059793,
"kl": 0.0006825923919677734,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0033,
"reward": 0.07151374779641628,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12915962236002088,
"rewards/cosine_scaled_reward": -0.02726024203002453,
"rewards/format_reward": 0.4791666679084301,
"step": 472
},
{
"advantage_max": 0.145589595194906,
"advantage_mean": 4.850638404829688e-09,
"advantage_min": -0.12249492853879929,
"advantage_std": 0.11026488617062569,
"completion_length": 2914.354217529297,
"epoch": 0.5405714285714286,
"grad_norm": 0.0016195240896195173,
"kl": 0.0005763769149780273,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0003,
"reward": 0.09501276165246964,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11026489362120628,
"rewards/cosine_scaled_reward": 0.08362941443920135,
"rewards/format_reward": 0.39583333395421505,
"step": 473
},
{
"advantage_max": 0.15750652737915516,
"advantage_mean": -5.743156061832622e-09,
"advantage_min": -0.09204499330371618,
"advantage_std": 0.09810823854058981,
"completion_length": 2134.6458625793457,
"epoch": 0.5417142857142857,
"grad_norm": 0.0021665513049811125,
"kl": 0.0005587935447692871,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0067,
"reward": 0.19979873031843454,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09810824086889625,
"rewards/cosine_scaled_reward": 0.29946115519851446,
"rewards/format_reward": 0.5833333358168602,
"step": 474
},
{
"advantage_max": 0.16019965521991253,
"advantage_mean": -9.002785017475645e-09,
"advantage_min": -0.24789944384247065,
"advantage_std": 0.1666221539489925,
"completion_length": 2112.8542098999023,
"epoch": 0.5428571428571428,
"grad_norm": 0.002450009109452367,
"kl": 0.0004245880991220474,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0018,
"reward": 0.2531615113839507,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1666221613995731,
"rewards/cosine_scaled_reward": 0.35053614526987076,
"rewards/format_reward": 0.791666679084301,
"step": 475
},
{
"advantage_max": 0.1794994603842497,
"advantage_mean": 2.638747262362351e-09,
"advantage_min": -0.2670667041093111,
"advantage_std": 0.17754664039239287,
"completion_length": 2330.375045776367,
"epoch": 0.544,
"grad_norm": 0.0032068255823105574,
"kl": 0.0006053447723388672,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0134,
"reward": 0.21058788988739252,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1775466427206993,
"rewards/cosine_scaled_reward": 0.27757735550403595,
"rewards/format_reward": 0.6875000260770321,
"step": 476
},
{
"advantage_max": 0.1589113175868988,
"advantage_mean": -4.811833376194841e-09,
"advantage_min": -0.15475624846294522,
"advantage_std": 0.12695908243767917,
"completion_length": 1449.8542022705078,
"epoch": 0.5451428571428572,
"grad_norm": 0.001828887383453548,
"kl": 0.0002796947956085205,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0029,
"reward": 0.24288752442225814,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12695908942259848,
"rewards/cosine_scaled_reward": 0.26963918656110764,
"rewards/format_reward": 0.8958333395421505,
"step": 477
},
{
"advantage_max": 0.11892331298440695,
"advantage_mean": -8.381903254806033e-09,
"advantage_min": -0.15956566762179136,
"advantage_std": 0.1133618257008493,
"completion_length": 2689.5416946411133,
"epoch": 0.5462857142857143,
"grad_norm": 0.0018854563822969794,
"kl": 0.0004665255546569824,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0045,
"reward": 0.13063816633075476,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11336182476952672,
"rewards/cosine_scaled_reward": 0.11580769601278007,
"rewards/format_reward": 0.541666679084301,
"step": 478
},
{
"advantage_max": 0.18029004149138927,
"advantage_mean": -4.03573130469681e-09,
"advantage_min": -0.17780630104243755,
"advantage_std": 0.13413295801728964,
"completion_length": 2802.062530517578,
"epoch": 0.5474285714285714,
"grad_norm": 0.0028555947355926037,
"kl": 0.0007611513137817383,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0021,
"reward": 0.06550013413652778,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1341329631395638,
"rewards/cosine_scaled_reward": -0.07874849392101169,
"rewards/format_reward": 0.541666679084301,
"step": 479
},
{
"advantage_max": 0.22729169484227896,
"advantage_mean": -2.5999422337275035e-09,
"advantage_min": -0.15585079044103622,
"advantage_std": 0.1480179699137807,
"completion_length": 2250.6667404174805,
"epoch": 0.5485714285714286,
"grad_norm": 0.002713319845497608,
"kl": 0.0004476308822631836,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0066,
"reward": 0.08439068030565977,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14801797457039356,
"rewards/cosine_scaled_reward": -0.05194460041821003,
"rewards/format_reward": 0.604166679084301,
"step": 480
},
{
"advantage_max": 0.12421236839145422,
"advantage_mean": -8.304293031002885e-09,
"advantage_min": -0.1307937242090702,
"advantage_std": 0.0952663142234087,
"completion_length": 2587.9375915527344,
"epoch": 0.5497142857142857,
"grad_norm": 0.0014279150636866689,
"kl": 0.0005082488059997559,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0048,
"reward": 0.07169135846197605,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09526631888002157,
"rewards/cosine_scaled_reward": -0.12012962490553036,
"rewards/format_reward": 0.6666666697710752,
"step": 481
},
{
"advantage_max": 0.18846415961161256,
"advantage_mean": -7.99385220517923e-09,
"advantage_min": -0.1828137980774045,
"advantage_std": 0.15363801596686244,
"completion_length": 2704.9375534057617,
"epoch": 0.5508571428571428,
"grad_norm": 0.002800821093842387,
"kl": 0.0006156265735626221,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0005,
"reward": 0.1636654119938612,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.15363801969215274,
"rewards/cosine_scaled_reward": 0.21197660156758502,
"rewards/format_reward": 0.5416666679084301,
"step": 482
},
{
"advantage_max": 0.19416179601103067,
"advantage_mean": -1.5522043372850902e-10,
"advantage_min": -0.1611415701918304,
"advantage_std": 0.12938955938443542,
"completion_length": 2665.7083740234375,
"epoch": 0.552,
"grad_norm": 0.0027008799370378256,
"kl": 0.0005914568901062012,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0043,
"reward": 0.07180615421384573,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1293895640410483,
"rewards/cosine_scaled_reward": -0.11217466462403536,
"rewards/format_reward": 0.6458333414047956,
"step": 483
},
{
"advantage_max": 0.15204123593866825,
"advantage_mean": -2.2506964417190467e-09,
"advantage_min": -0.14813785336446017,
"advantage_std": 0.11534715490415692,
"completion_length": 2195.687515258789,
"epoch": 0.5531428571428572,
"grad_norm": 0.0017237699357792735,
"kl": 0.00040522217750549316,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0034,
"reward": 0.08024531602859497,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11534715909510851,
"rewards/cosine_scaled_reward": -0.03545547462999821,
"rewards/format_reward": 0.5416666679084301,
"step": 484
},
{
"advantage_max": 0.11956683732569218,
"advantage_mean": -1.164153232147136e-09,
"advantage_min": -0.08647039532661438,
"advantage_std": 0.07989799580536783,
"completion_length": 1736.5625457763672,
"epoch": 0.5542857142857143,
"grad_norm": 0.0011961512500420213,
"kl": 0.00036829710006713867,
"learning_rate": 1.0280443637773163e-07,
"loss": -0.0012,
"reward": 0.1085349339991808,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.07989799627102911,
"rewards/cosine_scaled_reward": -0.05393883492797613,
"rewards/format_reward": 0.75,
"step": 485
},
{
"advantage_max": 0.1754322131164372,
"advantage_mean": 3.4924587180573674e-10,
"advantage_min": -0.13765499275177717,
"advantage_std": 0.11859225039370358,
"completion_length": 1778.7083854675293,
"epoch": 0.5554285714285714,
"grad_norm": 0.0013995830668136477,
"kl": 0.00038488954305648804,
"learning_rate": 1.0246514708427701e-07,
"loss": -0.0012,
"reward": 0.05783613526728004,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11859225668013096,
"rewards/cosine_scaled_reward": -0.14235917665064335,
"rewards/format_reward": 0.6250000111758709,
"step": 486
},
{
"advantage_max": 0.1776252081617713,
"advantage_mean": -6.053596859900701e-09,
"advantage_min": -0.25969033129513264,
"advantage_std": 0.16778704058378935,
"completion_length": 1765.500020980835,
"epoch": 0.5565714285714286,
"grad_norm": 0.0018098040018230677,
"kl": 0.00029283761978149414,
"learning_rate": 1.0214767000817596e-07,
"loss": -0.0038,
"reward": 0.2465712195262313,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1677870382554829,
"rewards/cosine_scaled_reward": 0.3729119673371315,
"rewards/format_reward": 0.7083333488553762,
"step": 487
},
{
"advantage_max": 0.1596626602113247,
"advantage_mean": -3.065603493279667e-09,
"advantage_min": -0.1261517507955432,
"advantage_std": 0.10886070877313614,
"completion_length": 2018.1458587646484,
"epoch": 0.5577142857142857,
"grad_norm": 0.001423255424015224,
"kl": 0.00038205087184906006,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.0011,
"reward": 0.05636691814288497,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10886071575805545,
"rewards/cosine_scaled_reward": -0.1461393255740404,
"rewards/format_reward": 0.6250000055879354,
"step": 488
},
{
"advantage_max": 0.13344533974304795,
"advantage_mean": 3.880509941156518e-10,
"advantage_min": -0.12539346516132355,
"advantage_std": 0.10917034232988954,
"completion_length": 3137.8333587646484,
"epoch": 0.5588571428571428,
"grad_norm": 0.0024442316498607397,
"kl": 0.0006935596466064453,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.0037,
"reward": -0.013123379554599524,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.10917035210877657,
"rewards/cosine_scaled_reward": -0.1439531659707427,
"rewards/format_reward": 0.20833334140479565,
"step": 489
},
{
"advantage_max": 0.12650098372250795,
"advantage_mean": 9.216203494810671e-11,
"advantage_min": -0.14339723202283494,
"advantage_std": 0.11344200430903584,
"completion_length": 2136.083354949951,
"epoch": 0.56,
"grad_norm": 0.0020219513680785894,
"kl": 0.0005998890846967697,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0018,
"reward": 0.10040172806475312,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11344200803432614,
"rewards/cosine_scaled_reward": -0.045523665845394135,
"rewards/format_reward": 0.6875000111758709,
"step": 490
},
{
"advantage_max": 0.1827850081026554,
"advantage_mean": -3.104408563547878e-09,
"advantage_min": -0.20528748910874128,
"advantage_std": 0.15947058238089085,
"completion_length": 2341.541717529297,
"epoch": 0.5611428571428572,
"grad_norm": 0.003260035300627351,
"kl": 0.0004696100950241089,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0126,
"reward": 0.15225723420735449,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15947059262543917,
"rewards/cosine_scaled_reward": 0.11595133878290653,
"rewards/format_reward": 0.666666679084301,
"step": 491
},
{
"advantage_max": 0.13405136327492073,
"advantage_mean": 1.5425030610444201e-09,
"advantage_min": -0.11482641356997192,
"advantage_std": 0.10828206856967881,
"completion_length": 2551.937511444092,
"epoch": 0.5622857142857143,
"grad_norm": 0.0028170021250844,
"kl": 0.00046622753143310547,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.006,
"reward": 0.06306978134671226,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10828206315636635,
"rewards/cosine_scaled_reward": -0.03327612020075321,
"rewards/format_reward": 0.4375000037252903,
"step": 492
},
{
"advantage_max": 0.1396464416757226,
"advantage_mean": -2.328306464294272e-09,
"advantage_min": -0.15597060602158308,
"advantage_std": 0.12428847094997764,
"completion_length": 1993.7708892822266,
"epoch": 0.5634285714285714,
"grad_norm": 0.0031442081090062857,
"kl": 0.00047659873962402344,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.0204,
"reward": 0.13567497371695936,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.12428847677074373,
"rewards/cosine_scaled_reward": 0.05777975544333458,
"rewards/format_reward": 0.6875000037252903,
"step": 493
},
{
"advantage_max": 0.18611273635178804,
"advantage_mean": -1.1175870923141318e-08,
"advantage_min": -0.20461444603279233,
"advantage_std": 0.16241960739716887,
"completion_length": 1439.104190826416,
"epoch": 0.5645714285714286,
"grad_norm": 0.002649143571034074,
"kl": 0.000398978590965271,
"learning_rate": 1.005372381963547e-07,
"loss": -0.0075,
"reward": 0.24482827726751566,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16241961810737848,
"rewards/cosine_scaled_reward": 0.22105430043302476,
"rewards/format_reward": 1.0,
"step": 494
},
{
"advantage_max": 0.14939172100275755,
"advantage_mean": -5.8983763567832526e-09,
"advantage_min": -0.13820822536945343,
"advantage_std": 0.11426809709519148,
"completion_length": 2485.166717529297,
"epoch": 0.5657142857142857,
"grad_norm": 0.0017962036654353142,
"kl": 0.00054946169257164,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0039,
"reward": 0.14726564195007086,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11426810221746564,
"rewards/cosine_scaled_reward": 0.16136360727250576,
"rewards/format_reward": 0.5416666679084301,
"step": 495
},
{
"advantage_max": 0.15256773307919502,
"advantage_mean": 1.0089328539297782e-09,
"advantage_min": -0.13196661323308945,
"advantage_std": 0.11284308601170778,
"completion_length": 1811.2916793823242,
"epoch": 0.5668571428571428,
"grad_norm": 0.0022594898473471403,
"kl": 0.00036126933991909027,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0059,
"reward": 0.20605642755981535,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11284308694303036,
"rewards/cosine_scaled_reward": 0.2769278697669506,
"rewards/format_reward": 0.6666666716337204,
"step": 496
},
{
"advantage_max": 0.17324247024953365,
"advantage_mean": -1.6298146443549655e-09,
"advantage_min": -0.2051475211046636,
"advantage_std": 0.16091588605195284,
"completion_length": 2181.6042137145996,
"epoch": 0.568,
"grad_norm": 0.0028121285140514374,
"kl": 0.00037629157304763794,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0112,
"reward": 0.18671931326389313,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1609158907085657,
"rewards/cosine_scaled_reward": 0.22045234218239784,
"rewards/format_reward": 0.6666666679084301,
"step": 497
},
{
"advantage_max": 0.1668732976540923,
"advantage_mean": -1.3193736866923267e-09,
"advantage_min": -0.1562135349959135,
"advantage_std": 0.12399712949991226,
"completion_length": 2174.375015258789,
"epoch": 0.5691428571428572,
"grad_norm": 0.0032206247560679913,
"kl": 0.0004447326064109802,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0153,
"reward": 0.09758738335222006,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1239971297327429,
"rewards/cosine_scaled_reward": -0.015365742146968842,
"rewards/format_reward": 0.6041666753590107,
"step": 498
},
{
"advantage_max": 0.18602585699409246,
"advantage_mean": -4.811833487217143e-09,
"advantage_min": -0.23508271854370832,
"advantage_std": 0.17544544488191605,
"completion_length": 2213.7292098999023,
"epoch": 0.5702857142857143,
"grad_norm": 0.0020961996633559465,
"kl": 0.0003085378557443619,
"learning_rate": 1.000438641958131e-07,
"loss": 0.007,
"reward": 0.1805350393988192,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17544544488191605,
"rewards/cosine_scaled_reward": 0.1705985008738935,
"rewards/format_reward": 0.7291666753590107,
"step": 499
},
{
"advantage_max": 0.17333817295730114,
"advantage_mean": -8.925174717344664e-09,
"advantage_min": -0.2120303800329566,
"advantage_std": 0.1599614191800356,
"completion_length": 2608.6458892822266,
"epoch": 0.5714285714285714,
"grad_norm": 0.003356323577463627,
"kl": 0.0006328821182250977,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.0124,
"reward": 0.13607118383515626,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15996142383664846,
"rewards/cosine_scaled_reward": 0.14388170279562473,
"rewards/format_reward": 0.5208333507180214,
"step": 500
},
{
"epoch": 0.5714285714285714,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.004779140234080842,
"train_runtime": 145838.7741,
"train_samples_per_second": 0.165,
"train_steps_per_second": 0.003
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}