DRA-DR_GRPO / trainer_state.json
kangdawei's picture
Model save
6ece1d2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5714285714285714,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"advantage_max": 0.1714239763095975,
"advantage_mean": 2.7163576388211652e-09,
"advantage_min": -0.18542360328137875,
"advantage_std": 0.14101680787280202,
"completion_length": 2571.2083587646484,
"epoch": 0.001142857142857143,
"grad_norm": 0.025767112150788307,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.0137,
"reward": 0.08349451050162315,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14101681299507618,
"rewards/cosine_scaled_reward": -0.015534311532974243,
"rewards/format_reward": 0.5208333488553762,
"step": 1
},
{
"advantage_max": 0.07242919644340873,
"advantage_mean": 1.8626451769865326e-09,
"advantage_min": -0.09870566707104445,
"advantage_std": 0.071280462667346,
"completion_length": 2804.395881652832,
"epoch": 0.002285714285714286,
"grad_norm": 0.010948998853564262,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": 0.0044,
"reward": 0.04647743375971913,
"reward_advantage_correlation": 1.0,
"reward_std": 0.071280462667346,
"rewards/cosine_scaled_reward": -0.04980122856795788,
"rewards/format_reward": 0.37500000558793545,
"step": 2
},
{
"advantage_max": 0.10077127907425165,
"advantage_mean": 3.880513965714982e-11,
"advantage_min": -0.07837366871535778,
"advantage_std": 0.07264299970120192,
"completion_length": 3330.7291870117188,
"epoch": 0.0034285714285714284,
"grad_norm": 0.011240585707128048,
"kl": 4.692375659942627e-05,
"learning_rate": 6e-08,
"loss": -0.0004,
"reward": -0.05792155209928751,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07264300249516964,
"rewards/cosine_scaled_reward": -0.24313471233472228,
"rewards/format_reward": 0.14583333395421505,
"step": 3
},
{
"advantage_max": 0.1539376201108098,
"advantage_mean": -1.396983917434369e-09,
"advantage_min": -0.13278733659535646,
"advantage_std": 0.11549357417970896,
"completion_length": 2221.6875228881836,
"epoch": 0.004571428571428572,
"grad_norm": 0.021713746711611748,
"kl": 4.139542579650879e-05,
"learning_rate": 8e-08,
"loss": -0.0008,
"reward": 0.07605884410440922,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11549357557669282,
"rewards/cosine_scaled_reward": -0.10020758584141731,
"rewards/format_reward": 0.6458333358168602,
"step": 4
},
{
"advantage_max": 0.18950440920889378,
"advantage_mean": 2.561137149581505e-09,
"advantage_min": -0.1065681865438819,
"advantage_std": 0.1075586169026792,
"completion_length": 3417.7291870117188,
"epoch": 0.005714285714285714,
"grad_norm": 0.01573144644498825,
"kl": 4.338473081588745e-05,
"learning_rate": 1e-07,
"loss": 0.0028,
"reward": -0.03302042291034013,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10755862621590495,
"rewards/cosine_scaled_reward": -0.20160695351660252,
"rewards/format_reward": 0.2083333395421505,
"step": 5
},
{
"advantage_max": 0.11689717648550868,
"advantage_mean": -1.7850348837944452e-09,
"advantage_min": -0.08994922507554293,
"advantage_std": 0.08724205708131194,
"completion_length": 2931.1458892822266,
"epoch": 0.006857142857142857,
"grad_norm": 0.021110277622938156,
"kl": 3.407290205359459e-05,
"learning_rate": 1.2e-07,
"loss": 0.0099,
"reward": -0.009482193738222122,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08724205940961838,
"rewards/cosine_scaled_reward": -0.204607228981331,
"rewards/format_reward": 0.35416667349636555,
"step": 6
},
{
"advantage_max": 0.11794257629662752,
"advantage_mean": -3.1820189400066923e-09,
"advantage_min": -0.1437535872682929,
"advantage_std": 0.10767027572728693,
"completion_length": 3049.3959045410156,
"epoch": 0.008,
"grad_norm": 0.020024148747324944,
"kl": 2.3268163204193115e-05,
"learning_rate": 1.4e-07,
"loss": 0.0037,
"reward": 0.11388289113529027,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10767027852125466,
"rewards/cosine_scaled_reward": 0.003962432965636253,
"rewards/format_reward": 0.6666666828095913,
"step": 7
},
{
"advantage_max": 0.20535230357199907,
"advantage_mean": -1.7074246877468724e-09,
"advantage_min": -0.16674288269132376,
"advantage_std": 0.14301629923284054,
"completion_length": 2752.0625,
"epoch": 0.009142857142857144,
"grad_norm": 0.025196732953190804,
"kl": 2.1200627088546753e-05,
"learning_rate": 1.6e-07,
"loss": 0.0065,
"reward": 0.1058007568353787,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14301631227135658,
"rewards/cosine_scaled_reward": 0.06942666228860617,
"rewards/format_reward": 0.4791666753590107,
"step": 8
},
{
"advantage_max": 0.17053810507059097,
"advantage_mean": -9.701276934559466e-10,
"advantage_min": -0.12266492750495672,
"advantage_std": 0.12067469954490662,
"completion_length": 3259.8125610351562,
"epoch": 0.010285714285714285,
"grad_norm": 0.02746347151696682,
"kl": 4.096329212188721e-05,
"learning_rate": 1.8e-07,
"loss": 0.0086,
"reward": 0.006678506499156356,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.12067470327019691,
"rewards/cosine_scaled_reward": -0.11600704118609428,
"rewards/format_reward": 0.2708333395421505,
"step": 9
},
{
"advantage_max": 0.1689818874001503,
"advantage_mean": -3.2596291221764773e-09,
"advantage_min": -0.1334312935359776,
"advantage_std": 0.11464598076418042,
"completion_length": 2768.8542098999023,
"epoch": 0.011428571428571429,
"grad_norm": 0.019988562911748886,
"kl": 3.0182301998138428e-05,
"learning_rate": 2e-07,
"loss": 0.0019,
"reward": 0.03028559315134771,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11464598076418042,
"rewards/cosine_scaled_reward": -0.10943621303886175,
"rewards/format_reward": 0.3958333358168602,
"step": 10
},
{
"advantage_max": 0.07887471001595259,
"advantage_mean": 1.24176348370586e-09,
"advantage_min": -0.07153345271945,
"advantage_std": 0.06311596930027008,
"completion_length": 3333.9166717529297,
"epoch": 0.012571428571428572,
"grad_norm": 0.009001471102237701,
"kl": 3.399699926376343e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0012,
"reward": -0.07426449563354254,
"reward_advantage_correlation": 1.0,
"reward_std": 0.06311597069725394,
"rewards/cosine_scaled_reward": -0.27152102813124657,
"rewards/format_reward": 0.1041666716337204,
"step": 11
},
{
"advantage_max": 0.1358098853379488,
"advantage_mean": 1.2417634559502844e-09,
"advantage_min": -0.145633140578866,
"advantage_std": 0.11910986108705401,
"completion_length": 2601.395896911621,
"epoch": 0.013714285714285714,
"grad_norm": 0.016579382121562958,
"kl": 3.965198993682861e-05,
"learning_rate": 2.4e-07,
"loss": 0.0048,
"reward": 0.05325879342854023,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1191098652780056,
"rewards/cosine_scaled_reward": -0.14510760456323624,
"rewards/format_reward": 0.6041666753590107,
"step": 12
},
{
"advantage_max": 0.11820426164194942,
"advantage_mean": -4.656612873077393e-10,
"advantage_min": -0.10757657652720809,
"advantage_std": 0.09022333845496178,
"completion_length": 2989.5208740234375,
"epoch": 0.014857142857142857,
"grad_norm": 0.02005729451775551,
"kl": 3.2998621463775635e-05,
"learning_rate": 2.6e-07,
"loss": 0.008,
"reward": 0.0009057910647243261,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09022334171459079,
"rewards/cosine_scaled_reward": -0.1317348200827837,
"rewards/format_reward": 0.2708333358168602,
"step": 13
},
{
"advantage_max": 0.14062324352562428,
"advantage_mean": 3.1044090909038147e-10,
"advantage_min": -0.16110873501747847,
"advantage_std": 0.11608104594051838,
"completion_length": 2749.0208587646484,
"epoch": 0.016,
"grad_norm": 0.017504651099443436,
"kl": 2.360716462135315e-05,
"learning_rate": 2.8e-07,
"loss": 0.0057,
"reward": 0.05834482208592817,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11608104268088937,
"rewards/cosine_scaled_reward": -0.06565599981695414,
"rewards/format_reward": 0.4791666753590107,
"step": 14
},
{
"advantage_max": 0.13011277560144663,
"advantage_mean": -1.940255407728575e-09,
"advantage_min": -0.10297658666968346,
"advantage_std": 0.0875613666139543,
"completion_length": 2769.208366394043,
"epoch": 0.017142857142857144,
"grad_norm": 0.010686655528843403,
"kl": 1.9287224858999252e-05,
"learning_rate": 3e-07,
"loss": 0.0016,
"reward": 0.061318085878156126,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08756137173622847,
"rewards/cosine_scaled_reward": -0.007162087596952915,
"rewards/format_reward": 0.3750000037252903,
"step": 15
},
{
"advantage_max": 0.13473028596490622,
"advantage_mean": 4.1133415354388525e-09,
"advantage_min": -0.077317263931036,
"advantage_std": 0.08469738904386759,
"completion_length": 3583.0833435058594,
"epoch": 0.018285714285714287,
"grad_norm": 0.0169773381203413,
"kl": 3.8251280784606934e-05,
"learning_rate": 3.2e-07,
"loss": 0.0001,
"reward": -0.07893023523502052,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08469739044085145,
"rewards/cosine_scaled_reward": -0.24335206672549248,
"rewards/format_reward": 0.02083333395421505,
"step": 16
},
{
"advantage_max": 0.13479452300816774,
"advantage_mean": -3.065603576546394e-09,
"advantage_min": -0.15165873477235436,
"advantage_std": 0.12228226847946644,
"completion_length": 2357.625026702881,
"epoch": 0.019428571428571427,
"grad_norm": 0.01721261627972126,
"kl": 4.204362630844116e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": -0.0014,
"reward": 0.10265706898644567,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12228227453306317,
"rewards/cosine_scaled_reward": 0.030887765809893608,
"rewards/format_reward": 0.5416666734963655,
"step": 17
},
{
"advantage_max": 0.12159187206998467,
"advantage_mean": -3.8805105656569694e-10,
"advantage_min": -0.12701823841780424,
"advantage_std": 0.09618540527299047,
"completion_length": 2885.3125,
"epoch": 0.02057142857142857,
"grad_norm": 0.02120651677250862,
"kl": 3.0279159545898438e-05,
"learning_rate": 3.6e-07,
"loss": 0.007,
"reward": 0.02408734685741365,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09618540899828076,
"rewards/cosine_scaled_reward": -0.11675130156800151,
"rewards/format_reward": 0.37500000931322575,
"step": 18
},
{
"advantage_max": 0.2786301076412201,
"advantage_mean": -2.0178656801039807e-09,
"advantage_min": -0.17539117764681578,
"advantage_std": 0.1897407090291381,
"completion_length": 3042.479202270508,
"epoch": 0.021714285714285714,
"grad_norm": 0.0366692878305912,
"kl": 2.7358531951904297e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0142,
"reward": 0.08209404302760959,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1897407118231058,
"rewards/cosine_scaled_reward": 0.05350984400138259,
"rewards/format_reward": 0.37500000186264515,
"step": 19
},
{
"advantage_max": 0.17369511630386114,
"advantage_mean": -3.7252902707063384e-09,
"advantage_min": -0.143904535099864,
"advantage_std": 0.12505553639493883,
"completion_length": 2488.437587738037,
"epoch": 0.022857142857142857,
"grad_norm": 0.026572400704026222,
"kl": 1.1764466762542725e-05,
"learning_rate": 4e-07,
"loss": 0.0135,
"reward": 0.09353481137077324,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12505554361268878,
"rewards/cosine_scaled_reward": -0.01666953694075346,
"rewards/format_reward": 0.5833333395421505,
"step": 20
},
{
"advantage_max": 0.15747881215065718,
"advantage_mean": -3.1044090909038147e-10,
"advantage_min": -0.09395024552941322,
"advantage_std": 0.09924266301095486,
"completion_length": 2713.125015258789,
"epoch": 0.024,
"grad_norm": 0.016476722434163094,
"kl": 3.544241189956665e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0059,
"reward": 0.06582744396291673,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0992426648736,
"rewards/cosine_scaled_reward": -0.02483982127159834,
"rewards/format_reward": 0.43750000558793545,
"step": 21
},
{
"advantage_max": 0.1536049460992217,
"advantage_mean": -5.82076628563577e-09,
"advantage_min": -0.19146334286779165,
"advantage_std": 0.13952347543090582,
"completion_length": 1874.5417251586914,
"epoch": 0.025142857142857144,
"grad_norm": 0.02956153266131878,
"kl": 2.4802982807159424e-05,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0068,
"reward": 0.12891051033511758,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1395234796218574,
"rewards/cosine_scaled_reward": -0.014458773657679558,
"rewards/format_reward": 0.7916666772216558,
"step": 22
},
{
"advantage_max": 0.23872516956180334,
"advantage_mean": 1.3969838758010056e-09,
"advantage_min": -0.15435245260596275,
"advantage_std": 0.15457574743777514,
"completion_length": 2569.3125610351562,
"epoch": 0.026285714285714287,
"grad_norm": 0.026004575192928314,
"kl": 3.0472874641418457e-05,
"learning_rate": 4.6e-07,
"loss": 0.0088,
"reward": 0.06847280421061441,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15457575675100088,
"rewards/cosine_scaled_reward": -0.04925672709941864,
"rewards/format_reward": 0.5000000149011612,
"step": 23
},
{
"advantage_max": 0.18653128948062658,
"advantage_mean": -2.444721682037798e-09,
"advantage_min": -0.1594811975955963,
"advantage_std": 0.13956549763679504,
"completion_length": 2802.395881652832,
"epoch": 0.027428571428571427,
"grad_norm": 0.020463019609451294,
"kl": 2.32793390750885e-05,
"learning_rate": 4.8e-07,
"loss": 0.0016,
"reward": 0.10617550695315003,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13956550369039178,
"rewards/cosine_scaled_reward": 0.03222842514514923,
"rewards/format_reward": 0.5625000186264515,
"step": 24
},
{
"advantage_max": 0.16575950756669044,
"advantage_mean": -1.7074248265247505e-09,
"advantage_min": -0.138729483820498,
"advantage_std": 0.1247619753703475,
"completion_length": 2796.5208740234375,
"epoch": 0.02857142857142857,
"grad_norm": 0.02838418260216713,
"kl": 3.729015588760376e-05,
"learning_rate": 5e-07,
"loss": 0.0122,
"reward": 0.032140296418219805,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12476197723299265,
"rewards/cosine_scaled_reward": -0.10717500746250153,
"rewards/format_reward": 0.39583334140479565,
"step": 25
},
{
"advantage_max": 0.1065903976559639,
"advantage_mean": 2.5611371079481415e-09,
"advantage_min": -0.12964865937829018,
"advantage_std": 0.096164018381387,
"completion_length": 3076.7708740234375,
"epoch": 0.029714285714285714,
"grad_norm": 0.023165522143244743,
"kl": 3.162771463394165e-05,
"learning_rate": 5.2e-07,
"loss": 0.0058,
"reward": 0.05766999162733555,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09616402024403214,
"rewards/cosine_scaled_reward": -0.03639649413526058,
"rewards/format_reward": 0.416666679084301,
"step": 26
},
{
"advantage_max": 0.2176275816746056,
"advantage_mean": -2.2700988727697435e-09,
"advantage_min": -0.14101197582203895,
"advantage_std": 0.14790143747814,
"completion_length": 3005.291702270508,
"epoch": 0.030857142857142857,
"grad_norm": 0.02873804420232773,
"kl": 2.8399168513715267e-05,
"learning_rate": 5.4e-07,
"loss": 0.0111,
"reward": 0.04077844490529969,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14790144679136574,
"rewards/cosine_scaled_reward": -0.0776456345920451,
"rewards/format_reward": 0.39583333767950535,
"step": 27
},
{
"advantage_max": 0.15818555373698473,
"advantage_mean": -2.7551626674560126e-09,
"advantage_min": -0.17423970997333527,
"advantage_std": 0.13145049894228578,
"completion_length": 2832.729202270508,
"epoch": 0.032,
"grad_norm": 0.018948372453451157,
"kl": 3.7573277950286865e-05,
"learning_rate": 5.6e-07,
"loss": 0.0082,
"reward": 0.0805886962916702,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1314505017362535,
"rewards/cosine_scaled_reward": 0.016917362809181213,
"rewards/format_reward": 0.43750000558793545,
"step": 28
},
{
"advantage_max": 0.18313346058130264,
"advantage_mean": 3.1820188567399654e-09,
"advantage_min": -0.1069188816472888,
"advantage_std": 0.11597575852647424,
"completion_length": 3305.0208740234375,
"epoch": 0.03314285714285714,
"grad_norm": 0.03014214336872101,
"kl": 2.7902424335479736e-05,
"learning_rate": 5.8e-07,
"loss": 0.0136,
"reward": -0.044296178268268704,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.11597575340420008,
"rewards/cosine_scaled_reward": -0.22412699135020375,
"rewards/format_reward": 0.18750000558793545,
"step": 29
},
{
"advantage_max": 0.24942097766324878,
"advantage_mean": 4.3461721582760404e-09,
"advantage_min": -0.1541912415996194,
"advantage_std": 0.15846544690430164,
"completion_length": 2920.562545776367,
"epoch": 0.03428571428571429,
"grad_norm": 0.022144218906760216,
"kl": 2.2359192371368408e-05,
"learning_rate": 6e-07,
"loss": 0.0071,
"reward": 0.06114856945350766,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15846544643864036,
"rewards/cosine_scaled_reward": -0.04798364010639489,
"rewards/format_reward": 0.4583333469927311,
"step": 30
},
{
"advantage_max": 0.1647213133983314,
"advantage_mean": 2.2506963098800625e-09,
"advantage_min": -0.11298016970977187,
"advantage_std": 0.11202249862253666,
"completion_length": 3126.5833587646484,
"epoch": 0.03542857142857143,
"grad_norm": 0.02341640554368496,
"kl": 1.6361474990844727e-05,
"learning_rate": 6.2e-07,
"loss": 0.0102,
"reward": -0.011873322539031506,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11202250327914953,
"rewards/cosine_scaled_reward": -0.1413223911076784,
"rewards/format_reward": 0.2083333395421505,
"step": 31
},
{
"advantage_max": 0.1927646165713668,
"advantage_mean": -4.0357314226580066e-09,
"advantage_min": -0.1581784477457404,
"advantage_std": 0.12995850760489702,
"completion_length": 3253.125030517578,
"epoch": 0.036571428571428574,
"grad_norm": 0.016674285754561424,
"kl": 1.6938894987106323e-05,
"learning_rate": 6.4e-07,
"loss": 0.004,
"reward": 0.0447476077824831,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12995851668529212,
"rewards/cosine_scaled_reward": -0.0544386301189661,
"rewards/format_reward": 0.3750000149011612,
"step": 32
},
{
"advantage_max": 0.20177126210182905,
"advantage_mean": -1.241763414316921e-09,
"advantage_min": -0.18795710895210505,
"advantage_std": 0.15662608901038766,
"completion_length": 3411.541717529297,
"epoch": 0.037714285714285714,
"grad_norm": 0.02308499813079834,
"kl": 2.4718232452869415e-05,
"learning_rate": 6.6e-07,
"loss": 0.0056,
"reward": 0.04366421408485621,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15662609227001667,
"rewards/cosine_scaled_reward": -0.026438521221280098,
"rewards/format_reward": 0.31250000558793545,
"step": 33
},
{
"advantage_max": 0.17956872167997062,
"advantage_mean": -3.104408619059029e-09,
"advantage_min": -0.14220268558710814,
"advantage_std": 0.12088308949023485,
"completion_length": 2531.812530517578,
"epoch": 0.038857142857142854,
"grad_norm": 0.014704632572829723,
"kl": 6.726384162902832e-05,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0015,
"reward": 0.09752498054876924,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12088309531100094,
"rewards/cosine_scaled_reward": 0.025213209679350257,
"rewards/format_reward": 0.520833333954215,
"step": 34
},
{
"advantage_max": 0.29936027619987726,
"advantage_mean": -5.551115123125783e-17,
"advantage_min": -0.172477250918746,
"advantage_std": 0.19010970601812005,
"completion_length": 2973.3750534057617,
"epoch": 0.04,
"grad_norm": 0.028409497812390327,
"kl": 5.359947681427002e-05,
"learning_rate": 7e-07,
"loss": 0.0123,
"reward": 0.04867997905239463,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.19010971300303936,
"rewards/cosine_scaled_reward": -0.024306990206241608,
"rewards/format_reward": 0.3333333395421505,
"step": 35
},
{
"advantage_max": 0.13590538362041116,
"advantage_mean": 9.313226370655237e-10,
"advantage_min": -0.09683632245287299,
"advantage_std": 0.08833803655579686,
"completion_length": 3288.0833740234375,
"epoch": 0.04114285714285714,
"grad_norm": 0.015868162736296654,
"kl": 7.23712146282196e-05,
"learning_rate": 7.2e-07,
"loss": 0.0045,
"reward": -0.03358328447211534,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0883380388841033,
"rewards/cosine_scaled_reward": -0.22424227092415094,
"rewards/format_reward": 0.2500000074505806,
"step": 36
},
{
"advantage_max": 0.07873150100931525,
"advantage_mean": -3.1044085357923024e-10,
"advantage_min": -0.07795562036335468,
"advantage_std": 0.061489060055464506,
"completion_length": 3361.562530517578,
"epoch": 0.04228571428571429,
"grad_norm": 0.009638470597565174,
"kl": 2.0228326320648193e-05,
"learning_rate": 7.4e-07,
"loss": 0.0021,
"reward": -0.050579807022586465,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.06148906052112579,
"rewards/cosine_scaled_reward": -0.24296171963214874,
"rewards/format_reward": 0.1875,
"step": 37
},
{
"advantage_max": 0.1380753773264587,
"advantage_mean": -6.20881665525097e-10,
"advantage_min": -0.1019920501857996,
"advantage_std": 0.09536230750381947,
"completion_length": 3306.4583587646484,
"epoch": 0.04342857142857143,
"grad_norm": 0.015968551859259605,
"kl": 4.314631223678589e-05,
"learning_rate": 7.599999999999999e-07,
"loss": 0.002,
"reward": -0.031228411942720413,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09536230750381947,
"rewards/cosine_scaled_reward": -0.16523530799895525,
"rewards/format_reward": 0.14583333395421505,
"step": 38
},
{
"advantage_max": 0.1628796993754804,
"advantage_mean": -1.7074248126469627e-09,
"advantage_min": -0.1276350189000368,
"advantage_std": 0.1117043545236811,
"completion_length": 2918.7291984558105,
"epoch": 0.044571428571428574,
"grad_norm": 0.019875982776284218,
"kl": 6.305798888206482e-05,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0022,
"reward": 0.05022500859195134,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.11170436086831614,
"rewards/cosine_scaled_reward": -0.05875556939281523,
"rewards/format_reward": 0.4166666679084301,
"step": 39
},
{
"advantage_max": 0.1351394895464182,
"advantage_mean": 2.1730860721991263e-09,
"advantage_min": -0.12486388254910707,
"advantage_std": 0.10662073362618685,
"completion_length": 2454.8958587646484,
"epoch": 0.045714285714285714,
"grad_norm": 0.022016212344169617,
"kl": 0.0002264752984046936,
"learning_rate": 8e-07,
"loss": 0.0066,
"reward": 0.06583994440734386,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10662073455750942,
"rewards/cosine_scaled_reward": -0.07870683167129755,
"rewards/format_reward": 0.541666679084301,
"step": 40
},
{
"advantage_max": 0.1448317738249898,
"advantage_mean": 1.3969838966176873e-09,
"advantage_min": -0.1136879026889801,
"advantage_std": 0.10412771673873067,
"completion_length": 3089.3333740234375,
"epoch": 0.046857142857142854,
"grad_norm": 0.01577775366604328,
"kl": 4.999339580535889e-05,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0071,
"reward": -0.007575191382784396,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10412772092968225,
"rewards/cosine_scaled_reward": -0.20044021122157574,
"rewards/format_reward": 0.3541666716337204,
"step": 41
},
{
"advantage_max": 0.1294115127529949,
"advantage_mean": 8.149072944219071e-10,
"advantage_min": -0.08364611677825451,
"advantage_std": 0.08207520749419928,
"completion_length": 2773.3333702087402,
"epoch": 0.048,
"grad_norm": 0.014152735471725464,
"kl": 0.00012195669114589691,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0055,
"reward": -0.024103335803374648,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08207521075382829,
"rewards/cosine_scaled_reward": -0.23803024366497993,
"rewards/format_reward": 0.33333333395421505,
"step": 42
},
{
"advantage_max": 0.1437476323917508,
"advantage_mean": 1.5522043927962415e-09,
"advantage_min": -0.11569147277623415,
"advantage_std": 0.09973477618768811,
"completion_length": 3171.854202270508,
"epoch": 0.04914285714285714,
"grad_norm": 0.014957522042095661,
"kl": 4.2708590626716614e-05,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0047,
"reward": -0.022214435506612062,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09973477479070425,
"rewards/cosine_scaled_reward": -0.16881779208779335,
"rewards/format_reward": 0.2083333358168602,
"step": 43
},
{
"advantage_max": 0.17799750808626413,
"advantage_mean": -1.7074247293802358e-09,
"advantage_min": -0.16484235506504774,
"advantage_std": 0.14439343940466642,
"completion_length": 2845.7708892822266,
"epoch": 0.05028571428571429,
"grad_norm": 0.0470295213162899,
"kl": 0.00029357708990573883,
"learning_rate": 8.799999999999999e-07,
"loss": 0.023,
"reward": 0.0359237277880311,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14439343893900514,
"rewards/cosine_scaled_reward": -0.05193283036351204,
"rewards/format_reward": 0.31250000931322575,
"step": 44
},
{
"advantage_max": 0.19965059403330088,
"advantage_mean": -8.537123091789667e-10,
"advantage_min": -0.1259068874642253,
"advantage_std": 0.1281011113896966,
"completion_length": 3430.0833740234375,
"epoch": 0.05142857142857143,
"grad_norm": 0.018138015642762184,
"kl": 6.644893437623978e-05,
"learning_rate": 9e-07,
"loss": 0.0027,
"reward": 0.02113422704860568,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12810111325234175,
"rewards/cosine_scaled_reward": -0.06111693615093827,
"rewards/format_reward": 0.2500000074505806,
"step": 45
},
{
"advantage_max": 0.11925685312598944,
"advantage_mean": 1.2417634420724966e-09,
"advantage_min": -0.06791578326374292,
"advantage_std": 0.07461203960701823,
"completion_length": 3229.770835876465,
"epoch": 0.052571428571428575,
"grad_norm": 0.010699857957661152,
"kl": 0.0001803375780582428,
"learning_rate": 9.2e-07,
"loss": 0.0005,
"reward": -0.06760753598064184,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07461204146966338,
"rewards/cosine_scaled_reward": -0.2723505459725857,
"rewards/format_reward": 0.14583333395421505,
"step": 46
},
{
"advantage_max": 0.17866623401641846,
"advantage_mean": -7.6834113516e-09,
"advantage_min": -0.2375992350280285,
"advantage_std": 0.1749500371515751,
"completion_length": 2958.6250610351562,
"epoch": 0.053714285714285714,
"grad_norm": 0.029765864834189415,
"kl": 6.363540887832642e-05,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0121,
"reward": 0.11308036895934492,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17495004460215569,
"rewards/cosine_scaled_reward": 0.07415074668824673,
"rewards/format_reward": 0.5208333432674408,
"step": 47
},
{
"advantage_max": 0.19035830302163959,
"advantage_mean": -3.4924599323638006e-10,
"advantage_min": -0.16151767084375024,
"advantage_std": 0.1434162282384932,
"completion_length": 2765.6250228881836,
"epoch": 0.054857142857142854,
"grad_norm": 0.02456662431359291,
"kl": 0.00045037176460027695,
"learning_rate": 9.6e-07,
"loss": 0.0078,
"reward": 0.07018421730026603,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1434162249788642,
"rewards/cosine_scaled_reward": -0.013612153008580208,
"rewards/format_reward": 0.43750000558793545,
"step": 48
},
{
"advantage_max": 0.2279358534142375,
"advantage_mean": -1.4745941204208357e-09,
"advantage_min": -0.14564000815153122,
"advantage_std": 0.15678073978051543,
"completion_length": 2342.354232788086,
"epoch": 0.056,
"grad_norm": 0.02502998150885105,
"kl": 0.00013617053627967834,
"learning_rate": 9.8e-07,
"loss": 0.0089,
"reward": 0.06247584073571488,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15678074583411217,
"rewards/cosine_scaled_reward": -0.09697807114571333,
"rewards/format_reward": 0.5625000037252903,
"step": 49
},
{
"advantage_max": 0.122623095754534,
"advantage_mean": -1.3969838619232178e-09,
"advantage_min": -0.1242001224309206,
"advantage_std": 0.1060920343734324,
"completion_length": 2923.5625228881836,
"epoch": 0.05714285714285714,
"grad_norm": 0.021566810086369514,
"kl": 0.00023385882377624512,
"learning_rate": 1e-06,
"loss": 0.0096,
"reward": 0.038516357075423,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10609203530475497,
"rewards/cosine_scaled_reward": -0.03362384531646967,
"rewards/format_reward": 0.29166666977107525,
"step": 50
},
{
"advantage_max": 0.10491376649588346,
"advantage_mean": -3.647680130169917e-09,
"advantage_min": -0.16684891190379858,
"advantage_std": 0.10262906912248582,
"completion_length": 2406.3750228881836,
"epoch": 0.05828571428571429,
"grad_norm": 0.02082122303545475,
"kl": 0.000521540641784668,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0026,
"reward": 0.062345280312001705,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10262907319702208,
"rewards/cosine_scaled_reward": -0.05357350129634142,
"rewards/format_reward": 0.47916667722165585,
"step": 51
},
{
"advantage_max": 0.23814772348850965,
"advantage_mean": -2.2506961641632905e-09,
"advantage_min": -0.21308327466249466,
"advantage_std": 0.1857591886073351,
"completion_length": 2921.333366394043,
"epoch": 0.05942857142857143,
"grad_norm": 0.026340872049331665,
"kl": 0.00041694939136505127,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0094,
"reward": 0.09964685700833797,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.18575919978320599,
"rewards/cosine_scaled_reward": 0.08640430495142937,
"rewards/format_reward": 0.4166666753590107,
"step": 52
},
{
"advantage_max": 0.22019695164635777,
"advantage_mean": -7.761021547647573e-10,
"advantage_min": -0.18540269322693348,
"advantage_std": 0.17194287246093154,
"completion_length": 2839.1250610351562,
"epoch": 0.060571428571428575,
"grad_norm": 0.02775190770626068,
"kl": 0.00034201203379780054,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0087,
"reward": 0.0726233726600185,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.17194287665188313,
"rewards/cosine_scaled_reward": -0.035814402624964714,
"rewards/format_reward": 0.5000000093132257,
"step": 53
},
{
"advantage_max": 0.15818986017256975,
"advantage_mean": -8.537123299956484e-10,
"advantage_min": -0.15511069353669882,
"advantage_std": 0.13614196004346013,
"completion_length": 2918.3958892822266,
"epoch": 0.061714285714285715,
"grad_norm": 0.02299380674958229,
"kl": 0.0001678699627518654,
"learning_rate": 9.998245517681593e-07,
"loss": 0.009,
"reward": 0.15431041596457362,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1361419651657343,
"rewards/cosine_scaled_reward": 0.18133503943681717,
"rewards/format_reward": 0.5416666772216558,
"step": 54
},
{
"advantage_max": 0.19129236973822117,
"advantage_mean": -2.949188199208308e-09,
"advantage_min": -0.1691391160711646,
"advantage_std": 0.14046459831297398,
"completion_length": 2980.0208892822266,
"epoch": 0.06285714285714286,
"grad_norm": 0.020739315077662468,
"kl": 0.0004740804433822632,
"learning_rate": 9.997258721585931e-07,
"loss": 0.01,
"reward": 0.0497954161837697,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14046460296958685,
"rewards/cosine_scaled_reward": -0.019190420862287283,
"rewards/format_reward": 0.33333333767950535,
"step": 55
},
{
"advantage_max": 0.08986138668842614,
"advantage_mean": -7.450580721823918e-09,
"advantage_min": -0.18162237294018269,
"advantage_std": 0.10682848328724504,
"completion_length": 2916.1666870117188,
"epoch": 0.064,
"grad_norm": 0.0158238485455513,
"kl": 0.0002256631851196289,
"learning_rate": 9.996052735444862e-07,
"loss": 0.007,
"reward": 0.06883894634665921,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10682848887518048,
"rewards/cosine_scaled_reward": 0.016542285680770874,
"rewards/format_reward": 0.3750000074505806,
"step": 56
},
{
"advantage_max": 0.159211162943393,
"advantage_mean": -1.4745940371541089e-09,
"advantage_min": -0.14113077148795128,
"advantage_std": 0.11528732301667333,
"completion_length": 3302.312530517578,
"epoch": 0.06514285714285714,
"grad_norm": 0.01696748286485672,
"kl": 0.00016479287296533585,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0063,
"reward": 0.011191772297024727,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11528732441365719,
"rewards/cosine_scaled_reward": -0.10429812222719193,
"rewards/format_reward": 0.27083334513008595,
"step": 57
},
{
"advantage_max": 0.18715458177030087,
"advantage_mean": -5.277494732891519e-09,
"advantage_min": -0.171913824044168,
"advantage_std": 0.13622049521654844,
"completion_length": 2435.8542404174805,
"epoch": 0.06628571428571428,
"grad_norm": 0.019368024542927742,
"kl": 0.0015213489532470703,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0105,
"reward": 0.14305569988209754,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13622050173580647,
"rewards/cosine_scaled_reward": 0.08871240820735693,
"rewards/format_reward": 0.6666666753590107,
"step": 58
},
{
"advantage_max": 0.14700328465551138,
"advantage_mean": -1.862645301886623e-09,
"advantage_min": -0.1413423651829362,
"advantage_std": 0.12064886884763837,
"completion_length": 2893.4791870117188,
"epoch": 0.06742857142857143,
"grad_norm": 0.02102687954902649,
"kl": 0.0005481839179992676,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0087,
"reward": 0.034189446829259396,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12064887676388025,
"rewards/cosine_scaled_reward": -0.0558818019926548,
"rewards/format_reward": 0.31250000558793545,
"step": 59
},
{
"advantage_max": 0.1409899704158306,
"advantage_mean": -3.880510732190423e-09,
"advantage_min": -0.11852918658405542,
"advantage_std": 0.11182335065677762,
"completion_length": 2964.1459045410156,
"epoch": 0.06857142857142857,
"grad_norm": 0.01700800471007824,
"kl": 0.0003733038902282715,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0015,
"reward": 0.008576460648328066,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11182335112243891,
"rewards/cosine_scaled_reward": -0.16191277094185352,
"rewards/format_reward": 0.37500000186264515,
"step": 60
},
{
"advantage_max": 0.15250339172780514,
"advantage_mean": 2.173086086076914e-09,
"advantage_min": -0.1706991521641612,
"advantage_std": 0.13841882860288024,
"completion_length": 3171.541717529297,
"epoch": 0.06971428571428571,
"grad_norm": 0.03268995136022568,
"kl": 0.0005887793377041817,
"learning_rate": 9.98673738502114e-07,
"loss": 0.014,
"reward": 0.024381998693570495,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.13841883558779955,
"rewards/cosine_scaled_reward": -0.12546866945922375,
"rewards/format_reward": 0.39583334140479565,
"step": 61
},
{
"advantage_max": 0.2628666125237942,
"advantage_mean": -1.4745941620541991e-09,
"advantage_min": -0.18349808733910322,
"advantage_std": 0.1721328515559435,
"completion_length": 2700.2500610351562,
"epoch": 0.07085714285714285,
"grad_norm": 0.026928190141916275,
"kl": 0.0014960318803787231,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0187,
"reward": 0.08059305348433554,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17213285621255636,
"rewards/cosine_scaled_reward": -0.04422247753245756,
"rewards/format_reward": 0.5625000074505806,
"step": 62
},
{
"advantage_max": 0.17473032884299755,
"advantage_mean": -1.4745942522598199e-09,
"advantage_min": -0.19655942358076572,
"advantage_std": 0.16620324458926916,
"completion_length": 2423.541732788086,
"epoch": 0.072,
"grad_norm": 0.0306819137185812,
"kl": 0.0013110339641571045,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0283,
"reward": 0.0984296789392829,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.16620325110852718,
"rewards/cosine_scaled_reward": 0.008712463080883026,
"rewards/format_reward": 0.5625000093132257,
"step": 63
},
{
"advantage_max": 0.16565060429275036,
"advantage_mean": -2.7939678071131624e-09,
"advantage_min": -0.16477500926703215,
"advantage_std": 0.14017474581487477,
"completion_length": 2889.3750610351562,
"epoch": 0.07314285714285715,
"grad_norm": 0.020427115261554718,
"kl": 0.0007784366607666016,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0132,
"reward": 0.05052297201473266,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14017474791035056,
"rewards/cosine_scaled_reward": -0.07072503212839365,
"rewards/format_reward": 0.4375000074505806,
"step": 64
},
{
"advantage_max": 0.127271534409374,
"advantage_mean": -1.3969838619232178e-09,
"advantage_min": -0.0950427707284689,
"advantage_std": 0.08464562566950917,
"completion_length": 2746.354179382324,
"epoch": 0.07428571428571429,
"grad_norm": 0.013379656709730625,
"kl": 0.0009982585906982422,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0042,
"reward": 0.03162489866372198,
"reward_advantage_correlation": 0.9999999999999997,
"reward_std": 0.08464562753215432,
"rewards/cosine_scaled_reward": -0.11611694470047951,
"rewards/format_reward": 0.41666666977107525,
"step": 65
},
{
"advantage_max": 0.149964171461761,
"advantage_mean": -3.5700701006557978e-09,
"advantage_min": -0.13461025152355433,
"advantage_std": 0.10918906982988119,
"completion_length": 2079.6041946411133,
"epoch": 0.07542857142857143,
"grad_norm": 0.014194848015904427,
"kl": 0.0012865066528320312,
"learning_rate": 9.971955636222684e-07,
"loss": 0.004,
"reward": 0.11370784028986236,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10918907588347793,
"rewards/cosine_scaled_reward": 0.05414431728422642,
"rewards/format_reward": 0.5625000018626451,
"step": 66
},
{
"advantage_max": 0.08563538501039147,
"advantage_mean": 2.1730860513824446e-09,
"advantage_min": -0.07122973585501313,
"advantage_std": 0.05874074366874993,
"completion_length": 3511.4375,
"epoch": 0.07657142857142857,
"grad_norm": 0.008974037133157253,
"kl": 0.0012157298624515533,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0014,
"reward": -0.08041232687537558,
"reward_advantage_correlation": 1.0,
"reward_std": 0.05874074646271765,
"rewards/cosine_scaled_reward": -0.2987704258412123,
"rewards/format_reward": 0.1250000037252903,
"step": 67
},
{
"advantage_max": 0.1881636488251388,
"advantage_mean": -1.2417634281947088e-09,
"advantage_min": -0.17326833494007587,
"advantage_std": 0.16107123950496316,
"completion_length": 2121.895866394043,
"epoch": 0.07771428571428571,
"grad_norm": 0.02590767852962017,
"kl": 0.004287242889404297,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0219,
"reward": 0.07258323905989528,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1610712418332696,
"rewards/cosine_scaled_reward": -0.08903148956596851,
"rewards/format_reward": 0.6041666716337204,
"step": 68
},
{
"advantage_max": 0.17149817757308483,
"advantage_mean": 6.208817349140361e-10,
"advantage_min": -0.09525131899863482,
"advantage_std": 0.1044535138644278,
"completion_length": 2714.416748046875,
"epoch": 0.07885714285714286,
"grad_norm": 0.017063690349459648,
"kl": 0.0025910139083862305,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0049,
"reward": -0.0257909067440778,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10445351805537939,
"rewards/cosine_scaled_reward": -0.26491281390190125,
"rewards/format_reward": 0.3750000037252903,
"step": 69
},
{
"advantage_max": 0.13912878511473536,
"advantage_mean": -1.5522043234073024e-09,
"advantage_min": -0.1294058826752007,
"advantage_std": 0.11556470859795809,
"completion_length": 3043.625015258789,
"epoch": 0.08,
"grad_norm": 0.02166864648461342,
"kl": 0.0016424953937530518,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0011,
"reward": 0.030413513217354193,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11556471651419997,
"rewards/cosine_scaled_reward": -0.11716132145375013,
"rewards/format_reward": 0.4166666716337204,
"step": 70
},
{
"advantage_max": 0.1449303338304162,
"advantage_mean": -8.537123299956484e-10,
"advantage_min": -0.12476017605513334,
"advantage_std": 0.11173301562666893,
"completion_length": 2603.7291717529297,
"epoch": 0.08114285714285714,
"grad_norm": 0.012032200582325459,
"kl": 0.0020183324813842773,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0006,
"reward": 0.041470743250101805,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11173302121460438,
"rewards/cosine_scaled_reward": -0.06599017698317766,
"rewards/format_reward": 0.375,
"step": 71
},
{
"advantage_max": 0.1914712623693049,
"advantage_mean": -1.2417634420724966e-09,
"advantage_min": -0.14899978134781122,
"advantage_std": 0.12655010493472219,
"completion_length": 3024.2083740234375,
"epoch": 0.08228571428571428,
"grad_norm": 0.025330260396003723,
"kl": 0.0027605295181274414,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0173,
"reward": -0.00684193754568696,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12655011098831892,
"rewards/cosine_scaled_reward": -0.18876908160746098,
"rewards/format_reward": 0.33333334140479565,
"step": 72
},
{
"advantage_max": 0.18401953671127558,
"advantage_mean": 9.31322616248842e-10,
"advantage_min": -0.12465803744271398,
"advantage_std": 0.12810300663113594,
"completion_length": 3407.2500610351562,
"epoch": 0.08342857142857144,
"grad_norm": 0.023279855027794838,
"kl": 0.0006111264228820801,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0084,
"reward": -0.0037489386450033635,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12810300663113594,
"rewards/cosine_scaled_reward": -0.12534647807478905,
"rewards/format_reward": 0.2291666679084301,
"step": 73
},
{
"advantage_max": 0.12516837287694216,
"advantage_mean": -1.6686196174786616e-09,
"advantage_min": -0.1078806221485138,
"advantage_std": 0.09417479066178203,
"completion_length": 3254.875030517578,
"epoch": 0.08457142857142858,
"grad_norm": 0.019327718764543533,
"kl": 0.0016609132289886475,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0035,
"reward": 0.026729536708444357,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09417479438707232,
"rewards/cosine_scaled_reward": -0.055625975131988525,
"rewards/format_reward": 0.27083333767950535,
"step": 74
},
{
"advantage_max": 0.14641679031774402,
"advantage_mean": -7.528190813788083e-09,
"advantage_min": -0.13670456875115633,
"advantage_std": 0.11592453811317682,
"completion_length": 3050.3541870117188,
"epoch": 0.08571428571428572,
"grad_norm": 0.018251126632094383,
"kl": 0.0018388032913208008,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0044,
"reward": 0.07386055216193199,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11592454044148326,
"rewards/cosine_scaled_reward": 0.03964068624190986,
"rewards/format_reward": 0.3541666753590107,
"step": 75
},
{
"advantage_max": 0.11959696374833584,
"advantage_mean": 3.1044086745701804e-10,
"advantage_min": -0.11977787129580975,
"advantage_std": 0.10093671828508377,
"completion_length": 2620.500045776367,
"epoch": 0.08685714285714285,
"grad_norm": 0.02156541682779789,
"kl": 0.0006852000951766968,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0103,
"reward": 0.026126212440431118,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10093672387301922,
"rewards/cosine_scaled_reward": -0.18299918808043003,
"rewards/format_reward": 0.5208333469927311,
"step": 76
},
{
"advantage_max": 0.1113869184628129,
"advantage_mean": -4.287964372462483e-09,
"advantage_min": -0.1384631348773837,
"advantage_std": 0.0967109005432576,
"completion_length": 3143.729217529297,
"epoch": 0.088,
"grad_norm": 0.016500068828463554,
"kl": 0.001096084713935852,
"learning_rate": 9.9202926282791e-07,
"loss": -0.0054,
"reward": 0.04420704103540629,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09671090613119304,
"rewards/cosine_scaled_reward": -0.058531179500278085,
"rewards/format_reward": 0.37500000931322575,
"step": 77
},
{
"advantage_max": 0.16691427025943995,
"advantage_mean": 1.2417634420724966e-09,
"advantage_min": -0.1529896855354309,
"advantage_std": 0.13225172739475965,
"completion_length": 3109.354202270508,
"epoch": 0.08914285714285715,
"grad_norm": 0.021866677328944206,
"kl": 0.0009225308895111084,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0065,
"reward": 0.03236245736479759,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13225173251703382,
"rewards/cosine_scaled_reward": -0.09289113059639931,
"rewards/format_reward": 0.3750000074505806,
"step": 78
},
{
"advantage_max": 0.15700082294642925,
"advantage_mean": 1.125348175756713e-09,
"advantage_min": -0.10489925090223551,
"advantage_std": 0.10893942508846521,
"completion_length": 2253.1250534057617,
"epoch": 0.09028571428571429,
"grad_norm": 0.017613250762224197,
"kl": 0.0023194551467895508,
"learning_rate": 9.908088623197048e-07,
"loss": -0.0019,
"reward": 0.0731448968872428,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1089394255541265,
"rewards/cosine_scaled_reward": -0.08033954538404942,
"rewards/format_reward": 0.5833333358168602,
"step": 79
},
{
"advantage_max": 0.15978450048714876,
"advantage_mean": 4.6566125261326974e-10,
"advantage_min": -0.13024994870647788,
"advantage_std": 0.11621860601007938,
"completion_length": 3283.041717529297,
"epoch": 0.09142857142857143,
"grad_norm": 0.019476035609841347,
"kl": 0.0017393827438354492,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0064,
"reward": -0.002654203213751316,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11621860833838582,
"rewards/cosine_scaled_reward": -0.1530076563358307,
"rewards/format_reward": 0.29166666977107525,
"step": 80
},
{
"advantage_max": 0.1000084918923676,
"advantage_mean": -5.432714486608425e-10,
"advantage_min": -0.12673839554190636,
"advantage_std": 0.0900690802372992,
"completion_length": 3145.875030517578,
"epoch": 0.09257142857142857,
"grad_norm": 0.016789212822914124,
"kl": 0.005313873291015625,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0054,
"reward": 0.0005171550437808037,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09006908535957336,
"rewards/cosine_scaled_reward": -0.1340514589101076,
"rewards/format_reward": 0.2708333432674408,
"step": 81
},
{
"advantage_max": 0.16784677654504776,
"advantage_mean": -6.519258133330652e-09,
"advantage_min": -0.13112404569983482,
"advantage_std": 0.12467234069481492,
"completion_length": 2761.5208435058594,
"epoch": 0.09371428571428571,
"grad_norm": 0.02522902935743332,
"kl": 0.0028878003358840942,
"learning_rate": 9.888172094375033e-07,
"loss": 0.0019,
"reward": 0.0789759517647326,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12467234022915363,
"rewards/cosine_scaled_reward": 0.024436804931610823,
"rewards/format_reward": 0.4166666716337204,
"step": 82
},
{
"advantage_max": 0.1500550713390112,
"advantage_mean": -8.537122953011789e-10,
"advantage_min": -0.13098668679594994,
"advantage_std": 0.10632651299238205,
"completion_length": 2842.187530517578,
"epoch": 0.09485714285714286,
"grad_norm": 0.017034098505973816,
"kl": 0.0017483234405517578,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0024,
"reward": 0.019357941579073668,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.10632651718333364,
"rewards/cosine_scaled_reward": -0.12056602351367474,
"rewards/format_reward": 0.3541666679084301,
"step": 83
},
{
"advantage_max": 0.1495693982578814,
"advantage_mean": -1.474594099604154e-09,
"advantage_min": -0.19178100768476725,
"advantage_std": 0.1472853058949113,
"completion_length": 3046.791717529297,
"epoch": 0.096,
"grad_norm": 0.027498627081513405,
"kl": 0.001116037368774414,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0056,
"reward": 0.10806799679994583,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1472853091545403,
"rewards/cosine_scaled_reward": 0.09858678560703993,
"rewards/format_reward": 0.43750001303851604,
"step": 84
},
{
"advantage_max": 0.2928034896031022,
"advantage_mean": 8.537124340790569e-10,
"advantage_min": -0.21211090218275785,
"advantage_std": 0.20124134561046958,
"completion_length": 2936.0209045410156,
"epoch": 0.09714285714285714,
"grad_norm": 0.027292873710393906,
"kl": 0.0011175870895385742,
"learning_rate": 9.866330768241983e-07,
"loss": 0.011,
"reward": 0.09737453208072111,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.20124135678634048,
"rewards/cosine_scaled_reward": 0.017047576373443007,
"rewards/format_reward": 0.5416666772216558,
"step": 85
},
{
"advantage_max": 0.161500733345747,
"advantage_mean": 2.949188199208308e-09,
"advantage_min": -0.12576205004006624,
"advantage_std": 0.11738913925364614,
"completion_length": 3064.6042098999023,
"epoch": 0.09828571428571428,
"grad_norm": 0.019446710124611855,
"kl": 0.0017948150634765625,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0081,
"reward": 0.030952767468988895,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11738914204761386,
"rewards/cosine_scaled_reward": -0.08462417311966419,
"rewards/format_reward": 0.3541666679084301,
"step": 86
},
{
"advantage_max": 0.1681693554855883,
"advantage_mean": -1.8626452047421083e-09,
"advantage_min": -0.1518132919445634,
"advantage_std": 0.1314193387515843,
"completion_length": 2709.166702270508,
"epoch": 0.09942857142857142,
"grad_norm": 0.027268240228295326,
"kl": 0.0031093358993530273,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0161,
"reward": 0.03507482446730137,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1314193462021649,
"rewards/cosine_scaled_reward": -0.12755973311141133,
"rewards/format_reward": 0.45833334885537624,
"step": 87
},
{
"advantage_max": 0.27509157080203295,
"advantage_mean": -3.570070003511283e-09,
"advantage_min": -0.19592854473739862,
"advantage_std": 0.1789478063583374,
"completion_length": 2636.250045776367,
"epoch": 0.10057142857142858,
"grad_norm": 0.03065653145313263,
"kl": 0.0032744407653808594,
"learning_rate": 9.8425742251254e-07,
"loss": 0.0206,
"reward": 0.08256556163541973,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17894781846553087,
"rewards/cosine_scaled_reward": 0.014251076150685549,
"rewards/format_reward": 0.4583333469927311,
"step": 88
},
{
"advantage_max": 0.16426204703748226,
"advantage_mean": -2.638747317873502e-09,
"advantage_min": -0.1488904170691967,
"advantage_std": 0.12655864632688463,
"completion_length": 3053.6041717529297,
"epoch": 0.10171428571428572,
"grad_norm": 0.02547341212630272,
"kl": 0.002205371856689453,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0104,
"reward": 0.027745387284085155,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.126558649353683,
"rewards/cosine_scaled_reward": -0.09617769811302423,
"rewards/format_reward": 0.3541666716337204,
"step": 89
},
{
"advantage_max": 0.10391614772379398,
"advantage_mean": -1.3193736519978572e-09,
"advantage_min": -0.10397443547844887,
"advantage_std": 0.07875827606767416,
"completion_length": 2370.375015258789,
"epoch": 0.10285714285714286,
"grad_norm": 0.015140805393457413,
"kl": 0.004084110260009766,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0056,
"reward": -0.006435986841097474,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.0787582783959806,
"rewards/cosine_scaled_reward": -0.2495388761162758,
"rewards/format_reward": 0.45833334140479565,
"step": 90
},
{
"advantage_max": 0.1725978935137391,
"advantage_mean": 3.02679846464482e-09,
"advantage_min": -0.14624899346381426,
"advantage_std": 0.13377743028104305,
"completion_length": 3174.0416870117188,
"epoch": 0.104,
"grad_norm": 0.0243070051074028,
"kl": 0.0019826889038085938,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0033,
"reward": 0.021438519936054945,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13377743400633335,
"rewards/cosine_scaled_reward": -0.09193318895995617,
"rewards/format_reward": 0.31250000558793545,
"step": 91
},
{
"advantage_max": 0.14272566698491573,
"advantage_mean": -1.7850349948167477e-09,
"advantage_min": -0.11759363766759634,
"advantage_std": 0.10569121921434999,
"completion_length": 2625.6667098999023,
"epoch": 0.10514285714285715,
"grad_norm": 0.0243788193911314,
"kl": 0.0036519765853881836,
"learning_rate": 9.807937738894303e-07,
"loss": -0.0039,
"reward": 0.034875532728619874,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10569121642038226,
"rewards/cosine_scaled_reward": -0.15802248800173402,
"rewards/format_reward": 0.5208333376795053,
"step": 92
},
{
"advantage_max": 0.10833407612517476,
"advantage_mean": 1.3193736728145389e-09,
"advantage_min": -0.09252861887216568,
"advantage_std": 0.07854902278631926,
"completion_length": 3483.5208740234375,
"epoch": 0.10628571428571429,
"grad_norm": 0.013983568176627159,
"kl": 0.0025484561920166016,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0054,
"reward": -0.07466757856309414,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07854902278631926,
"rewards/cosine_scaled_reward": -0.2516121231019497,
"rewards/format_reward": 0.06250000186264515,
"step": 93
},
{
"advantage_max": 0.16513420641422272,
"advantage_mean": 9.313226023710541e-10,
"advantage_min": -0.10861359536647797,
"advantage_std": 0.10586209408938885,
"completion_length": 3181.625030517578,
"epoch": 0.10742857142857143,
"grad_norm": 0.019842853769659996,
"kl": 0.0038785934448242188,
"learning_rate": 9.78935800506826e-07,
"loss": 0.0048,
"reward": 0.023791223531588912,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10586209432221949,
"rewards/cosine_scaled_reward": -0.043594514252617955,
"rewards/format_reward": 0.22916666977107525,
"step": 94
},
{
"advantage_max": 0.15752420481294394,
"advantage_mean": 1.3969838966176873e-09,
"advantage_min": -0.10875887889415026,
"advantage_std": 0.09435309539549053,
"completion_length": 3359.4583740234375,
"epoch": 0.10857142857142857,
"grad_norm": 0.012666534632444382,
"kl": 0.0016287565231323242,
"learning_rate": 9.779754323328192e-07,
"loss": 0.004,
"reward": -0.032025402411818504,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09435309586115181,
"rewards/cosine_scaled_reward": -0.18826766480924562,
"rewards/format_reward": 0.18750000186264515,
"step": 95
},
{
"advantage_max": 0.16466173576191068,
"advantage_mean": 5.122274243651859e-09,
"advantage_min": -0.14734835969284177,
"advantage_std": 0.11930368887260556,
"completion_length": 2774.3333740234375,
"epoch": 0.10971428571428571,
"grad_norm": 0.020811520516872406,
"kl": 0.004055976867675781,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0012,
"reward": 0.07045774557627738,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11930369678884745,
"rewards/cosine_scaled_reward": -0.010209089145064354,
"rewards/format_reward": 0.43750000186264515,
"step": 96
},
{
"advantage_max": 0.20686533208936453,
"advantage_mean": 5.551115123125783e-17,
"advantage_min": -0.1880991030484438,
"advantage_std": 0.1569270808249712,
"completion_length": 3132.5834350585938,
"epoch": 0.11085714285714286,
"grad_norm": 0.023457398638129234,
"kl": 0.0023653507232666016,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0108,
"reward": 0.06218179999268614,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15692708734422922,
"rewards/cosine_scaled_reward": -0.03378633502870798,
"rewards/format_reward": 0.43750001303851604,
"step": 97
},
{
"advantage_max": 0.14833640353754163,
"advantage_mean": 3.1044086745701804e-10,
"advantage_min": -0.12552594719454646,
"advantage_std": 0.10549607453867793,
"completion_length": 2822.7708587646484,
"epoch": 0.112,
"grad_norm": 0.019470063969492912,
"kl": 0.0015528202056884766,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0103,
"reward": 0.03668228443711996,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10549607826396823,
"rewards/cosine_scaled_reward": -0.12178766075521708,
"rewards/format_reward": 0.45833334140479565,
"step": 98
},
{
"advantage_max": 0.11012635566294193,
"advantage_mean": 4.1909516412808046e-09,
"advantage_min": -0.11911307182163,
"advantage_std": 0.09590944508090615,
"completion_length": 2851.645835876465,
"epoch": 0.11314285714285714,
"grad_norm": 0.018691029399633408,
"kl": 0.00240325927734375,
"learning_rate": 9.739258537542835e-07,
"loss": -0.0003,
"reward": 0.009538065176457167,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09590945346280932,
"rewards/cosine_scaled_reward": -0.1280885050073266,
"rewards/format_reward": 0.31250000186264515,
"step": 99
},
{
"advantage_max": 0.13672667369246483,
"advantage_mean": -2.638747401140229e-09,
"advantage_min": -0.13193287048488855,
"advantage_std": 0.10591222485527396,
"completion_length": 2591.270866394043,
"epoch": 0.11428571428571428,
"grad_norm": 0.018864328041672707,
"kl": 0.0028095245361328125,
"learning_rate": 9.728616793536587e-07,
"loss": -0.0009,
"reward": 0.10221519600600004,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1059122271835804,
"rewards/cosine_scaled_reward": 0.042956192046403885,
"rewards/format_reward": 0.520833333954215,
"step": 100
},
{
"advantage_max": 0.093271154910326,
"advantage_mean": -2.7755575615628914e-17,
"advantage_min": -0.09792666789144278,
"advantage_std": 0.08075838536024094,
"completion_length": 2602.7916870117188,
"epoch": 0.11542857142857142,
"grad_norm": 0.016473442316055298,
"kl": 0.0017747879028320312,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0086,
"reward": 0.05790668725967407,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08075838536024094,
"rewards/cosine_scaled_reward": -0.05933393910527229,
"rewards/format_reward": 0.45833333395421505,
"step": 101
},
{
"advantage_max": 0.24052445031702518,
"advantage_mean": -1.2417634698280722e-09,
"advantage_min": -0.19093013741075993,
"advantage_std": 0.16782204061746597,
"completion_length": 2524.7708892822266,
"epoch": 0.11657142857142858,
"grad_norm": 0.035958126187324524,
"kl": 0.004951953887939453,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0201,
"reward": 0.056061833864077926,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1678220392204821,
"rewards/cosine_scaled_reward": -0.10558513924479485,
"rewards/format_reward": 0.5416666772216558,
"step": 102
},
{
"advantage_max": 0.185080180875957,
"advantage_mean": -3.414849569782774e-09,
"advantage_min": -0.14461032394319773,
"advantage_std": 0.13952101161703467,
"completion_length": 2854.5416984558105,
"epoch": 0.11771428571428572,
"grad_norm": 0.01984941028058529,
"kl": 0.0026073455810546875,
"learning_rate": 9.695457105469804e-07,
"loss": 0.012,
"reward": 0.025294456630945206,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.13952100928872824,
"rewards/cosine_scaled_reward": -0.11525619064923376,
"rewards/format_reward": 0.37500000558793545,
"step": 103
},
{
"advantage_max": 0.08083041338250041,
"advantage_mean": -2.716357590248908e-09,
"advantage_min": -0.10733875446021557,
"advantage_std": 0.07301557017490268,
"completion_length": 2621.395851135254,
"epoch": 0.11885714285714286,
"grad_norm": 0.11099490523338318,
"kl": 0.0036411285400390625,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0006,
"reward": 0.03443576395511627,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.07301557110622525,
"rewards/cosine_scaled_reward": -0.09871989954262972,
"rewards/format_reward": 0.39583333395421505,
"step": 104
},
{
"advantage_max": 0.2338110376149416,
"advantage_mean": -6.51925827210853e-09,
"advantage_min": -0.2026761043816805,
"advantage_std": 0.18882041098549962,
"completion_length": 2379.8958587646484,
"epoch": 0.12,
"grad_norm": 0.02698267251253128,
"kl": 0.0016281604766845703,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0194,
"reward": 0.11758742481470108,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.18882041005417705,
"rewards/cosine_scaled_reward": 0.06502276286482811,
"rewards/format_reward": 0.5625000074505806,
"step": 105
},
{
"advantage_max": 0.11389932315796614,
"advantage_mean": -4.811833403950416e-09,
"advantage_min": -0.15608873032033443,
"advantage_std": 0.11125539761269465,
"completion_length": 2167.5833702087402,
"epoch": 0.12114285714285715,
"grad_norm": 0.020373547449707985,
"kl": 0.0024791955947875977,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0012,
"reward": 0.20540974102914333,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11125540168723091,
"rewards/cosine_scaled_reward": 0.22056686785072088,
"rewards/format_reward": 0.7708333432674408,
"step": 106
},
{
"advantage_max": 0.10643723327666521,
"advantage_mean": 3.8805109126016646e-10,
"advantage_min": -0.11985098151490092,
"advantage_std": 0.09828559448942542,
"completion_length": 2751.604179382324,
"epoch": 0.12228571428571429,
"grad_norm": 0.018458819016814232,
"kl": 0.0025892257690429688,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0116,
"reward": 0.025896158069372177,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09828559728339314,
"rewards/cosine_scaled_reward": -0.16178596578538418,
"rewards/format_reward": 0.4791666753590107,
"step": 107
},
{
"advantage_max": 0.16523229470476508,
"advantage_mean": -1.2417635114614356e-09,
"advantage_min": -0.11955348215997219,
"advantage_std": 0.11437350790947676,
"completion_length": 2345.145866394043,
"epoch": 0.12342857142857143,
"grad_norm": 0.015816396102309227,
"kl": 0.0018943548202514648,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0099,
"reward": 0.08748990359163145,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11437350790947676,
"rewards/cosine_scaled_reward": -0.044466909021139145,
"rewards/format_reward": 0.6041666772216558,
"step": 108
},
{
"advantage_max": 0.10261640883982182,
"advantage_mean": -3.0656037708354233e-09,
"advantage_min": -0.07673678267747164,
"advantage_std": 0.07046977989375591,
"completion_length": 2884.791702270508,
"epoch": 0.12457142857142857,
"grad_norm": 0.012454885058104992,
"kl": 0.0016399621963500977,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0055,
"reward": 0.007819185324478894,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.07046978268772364,
"rewards/cosine_scaled_reward": -0.15225903131067753,
"rewards/format_reward": 0.3541666679084301,
"step": 109
},
{
"advantage_max": 0.19972685351967812,
"advantage_mean": -1.474594168993093e-09,
"advantage_min": -0.1611551959067583,
"advantage_std": 0.1318470723927021,
"completion_length": 2799.854263305664,
"epoch": 0.12571428571428572,
"grad_norm": 0.022104663774371147,
"kl": 0.0022640228271484375,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0006,
"reward": 0.06127751222811639,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.13184707635082304,
"rewards/cosine_scaled_reward": -0.07033403514651582,
"rewards/format_reward": 0.5000000074505806,
"step": 110
},
{
"advantage_max": 0.1784421824850142,
"advantage_mean": 8.537123924456935e-10,
"advantage_min": -0.1500476342625916,
"advantage_std": 0.1379630877636373,
"completion_length": 3026.3334045410156,
"epoch": 0.12685714285714286,
"grad_norm": 0.023240555077791214,
"kl": 0.004309654235839844,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0047,
"reward": 0.0340226587431971,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13796309987083077,
"rewards/cosine_scaled_reward": -0.08653214666992426,
"rewards/format_reward": 0.375,
"step": 111
},
{
"advantage_max": 0.2080360697582364,
"advantage_mean": 2.5611371773370806e-09,
"advantage_min": -0.1433120183646679,
"advantage_std": 0.1446642866358161,
"completion_length": 2837.750030517578,
"epoch": 0.128,
"grad_norm": 0.019071072340011597,
"kl": 0.0014045238494873047,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0012,
"reward": 0.08503166912123561,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1446642787195742,
"rewards/cosine_scaled_reward": 0.011173800681717694,
"rewards/format_reward": 0.47916666977107525,
"step": 112
},
{
"advantage_max": 0.19163453578948975,
"advantage_mean": -2.1730860721991263e-09,
"advantage_min": -0.1518814405426383,
"advantage_std": 0.14186442783102393,
"completion_length": 2613.0625762939453,
"epoch": 0.12914285714285714,
"grad_norm": 0.0337207056581974,
"kl": 0.0030469894409179688,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0203,
"reward": 0.04671849589794874,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.14186443528160453,
"rewards/cosine_scaled_reward": -0.1140240803360939,
"rewards/format_reward": 0.5000000111758709,
"step": 113
},
{
"advantage_max": 0.13480149395763874,
"advantage_mean": -3.4924596686858322e-09,
"advantage_min": -0.09827340161427855,
"advantage_std": 0.08871775027364492,
"completion_length": 2424.5208892822266,
"epoch": 0.13028571428571428,
"grad_norm": 0.013029958121478558,
"kl": 0.0030794143676757812,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0055,
"reward": 0.034493221901357174,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08871775213629007,
"rewards/cosine_scaled_reward": -0.22131562419235706,
"rewards/format_reward": 0.6458333395421505,
"step": 114
},
{
"advantage_max": 0.10426800954155624,
"advantage_mean": 1.9014502611325312e-09,
"advantage_min": -0.1240433705970645,
"advantage_std": 0.08707842836156487,
"completion_length": 2850.9583587646484,
"epoch": 0.13142857142857142,
"grad_norm": 0.011764142662286758,
"kl": 0.003220081329345703,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0007,
"reward": 0.02795394801069051,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08707842929288745,
"rewards/cosine_scaled_reward": -0.08467382844537497,
"rewards/format_reward": 0.3333333358168602,
"step": 115
},
{
"advantage_max": 0.1957715330645442,
"advantage_mean": -7.761021686425451e-10,
"advantage_min": -0.10577770043164492,
"advantage_std": 0.11993355210870504,
"completion_length": 3311.604202270508,
"epoch": 0.13257142857142856,
"grad_norm": 0.02515524998307228,
"kl": 0.0029854774475097656,
"learning_rate": 9.530702921077358e-07,
"loss": -0.0008,
"reward": -0.030864793108776212,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11993355257436633,
"rewards/cosine_scaled_reward": -0.17528742615832016,
"rewards/format_reward": 0.16666667349636555,
"step": 116
},
{
"advantage_max": 0.18044267036020756,
"advantage_mean": 3.88051125954636e-10,
"advantage_min": -0.11745557747781277,
"advantage_std": 0.10928188590332866,
"completion_length": 2841.6041717529297,
"epoch": 0.1337142857142857,
"grad_norm": 0.019817935302853584,
"kl": 0.0042095184326171875,
"learning_rate": 9.516636183034564e-07,
"loss": 0.003,
"reward": 0.01119667274178937,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10928189102560282,
"rewards/cosine_scaled_reward": -0.17523309215903282,
"rewards/format_reward": 0.416666679084301,
"step": 117
},
{
"advantage_max": 0.21986545948311687,
"advantage_mean": -4.268561920595104e-09,
"advantage_min": -0.14438048377633095,
"advantage_std": 0.14283119468018413,
"completion_length": 2871.166732788086,
"epoch": 0.13485714285714287,
"grad_norm": 0.024016261100769043,
"kl": 0.0021152496337890625,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0001,
"reward": 0.15214395127259195,
"reward_advantage_correlation": 1.0,
"reward_std": 0.142831196077168,
"rewards/cosine_scaled_reward": 0.16847316874191165,
"rewards/format_reward": 0.5625000037252903,
"step": 118
},
{
"advantage_max": 0.15783230029046535,
"advantage_mean": -5.1222742575296465e-09,
"advantage_min": -0.14731642603874207,
"advantage_std": 0.12371453363448381,
"completion_length": 2522.604202270508,
"epoch": 0.136,
"grad_norm": 0.01609306037425995,
"kl": 0.004221200942993164,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0129,
"reward": 0.09867640398442745,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12371453177183867,
"rewards/cosine_scaled_reward": -0.0007585976272821426,
"rewards/format_reward": 0.5833333395421505,
"step": 119
},
{
"advantage_max": 0.20129030477255583,
"advantage_mean": -8.537124063234813e-10,
"advantage_min": -0.12198320962488651,
"advantage_std": 0.12838405929505825,
"completion_length": 2559.8958740234375,
"epoch": 0.13714285714285715,
"grad_norm": 0.04911419749259949,
"kl": 0.004084587097167969,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0137,
"reward": 0.06981711252592504,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12838406348600984,
"rewards/cosine_scaled_reward": -0.044595762854442,
"rewards/format_reward": 0.5000000074505806,
"step": 120
},
{
"advantage_max": 0.1426694355905056,
"advantage_mean": -6.907309335613121e-09,
"advantage_min": -0.16301002446562052,
"advantage_std": 0.12418079562485218,
"completion_length": 1872.1042251586914,
"epoch": 0.1382857142857143,
"grad_norm": 0.040391478687524796,
"kl": 0.0055408477783203125,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0151,
"reward": 0.12901237746700644,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12418079702183604,
"rewards/cosine_scaled_reward": 0.0047687627375125885,
"rewards/format_reward": 0.7500000074505806,
"step": 121
},
{
"advantage_max": 0.17488223453983665,
"advantage_mean": -4.113341472988807e-09,
"advantage_min": -0.20183194149285555,
"advantage_std": 0.1446224031969905,
"completion_length": 2870.270896911621,
"epoch": 0.13942857142857143,
"grad_norm": 0.043231695890426636,
"kl": 0.0026760101318359375,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0071,
"reward": 0.07613519253209233,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14462240180000663,
"rewards/cosine_scaled_reward": 0.00641607865691185,
"rewards/format_reward": 0.4375000111758709,
"step": 122
},
{
"advantage_max": 0.1747486158274114,
"advantage_mean": 8.537123646901179e-10,
"advantage_min": -0.15847238339483738,
"advantage_std": 0.13815004844218493,
"completion_length": 2630.8959197998047,
"epoch": 0.14057142857142857,
"grad_norm": 0.026115527376532555,
"kl": 0.002586841583251953,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0165,
"reward": 0.04064048221334815,
"reward_advantage_correlation": 0.9999999999999997,
"reward_std": 0.13815005496144295,
"rewards/cosine_scaled_reward": -0.15376391634345055,
"rewards/format_reward": 0.5416666828095913,
"step": 123
},
{
"advantage_max": 0.2104870369657874,
"advantage_mean": -1.4745941412375174e-09,
"advantage_min": -0.16038642171770334,
"advantage_std": 0.14703476894646883,
"completion_length": 2258.8125228881836,
"epoch": 0.1417142857142857,
"grad_norm": 0.023247675970196724,
"kl": 0.005304813385009766,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0087,
"reward": 0.061627675080671906,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14703477453440428,
"rewards/cosine_scaled_reward": -0.1097550387494266,
"rewards/format_reward": 0.5833333414047956,
"step": 124
},
{
"advantage_max": 0.16207702737301588,
"advantage_mean": -1.7074247571358114e-09,
"advantage_min": -0.12357809208333492,
"advantage_std": 0.11802704073488712,
"completion_length": 2823.6875228881836,
"epoch": 0.14285714285714285,
"grad_norm": 0.01639670506119728,
"kl": 0.0029687881469726562,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0048,
"reward": 0.0728115017991513,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11802704306319356,
"rewards/cosine_scaled_reward": 0.035107508301734924,
"rewards/format_reward": 0.3541666679084301,
"step": 125
},
{
"advantage_max": 0.13201149785891175,
"advantage_mean": -2.9491882547194592e-09,
"advantage_min": -0.13687030225992203,
"advantage_std": 0.1064839765895158,
"completion_length": 2888.375030517578,
"epoch": 0.144,
"grad_norm": 0.021405896171927452,
"kl": 0.002410888671875,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0082,
"reward": 0.029038145439699292,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10648397775366902,
"rewards/cosine_scaled_reward": -0.09103115275502205,
"rewards/format_reward": 0.35416668094694614,
"step": 126
},
{
"advantage_max": 0.18761903140693903,
"advantage_mean": 1.0865429944661997e-09,
"advantage_min": -0.09865566249936819,
"advantage_std": 0.11243651760742068,
"completion_length": 3327.7084045410156,
"epoch": 0.14514285714285713,
"grad_norm": 0.01991112157702446,
"kl": 0.005802154541015625,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0053,
"reward": -0.03913277422543615,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11243652272969484,
"rewards/cosine_scaled_reward": -0.21913862321525812,
"rewards/format_reward": 0.20833334140479565,
"step": 127
},
{
"advantage_max": 0.14964813645929098,
"advantage_mean": -5.743156020199258e-09,
"advantage_min": -0.2074049087241292,
"advantage_std": 0.1491677723824978,
"completion_length": 2978.166732788086,
"epoch": 0.1462857142857143,
"grad_norm": 0.02437690831720829,
"kl": 0.004235744476318359,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0069,
"reward": 0.08901105728000402,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14916778123006225,
"rewards/cosine_scaled_reward": 0.07575214840471745,
"rewards/format_reward": 0.3750000074505806,
"step": 128
},
{
"advantage_max": 0.16403791401535273,
"advantage_mean": 6.208817210362483e-10,
"advantage_min": -0.1340857520699501,
"advantage_std": 0.12814921559765935,
"completion_length": 3485.479217529297,
"epoch": 0.14742857142857144,
"grad_norm": 0.024071309715509415,
"kl": 0.004917621612548828,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0058,
"reward": -0.0004819065798074007,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12814922630786896,
"rewards/cosine_scaled_reward": -0.09490467235445976,
"rewards/format_reward": 0.18750000186264515,
"step": 129
},
{
"advantage_max": 0.1427209312096238,
"advantage_mean": 3.104408632936817e-09,
"advantage_min": -0.08900717180222273,
"advantage_std": 0.08698507398366928,
"completion_length": 3171.6666717529297,
"epoch": 0.14857142857142858,
"grad_norm": 0.014160319231450558,
"kl": 0.0046808719635009766,
"learning_rate": 9.316216432703916e-07,
"loss": 0.002,
"reward": -0.03297149168793112,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08698507212102413,
"rewards/cosine_scaled_reward": -0.19968184363096952,
"rewards/format_reward": 0.2083333358168602,
"step": 130
},
{
"advantage_max": 0.13422834686934948,
"advantage_mean": -1.1641533154138628e-09,
"advantage_min": -0.18032845202833414,
"advantage_std": 0.12928894069045782,
"completion_length": 2918.6875610351562,
"epoch": 0.14971428571428572,
"grad_norm": 0.0223796758800745,
"kl": 0.0055751800537109375,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0089,
"reward": 0.07486723270267248,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12928894069045782,
"rewards/cosine_scaled_reward": 0.023864680901169777,
"rewards/format_reward": 0.39583334513008595,
"step": 131
},
{
"advantage_max": 0.16987570468336344,
"advantage_mean": -3.88051125954636e-10,
"advantage_min": -0.17349545564502478,
"advantage_std": 0.1324772317893803,
"completion_length": 2984.437515258789,
"epoch": 0.15085714285714286,
"grad_norm": 0.019351573660969734,
"kl": 0.004413604736328125,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0084,
"reward": 0.05797156970947981,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13247722946107388,
"rewards/cosine_scaled_reward": -0.00954313576221466,
"rewards/format_reward": 0.35416667722165585,
"step": 132
},
{
"advantage_max": 0.1357247936539352,
"advantage_mean": 1.862645218619896e-09,
"advantage_min": -0.08459451515227556,
"advantage_std": 0.08351372461766005,
"completion_length": 3332.4583740234375,
"epoch": 0.152,
"grad_norm": 0.014467400498688221,
"kl": 0.005523681640625,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0031,
"reward": -0.05285291757900268,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08351372368633747,
"rewards/cosine_scaled_reward": -0.2597227357327938,
"rewards/format_reward": 0.20833333395421505,
"step": 133
},
{
"advantage_max": 0.2571664294227958,
"advantage_mean": -5.898376592705645e-09,
"advantage_min": -0.15844144020229578,
"advantage_std": 0.16976315109059215,
"completion_length": 2875.3750610351562,
"epoch": 0.15314285714285714,
"grad_norm": 0.028712719678878784,
"kl": 0.007971763610839844,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0054,
"reward": 0.10160159273073077,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16976316180080175,
"rewards/cosine_scaled_reward": 0.0412679030559957,
"rewards/format_reward": 0.5208333432674408,
"step": 134
},
{
"advantage_max": 0.17440959997475147,
"advantage_mean": -1.3969840007010959e-09,
"advantage_min": -0.175615637563169,
"advantage_std": 0.14522844285238534,
"completion_length": 2164.6458740234375,
"epoch": 0.15428571428571428,
"grad_norm": 0.026748182252049446,
"kl": 0.004391670227050781,
"learning_rate": 9.230669076497687e-07,
"loss": -0.0007,
"reward": 0.1501585068181157,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14522844285238534,
"rewards/cosine_scaled_reward": 0.1208992125466466,
"rewards/format_reward": 0.6458333432674408,
"step": 135
},
{
"advantage_max": 0.18647493747994304,
"advantage_mean": -3.570070156166949e-09,
"advantage_min": -0.23410069476813078,
"advantage_std": 0.17950192606076598,
"completion_length": 3038.9583435058594,
"epoch": 0.15542857142857142,
"grad_norm": 0.0378388985991478,
"kl": 0.006519317626953125,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0237,
"reward": 0.0896531674079597,
"reward_advantage_correlation": 1.0,
"reward_std": 0.179501932580024,
"rewards/cosine_scaled_reward": 0.07765412889420986,
"rewards/format_reward": 0.3750000111758709,
"step": 136
},
{
"advantage_max": 0.15427146770525724,
"advantage_mean": 2.1730860513824446e-09,
"advantage_min": -0.13093871576711535,
"advantage_std": 0.11751817003823817,
"completion_length": 3286.7708587646484,
"epoch": 0.15657142857142858,
"grad_norm": 0.02052266336977482,
"kl": 0.005681037902832031,
"learning_rate": 9.195171441101668e-07,
"loss": 0.0092,
"reward": -0.00857232604175806,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11751817702315748,
"rewards/cosine_scaled_reward": -0.1289244736544788,
"rewards/format_reward": 0.20833333767950535,
"step": 137
},
{
"advantage_max": 0.1545063005760312,
"advantage_mean": 4.190951738425319e-09,
"advantage_min": -0.09753599436953664,
"advantage_std": 0.10371883399784565,
"completion_length": 2524.375045776367,
"epoch": 0.15771428571428572,
"grad_norm": 0.013675352558493614,
"kl": 0.004207611083984375,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0049,
"reward": 0.07395011962216813,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1037188358604908,
"rewards/cosine_scaled_reward": -0.064213783480227,
"rewards/format_reward": 0.562500013038516,
"step": 138
},
{
"advantage_max": 0.14760101400315762,
"advantage_mean": -1.241763476766966e-09,
"advantage_min": -0.16335713909938931,
"advantage_std": 0.13432236923836172,
"completion_length": 3093.229232788086,
"epoch": 0.15885714285714286,
"grad_norm": 0.026457417756319046,
"kl": 0.005748748779296875,
"learning_rate": 9.158953424711624e-07,
"loss": 0.009,
"reward": 0.0407895278185606,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13432237412780523,
"rewards/cosine_scaled_reward": -0.06742438767105341,
"rewards/format_reward": 0.3750000149011612,
"step": 139
},
{
"advantage_max": 0.09711257927119732,
"advantage_mean": -6.984918754504577e-10,
"advantage_min": -0.08666328061372042,
"advantage_std": 0.07689650449901819,
"completion_length": 3201.7083740234375,
"epoch": 0.16,
"grad_norm": 0.021007120609283447,
"kl": 0.010286331176757812,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0091,
"reward": -0.013685423880815506,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07689650217071176,
"rewards/cosine_scaled_reward": -0.1452522613108158,
"rewards/format_reward": 0.20833334140479565,
"step": 140
},
{
"advantage_max": 0.2189687853679061,
"advantage_mean": -2.0954758206404023e-09,
"advantage_min": -0.13264015363529325,
"advantage_std": 0.14607419818639755,
"completion_length": 2922.229248046875,
"epoch": 0.16114285714285714,
"grad_norm": 0.02560114860534668,
"kl": 0.0079193115234375,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0077,
"reward": 0.030265355249866843,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14607420284301043,
"rewards/cosine_scaled_reward": -0.1499252589419484,
"rewards/format_reward": 0.47916667349636555,
"step": 141
},
{
"advantage_max": 0.18417442869395018,
"advantage_mean": -3.570070017389071e-09,
"advantage_min": -0.16223066858947277,
"advantage_std": 0.1339187677949667,
"completion_length": 3167.104248046875,
"epoch": 0.16228571428571428,
"grad_norm": 0.020130103453993797,
"kl": 0.0071773529052734375,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0164,
"reward": 0.06062847562134266,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13391877012327313,
"rewards/cosine_scaled_reward": -0.03922642639372498,
"rewards/format_reward": 0.43750001676380634,
"step": 142
},
{
"advantage_max": 0.12369767762720585,
"advantage_mean": -2.7163574167765603e-10,
"advantage_min": -0.10968521423637867,
"advantage_std": 0.09598812274634838,
"completion_length": 2993.0833435058594,
"epoch": 0.16342857142857142,
"grad_norm": 0.02057984471321106,
"kl": 0.007022857666015625,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0107,
"reward": -0.01793865323998034,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09598812321200967,
"rewards/cosine_scaled_reward": -0.2098203683272004,
"rewards/format_reward": 0.31250000558793545,
"step": 143
},
{
"advantage_max": 0.24469326250255108,
"advantage_mean": 1.8626451353531692e-09,
"advantage_min": -0.16228821780532598,
"advantage_std": 0.1688228864222765,
"completion_length": 3053.1458740234375,
"epoch": 0.16457142857142856,
"grad_norm": 0.028377985581755638,
"kl": 0.006343841552734375,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0125,
"reward": 0.02497765606676694,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16882288735359907,
"rewards/cosine_scaled_reward": -0.08332122955471277,
"rewards/format_reward": 0.31250000558793545,
"step": 144
},
{
"advantage_max": 0.11938331183046103,
"advantage_mean": -3.182019189806873e-09,
"advantage_min": -0.10436953045427799,
"advantage_std": 0.0860091031063348,
"completion_length": 2103.9375534057617,
"epoch": 0.1657142857142857,
"grad_norm": 0.015253880061209202,
"kl": 0.004912376403808594,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0093,
"reward": 0.13016253290697932,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08600910496897995,
"rewards/cosine_scaled_reward": 0.08105293428525329,
"rewards/format_reward": 0.6041666772216558,
"step": 145
},
{
"advantage_max": 0.19676159508526325,
"advantage_mean": -2.0954758622737657e-09,
"advantage_min": -0.14002857124432921,
"advantage_std": 0.13696159655228257,
"completion_length": 2605.979202270508,
"epoch": 0.16685714285714287,
"grad_norm": 0.027135798707604408,
"kl": 0.004586696624755859,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0149,
"reward": 0.050586492056027055,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13696160027757287,
"rewards/cosine_scaled_reward": -0.14402856817469,
"rewards/format_reward": 0.5833333395421505,
"step": 146
},
{
"advantage_max": 0.16726404940709472,
"advantage_mean": -4.5790028366243796e-09,
"advantage_min": -0.183818063698709,
"advantage_std": 0.1481722998432815,
"completion_length": 3017.0416870117188,
"epoch": 0.168,
"grad_norm": 0.032472483813762665,
"kl": 0.0078277587890625,
"learning_rate": 9.007020842191634e-07,
"loss": 0.015,
"reward": 0.05535583617165685,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14817229937762022,
"rewards/cosine_scaled_reward": -0.0040863314643502235,
"rewards/format_reward": 0.3333333432674408,
"step": 147
},
{
"advantage_max": 0.1869575590826571,
"advantage_mean": -1.5522044760629683e-10,
"advantage_min": -0.16520546469837427,
"advantage_std": 0.1403520731255412,
"completion_length": 2839.854202270508,
"epoch": 0.16914285714285715,
"grad_norm": 0.0203824695199728,
"kl": 0.007618904113769531,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0064,
"reward": 0.06426224764436483,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14035206753760576,
"rewards/cosine_scaled_reward": -0.050383010879158974,
"rewards/format_reward": 0.47916667349636555,
"step": 148
},
{
"advantage_max": 0.14235155889764428,
"advantage_mean": -9.119200672369487e-10,
"advantage_min": -0.17431307956576347,
"advantage_std": 0.12154442211613059,
"completion_length": 2893.604217529297,
"epoch": 0.1702857142857143,
"grad_norm": 0.017824489623308182,
"kl": 0.004513263702392578,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0019,
"reward": 0.08055121125653386,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12154442025348544,
"rewards/cosine_scaled_reward": -0.03182307630777359,
"rewards/format_reward": 0.5416666734963655,
"step": 149
},
{
"advantage_max": 0.19116142578423023,
"advantage_mean": -1.9402553730341054e-09,
"advantage_min": -0.15318272728472948,
"advantage_std": 0.14041123539209366,
"completion_length": 2982.8958740234375,
"epoch": 0.17142857142857143,
"grad_norm": 0.024410845711827278,
"kl": 0.008457183837890625,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0147,
"reward": 0.037670310121029615,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1404112372547388,
"rewards/cosine_scaled_reward": -0.0773946214467287,
"rewards/format_reward": 0.3750000037252903,
"step": 150
},
{
"advantage_max": 0.13646899722516537,
"advantage_mean": -3.6476802203755376e-09,
"advantage_min": -0.177975757047534,
"advantage_std": 0.12031815620139241,
"completion_length": 2622.729217529297,
"epoch": 0.17257142857142857,
"grad_norm": 0.03814442828297615,
"kl": 0.007709503173828125,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0096,
"reward": 0.09881546010728925,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.120318160392344,
"rewards/cosine_scaled_reward": 0.010256066918373108,
"rewards/format_reward": 0.5625000149011612,
"step": 151
},
{
"advantage_max": 0.17624704539775848,
"advantage_mean": 3.8805107738237865e-10,
"advantage_min": -0.11608144547790289,
"advantage_std": 0.1141918571665883,
"completion_length": 2824.2291984558105,
"epoch": 0.1737142857142857,
"grad_norm": 0.022418169304728508,
"kl": 0.006558418273925781,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0045,
"reward": 0.01563113136216998,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11419186135753989,
"rewards/cosine_scaled_reward": -0.15195966488681734,
"rewards/format_reward": 0.39583333395421505,
"step": 152
},
{
"advantage_max": 0.1550552500411868,
"advantage_mean": 1.7850349809389598e-09,
"advantage_min": -0.16724903974682093,
"advantage_std": 0.11843094322830439,
"completion_length": 3011.7916870117188,
"epoch": 0.17485714285714285,
"grad_norm": 0.02170824445784092,
"kl": 0.012271881103515625,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0105,
"reward": 0.0336550869178609,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11843094741925597,
"rewards/cosine_scaled_reward": -0.0872773714363575,
"rewards/format_reward": 0.37500001303851604,
"step": 153
},
{
"advantage_max": 0.2223479701206088,
"advantage_mean": 3.1044086745701804e-09,
"advantage_min": -0.1979465465992689,
"advantage_std": 0.171802272554487,
"completion_length": 3407.437530517578,
"epoch": 0.176,
"grad_norm": 0.02847815677523613,
"kl": 0.005352020263671875,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0077,
"reward": 0.047635506991355214,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17180227162316442,
"rewards/cosine_scaled_reward": -0.01420294726267457,
"rewards/format_reward": 0.3125000074505806,
"step": 154
},
{
"advantage_max": 0.15556193236261606,
"advantage_mean": -3.880511745268933e-10,
"advantage_min": -0.20040578301995993,
"advantage_std": 0.1497023869305849,
"completion_length": 2653.562545776367,
"epoch": 0.17714285714285713,
"grad_norm": 0.03856699913740158,
"kl": 0.005893707275390625,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0166,
"reward": 0.08896010369062424,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.14970239624381065,
"rewards/cosine_scaled_reward": 0.03297489322721958,
"rewards/format_reward": 0.45833334513008595,
"step": 155
},
{
"advantage_max": 0.18007311457768083,
"advantage_mean": -1.241763414316921e-09,
"advantage_min": -0.14902439527213573,
"advantage_std": 0.13531371485441923,
"completion_length": 3162.687515258789,
"epoch": 0.1782857142857143,
"grad_norm": 0.017985651269555092,
"kl": 0.0062007904052734375,
"learning_rate": 8.823049032816478e-07,
"loss": 0.0054,
"reward": 0.019509871723130345,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13531372044235468,
"rewards/cosine_scaled_reward": -0.11034888960421085,
"rewards/format_reward": 0.3333333469927311,
"step": 156
},
{
"advantage_max": 0.13918431987985969,
"advantage_mean": -4.163336342344337e-17,
"advantage_min": -0.11121347360312939,
"advantage_std": 0.10173497628420591,
"completion_length": 3224.166717529297,
"epoch": 0.17942857142857144,
"grad_norm": 0.01812376268208027,
"kl": 0.00917816162109375,
"learning_rate": 8.801784390262943e-07,
"loss": 0.01,
"reward": -0.0007896313909441233,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10173497628420591,
"rewards/cosine_scaled_reward": -0.13752574939280748,
"rewards/format_reward": 0.2708333432674408,
"step": 157
},
{
"advantage_max": 0.257691353559494,
"advantage_mean": -4.035731332452386e-09,
"advantage_min": -0.17544407676905394,
"advantage_std": 0.17222883319482207,
"completion_length": 3024.125030517578,
"epoch": 0.18057142857142858,
"grad_norm": 0.027070507407188416,
"kl": 0.0061359405517578125,
"learning_rate": 8.780358823396352e-07,
"loss": 0.0161,
"reward": 0.16088842856697738,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17222883319482207,
"rewards/cosine_scaled_reward": 0.20306535623967648,
"rewards/format_reward": 0.541666679084301,
"step": 158
},
{
"advantage_max": 0.1392697487026453,
"advantage_mean": -1.3193737005701145e-09,
"advantage_min": -0.10665746498852968,
"advantage_std": 0.10162637522444129,
"completion_length": 3238.625030517578,
"epoch": 0.18171428571428572,
"grad_norm": 0.019616369158029556,
"kl": 0.009876251220703125,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0023,
"reward": -0.024257861077785492,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10162637708708644,
"rewards/cosine_scaled_reward": -0.20666235126554966,
"rewards/format_reward": 0.27083334140479565,
"step": 159
},
{
"advantage_max": 0.16948407562449574,
"advantage_mean": -3.5700700312668587e-09,
"advantage_min": -0.13234250340610743,
"advantage_std": 0.12069881893694401,
"completion_length": 2822.208381652832,
"epoch": 0.18285714285714286,
"grad_norm": 0.022409770637750626,
"kl": 0.011167526245117188,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0033,
"reward": 0.04982085805386305,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12069882079958916,
"rewards/cosine_scaled_reward": -0.04145237850025296,
"rewards/format_reward": 0.3750000111758709,
"step": 160
},
{
"advantage_max": 0.26078341249376535,
"advantage_mean": -4.190951766180895e-09,
"advantage_min": -0.20996790193021297,
"advantage_std": 0.18169856257736683,
"completion_length": 2801.5000610351562,
"epoch": 0.184,
"grad_norm": 0.03609248995780945,
"kl": 0.009645462036132812,
"learning_rate": 8.715127058347614e-07,
"loss": 0.0183,
"reward": 0.07784991223888937,
"reward_advantage_correlation": 1.0,
"reward_std": 0.18169856630265713,
"rewards/cosine_scaled_reward": -0.011576765216886997,
"rewards/format_reward": 0.47916668094694614,
"step": 161
},
{
"advantage_max": 0.14908618130721152,
"advantage_mean": -1.5522033658399437e-10,
"advantage_min": -0.13768692780286074,
"advantage_std": 0.12910312414169312,
"completion_length": 3373.8958435058594,
"epoch": 0.18514285714285714,
"grad_norm": 0.02396521158516407,
"kl": 0.012908935546875,
"learning_rate": 8.693068314414344e-07,
"loss": 0.0078,
"reward": -0.01704839337617159,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12910313392058015,
"rewards/cosine_scaled_reward": -0.1342112785205245,
"rewards/format_reward": 0.16666666977107525,
"step": 162
},
{
"advantage_max": 0.1578800524584949,
"advantage_mean": -3.4148495420271985e-09,
"advantage_min": -0.1815645396709442,
"advantage_std": 0.13664002949371934,
"completion_length": 2519.604232788086,
"epoch": 0.18628571428571428,
"grad_norm": 0.025771912187337875,
"kl": 0.009222030639648438,
"learning_rate": 8.670853944836176e-07,
"loss": 0.0139,
"reward": 0.15985322836786509,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13664002902805805,
"rewards/cosine_scaled_reward": 0.1906539173796773,
"rewards/format_reward": 0.562500013038516,
"step": 163
},
{
"advantage_max": 0.1497408151626587,
"advantage_mean": 3.8805109126016646e-10,
"advantage_min": -0.13764986861497164,
"advantage_std": 0.11453935131430626,
"completion_length": 2729.666732788086,
"epoch": 0.18742857142857142,
"grad_norm": 0.027371998876333237,
"kl": 0.008401870727539062,
"learning_rate": 8.648485032310144e-07,
"loss": 0.006,
"reward": 0.08568728528916836,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11453934758901596,
"rewards/cosine_scaled_reward": 0.013143057469278574,
"rewards/format_reward": 0.4791666679084301,
"step": 164
},
{
"advantage_max": 0.2020930303260684,
"advantage_mean": 4.113341500744383e-09,
"advantage_min": -0.13421366550028324,
"advantage_std": 0.1295091542415321,
"completion_length": 3323.375030517578,
"epoch": 0.18857142857142858,
"grad_norm": 0.02407933585345745,
"kl": 0.013275146484375,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0101,
"reward": 0.006851513287983835,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1295091570354998,
"rewards/cosine_scaled_reward": -0.1068816565675661,
"rewards/format_reward": 0.25000000186264515,
"step": 165
},
{
"advantage_max": 0.21237648325040936,
"advantage_mean": -1.629814540271557e-09,
"advantage_min": -0.15448179375380278,
"advantage_std": 0.1506841192021966,
"completion_length": 3147.666702270508,
"epoch": 0.18971428571428572,
"grad_norm": 0.02479720674455166,
"kl": 0.0067691802978515625,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0051,
"reward": 0.013205445604398847,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15068412572145462,
"rewards/cosine_scaled_reward": -0.1286666316445917,
"rewards/format_reward": 0.33333334140479565,
"step": 166
},
{
"advantage_max": 0.1873478158377111,
"advantage_mean": 4.6566125955216364e-10,
"advantage_min": -0.14719886984676123,
"advantage_std": 0.12714364705607295,
"completion_length": 2632.7083892822266,
"epoch": 0.19085714285714286,
"grad_norm": 0.019351521506905556,
"kl": 0.006518363952636719,
"learning_rate": 8.580461976679099e-07,
"loss": 0.0108,
"reward": 0.05125786177814007,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12714365031570196,
"rewards/cosine_scaled_reward": -0.13082036562263966,
"rewards/format_reward": 0.5625000111758709,
"step": 167
},
{
"advantage_max": 0.17462534084916115,
"advantage_mean": -4.346172061131526e-09,
"advantage_min": -0.1636114427819848,
"advantage_std": 0.1395610896870494,
"completion_length": 3228.1875610351562,
"epoch": 0.192,
"grad_norm": 0.02397916279733181,
"kl": 0.007595062255859375,
"learning_rate": 8.557485869176825e-07,
"loss": -0.003,
"reward": 0.05293558700941503,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13956109527498484,
"rewards/cosine_scaled_reward": -0.05299521051347256,
"rewards/format_reward": 0.4166666753590107,
"step": 168
},
{
"advantage_max": 0.14650389458984137,
"advantage_mean": -8.45951358269259e-09,
"advantage_min": -0.15014035161584616,
"advantage_std": 0.12220517976675183,
"completion_length": 2530.3542098999023,
"epoch": 0.19314285714285714,
"grad_norm": 0.02601473033428192,
"kl": 0.009768486022949219,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0108,
"reward": 0.22324875311460346,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12220518512185663,
"rewards/cosine_scaled_reward": 0.32412082329392433,
"rewards/format_reward": 0.6666666716337204,
"step": 169
},
{
"advantage_max": 0.11422509420663118,
"advantage_mean": 8.925175043472677e-10,
"advantage_min": -0.12703087367117405,
"advantage_std": 0.1002190806902945,
"completion_length": 2565.562515258789,
"epoch": 0.19428571428571428,
"grad_norm": 0.01757214218378067,
"kl": 0.0067424774169921875,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0112,
"reward": 0.07882664329372346,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10021908720955253,
"rewards/cosine_scaled_reward": 0.008724531158804893,
"rewards/format_reward": 0.43750000558793545,
"step": 170
},
{
"advantage_max": 0.18300288636237383,
"advantage_mean": -1.2417634975836478e-09,
"advantage_min": -0.14693648740649223,
"advantage_std": 0.12005474278703332,
"completion_length": 2807.2291870117188,
"epoch": 0.19542857142857142,
"grad_norm": 0.022502249106764793,
"kl": 0.006862640380859375,
"learning_rate": 8.487667956935087e-07,
"loss": 0.0131,
"reward": 0.07513996493071318,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12005474418401718,
"rewards/cosine_scaled_reward": 0.012896033469587564,
"rewards/format_reward": 0.41666667349636555,
"step": 171
},
{
"advantage_max": 0.16364847961813211,
"advantage_mean": -2.173086016687975e-09,
"advantage_min": -0.12421727832406759,
"advantage_std": 0.11818943079560995,
"completion_length": 2927.9791870117188,
"epoch": 0.19657142857142856,
"grad_norm": 0.021065451204776764,
"kl": 0.010840415954589844,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0073,
"reward": 0.06200486654415727,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11818943126127124,
"rewards/cosine_scaled_reward": 0.02644550008699298,
"rewards/format_reward": 0.31250000186264515,
"step": 172
},
{
"advantage_max": 0.19667653134092689,
"advantage_mean": -4.229756791346295e-09,
"advantage_min": -0.16769259562715888,
"advantage_std": 0.1501930463127792,
"completion_length": 1986.583381652832,
"epoch": 0.1977142857142857,
"grad_norm": 0.023256558924913406,
"kl": 0.007579803466796875,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0107,
"reward": 0.09084123687352985,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.15019305399619043,
"rewards/cosine_scaled_reward": -0.06845231936313212,
"rewards/format_reward": 0.6666666772216558,
"step": 173
},
{
"advantage_max": 0.19292704621329904,
"advantage_mean": -4.889443558364626e-09,
"advantage_min": -0.15875433292239904,
"advantage_std": 0.13439691066741943,
"completion_length": 2762.854232788086,
"epoch": 0.19885714285714284,
"grad_norm": 0.021673617884516716,
"kl": 0.011868476867675781,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0079,
"reward": 0.06353488937020302,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1343969153240323,
"rewards/cosine_scaled_reward": -0.05013580992817879,
"rewards/format_reward": 0.4791666716337204,
"step": 174
},
{
"advantage_max": 0.12862909119576216,
"advantage_mean": -9.778887297140493e-09,
"advantage_min": -0.15183910354971886,
"advantage_std": 0.121172487270087,
"completion_length": 2763.083366394043,
"epoch": 0.2,
"grad_norm": 0.018218420445919037,
"kl": 0.00750732421875,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0052,
"reward": 0.0709586595185101,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.12117248913273215,
"rewards/cosine_scaled_reward": -0.020462360233068466,
"rewards/format_reward": 0.4583333358168602,
"step": 175
},
{
"advantage_max": 0.19677884504199028,
"advantage_mean": -2.1730863497548825e-09,
"advantage_min": -0.2427662005648017,
"advantage_std": 0.17106487229466438,
"completion_length": 2771.3542098999023,
"epoch": 0.20114285714285715,
"grad_norm": 0.0242843609303236,
"kl": 0.007083892822265625,
"learning_rate": 8.368407953869103e-07,
"loss": 0.0075,
"reward": 0.13187712128274143,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17106487601995468,
"rewards/cosine_scaled_reward": 0.10790817299857736,
"rewards/format_reward": 0.562500013038516,
"step": 176
},
{
"advantage_max": 0.12549755768850446,
"advantage_mean": -2.0816681711721685e-17,
"advantage_min": -0.1870342567563057,
"advantage_std": 0.11075702356174588,
"completion_length": 3025.3334045410156,
"epoch": 0.2022857142857143,
"grad_norm": 0.02049132063984871,
"kl": 0.011358261108398438,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0082,
"reward": 0.036965833278372884,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11075702914968133,
"rewards/cosine_scaled_reward": -0.1412050067447126,
"rewards/format_reward": 0.5000000223517418,
"step": 177
},
{
"advantage_max": 0.19278418272733688,
"advantage_mean": -4.190951682914168e-09,
"advantage_min": -0.18454215489327908,
"advantage_std": 0.15981952054426074,
"completion_length": 2930.416748046875,
"epoch": 0.20342857142857143,
"grad_norm": 0.02552485093474388,
"kl": 0.013797760009765625,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0099,
"reward": 0.07667386531829834,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15981952100992203,
"rewards/cosine_scaled_reward": -0.03615371882915497,
"rewards/format_reward": 0.5208333525806665,
"step": 178
},
{
"advantage_max": 0.15210868138819933,
"advantage_mean": -1.6298145541493447e-09,
"advantage_min": -0.11770202778279781,
"advantage_std": 0.1120380088686943,
"completion_length": 2897.062530517578,
"epoch": 0.20457142857142857,
"grad_norm": 0.021920261904597282,
"kl": 0.008077621459960938,
"learning_rate": 8.295165011252396e-07,
"loss": 0.0015,
"reward": 0.01579895243048668,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11203800793737173,
"rewards/cosine_scaled_reward": -0.16117261722683907,
"rewards/format_reward": 0.41666667349636555,
"step": 179
},
{
"advantage_max": 0.15735867712646723,
"advantage_mean": -6.713283928760916e-09,
"advantage_min": -0.1801003571599722,
"advantage_std": 0.12799114314839244,
"completion_length": 2355.6458892822266,
"epoch": 0.2057142857142857,
"grad_norm": 0.019921960309147835,
"kl": 0.01093292236328125,
"learning_rate": 8.270476638965461e-07,
"loss": -0.0007,
"reward": 0.13479173695668578,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.12799114966765046,
"rewards/cosine_scaled_reward": 0.10458490764722228,
"rewards/format_reward": 0.5833333432674408,
"step": 180
},
{
"advantage_max": 0.0872650584205985,
"advantage_mean": 1.6298145749660264e-09,
"advantage_min": -0.09466889966279268,
"advantage_std": 0.0786438356153667,
"completion_length": 3100.9583587646484,
"epoch": 0.20685714285714285,
"grad_norm": 0.017120540142059326,
"kl": 0.0089263916015625,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0029,
"reward": 0.02593186777085066,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07864383282139897,
"rewards/cosine_scaled_reward": -0.06834173109382391,
"rewards/format_reward": 0.29166667349636555,
"step": 181
},
{
"advantage_max": 0.13478928711265326,
"advantage_mean": -7.605801224941366e-09,
"advantage_min": -0.21524380147457123,
"advantage_std": 0.14336608722805977,
"completion_length": 2586.5208892822266,
"epoch": 0.208,
"grad_norm": 0.027521653100848198,
"kl": 0.00634765625,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0057,
"reward": 0.14823864586651325,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14336608815938234,
"rewards/cosine_scaled_reward": 0.13677118346095085,
"rewards/format_reward": 0.604166679084301,
"step": 182
},
{
"advantage_max": 0.18059027008712292,
"advantage_mean": -2.6387474982847436e-09,
"advantage_min": -0.17588150314986706,
"advantage_std": 0.14301183447241783,
"completion_length": 2779.6250915527344,
"epoch": 0.20914285714285713,
"grad_norm": 0.023653734475374222,
"kl": 0.013004302978515625,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0143,
"reward": 0.04137046728283167,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1430118391290307,
"rewards/cosine_scaled_reward": -0.10869285650551319,
"rewards/format_reward": 0.45833334140479565,
"step": 183
},
{
"advantage_max": 0.127640918828547,
"advantage_mean": 2.7939678071131624e-09,
"advantage_min": -0.1175808496773243,
"advantage_std": 0.10124421585351229,
"completion_length": 2929.5833740234375,
"epoch": 0.2102857142857143,
"grad_norm": 0.019127527251839638,
"kl": 0.009464263916015625,
"learning_rate": 8.170384989716657e-07,
"loss": 0.0029,
"reward": -0.02080875914543867,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10124421957880259,
"rewards/cosine_scaled_reward": -0.19583849888294935,
"rewards/format_reward": 0.2708333395421505,
"step": 184
},
{
"advantage_max": 0.15481803310103714,
"advantage_mean": -8.537123716290118e-10,
"advantage_min": -0.10665717558003962,
"advantage_std": 0.1069570422405377,
"completion_length": 2703.2500534057617,
"epoch": 0.21142857142857144,
"grad_norm": 0.02145558036863804,
"kl": 0.010293006896972656,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0095,
"reward": -0.010108587564900517,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10695704608224332,
"rewards/cosine_scaled_reward": -0.24899004492908716,
"rewards/format_reward": 0.4375000037252903,
"step": 185
},
{
"advantage_max": 0.12561852345243096,
"advantage_mean": 2.5611370246814147e-09,
"advantage_min": -0.16176388878375292,
"advantage_std": 0.11582700302824378,
"completion_length": 3060.604202270508,
"epoch": 0.21257142857142858,
"grad_norm": 0.020156843587756157,
"kl": 0.008787155151367188,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0001,
"reward": 0.05811757780611515,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1158270058222115,
"rewards/cosine_scaled_reward": -0.01455057691782713,
"rewards/format_reward": 0.3750000111758709,
"step": 186
},
{
"advantage_max": 0.12376056425273418,
"advantage_mean": -1.0089327498463696e-09,
"advantage_min": -0.11852440610527992,
"advantage_std": 0.09066633740440011,
"completion_length": 2529.5625915527344,
"epoch": 0.21371428571428572,
"grad_norm": 0.020506154745817184,
"kl": 0.012113571166992188,
"learning_rate": 8.093945422764069e-07,
"loss": 0.0106,
"reward": 0.05328264785930514,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09066633740440011,
"rewards/cosine_scaled_reward": -0.10320662707090378,
"rewards/format_reward": 0.5208333469927311,
"step": 187
},
{
"advantage_max": 0.16143850050866604,
"advantage_mean": 1.0089328331130965e-09,
"advantage_min": -0.10339000448584557,
"advantage_std": 0.103767134482041,
"completion_length": 3437.2916870117188,
"epoch": 0.21485714285714286,
"grad_norm": 0.019638704136013985,
"kl": 0.009023666381835938,
"learning_rate": 8.068211054579943e-07,
"loss": 0.0035,
"reward": -0.034261735156178474,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10376713238656521,
"rewards/cosine_scaled_reward": -0.1733952183276415,
"rewards/format_reward": 0.1458333358168602,
"step": 188
},
{
"advantage_max": 0.0868607796728611,
"advantage_mean": -8.537123882823572e-09,
"advantage_min": -0.10214976128190756,
"advantage_std": 0.07864707754924893,
"completion_length": 2609.270881652832,
"epoch": 0.216,
"grad_norm": 0.016191232949495316,
"kl": 0.007904052734375,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0096,
"reward": 0.09003830677829683,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07864707987755537,
"rewards/cosine_scaled_reward": 0.014717839658260345,
"rewards/format_reward": 0.5000000037252903,
"step": 189
},
{
"advantage_max": 0.14429873740300536,
"advantage_mean": -2.4835268772060992e-09,
"advantage_min": -0.12300875596702099,
"advantage_std": 0.10701592592522502,
"completion_length": 2838.5208740234375,
"epoch": 0.21714285714285714,
"grad_norm": 0.016013866290450096,
"kl": 0.01021575927734375,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0091,
"reward": 0.05636314395815134,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10701592825353146,
"rewards/cosine_scaled_reward": -0.021093813586048782,
"rewards/format_reward": 0.37500000931322575,
"step": 190
},
{
"advantage_max": 0.19808372668921947,
"advantage_mean": -5.665545754762746e-09,
"advantage_min": -0.18589732144027948,
"advantage_std": 0.1561442338861525,
"completion_length": 2586.6875076293945,
"epoch": 0.21828571428571428,
"grad_norm": 0.031619712710380554,
"kl": 0.010101318359375,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0154,
"reward": 0.089652857510373,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1561442413367331,
"rewards/cosine_scaled_reward": 0.012603958894032985,
"rewards/format_reward": 0.5000000055879354,
"step": 191
},
{
"advantage_max": 0.1139423786662519,
"advantage_mean": -1.5522042262627878e-09,
"advantage_min": -0.10976849030703306,
"advantage_std": 0.09138505253940821,
"completion_length": 3170.791717529297,
"epoch": 0.21942857142857142,
"grad_norm": 0.01777508296072483,
"kl": 0.008083343505859375,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0057,
"reward": 0.01685933256521821,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09138506185263395,
"rewards/cosine_scaled_reward": -0.09557726047933102,
"rewards/format_reward": 0.2916666716337204,
"step": 192
},
{
"advantage_max": 0.14380517834797502,
"advantage_mean": -3.1044090909038147e-10,
"advantage_min": -0.13345478381961584,
"advantage_std": 0.10799073707312346,
"completion_length": 3189.750045776367,
"epoch": 0.22057142857142858,
"grad_norm": 0.01901032216846943,
"kl": 0.010015487670898438,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0091,
"reward": 0.02204875904135406,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10799074079841375,
"rewards/cosine_scaled_reward": -0.11225971346721053,
"rewards/format_reward": 0.35416667722165585,
"step": 193
},
{
"advantage_max": 0.15160069148987532,
"advantage_mean": -4.190951627403017e-09,
"advantage_min": -0.1970332907512784,
"advantage_std": 0.1322903553955257,
"completion_length": 3056.916717529297,
"epoch": 0.22171428571428572,
"grad_norm": 0.021130474284291267,
"kl": 0.008333206176757812,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0085,
"reward": 0.15966177079826593,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13229035586118698,
"rewards/cosine_scaled_reward": 0.20346511714160442,
"rewards/format_reward": 0.541666679084301,
"step": 194
},
{
"advantage_max": 0.21137044485658407,
"advantage_mean": -2.017865666226193e-09,
"advantage_min": -0.18295289110392332,
"advantage_std": 0.16112522408366203,
"completion_length": 3007.3125610351562,
"epoch": 0.22285714285714286,
"grad_norm": 0.02642098069190979,
"kl": 0.008836746215820312,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0125,
"reward": 0.07984138361644,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16112523339688778,
"rewards/cosine_scaled_reward": -0.01430131122469902,
"rewards/format_reward": 0.5000000149011612,
"step": 195
},
{
"advantage_max": 0.11525858240202069,
"advantage_mean": 1.862645163108745e-09,
"advantage_min": -0.11348771117627621,
"advantage_std": 0.09375560656189919,
"completion_length": 3303.4166870117188,
"epoch": 0.224,
"grad_norm": 0.021477309986948967,
"kl": 0.0077724456787109375,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0007,
"reward": 0.050928775453940034,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.09375560656189919,
"rewards/cosine_scaled_reward": -0.02439815178513527,
"rewards/format_reward": 0.35416666977107525,
"step": 196
},
{
"advantage_max": 0.3016190994530916,
"advantage_mean": -7.605801224941366e-09,
"advantage_min": -0.20797276590019464,
"advantage_std": 0.2091372339054942,
"completion_length": 2295.3333587646484,
"epoch": 0.22514285714285714,
"grad_norm": 0.03227706253528595,
"kl": 0.008520126342773438,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0104,
"reward": 0.11764910374768078,
"reward_advantage_correlation": 1.0,
"reward_std": 0.2091372385621071,
"rewards/cosine_scaled_reward": 0.04368517640978098,
"rewards/format_reward": 0.604166679084301,
"step": 197
},
{
"advantage_max": 0.15463142562657595,
"advantage_mean": -4.035731401841325e-09,
"advantage_min": -0.1649370063096285,
"advantage_std": 0.12568959640339017,
"completion_length": 2766.3750610351562,
"epoch": 0.22628571428571428,
"grad_norm": 0.023209044709801674,
"kl": 0.011608123779296875,
"learning_rate": 7.804192891917571e-07,
"loss": 0.011,
"reward": 0.0660445298999548,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12568959454074502,
"rewards/cosine_scaled_reward": -0.045943474397063255,
"rewards/format_reward": 0.47916668094694614,
"step": 198
},
{
"advantage_max": 0.14701731782406569,
"advantage_mean": -2.1730860721991263e-09,
"advantage_min": -0.14183137379586697,
"advantage_std": 0.11165185179561377,
"completion_length": 3187.5208435058594,
"epoch": 0.22742857142857142,
"grad_norm": 0.022100677713751793,
"kl": 0.012798309326171875,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0058,
"reward": -0.011955318361287937,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1116518541239202,
"rewards/cosine_scaled_reward": -0.1814872338436544,
"rewards/format_reward": 0.2916666753590107,
"step": 199
},
{
"advantage_max": 0.22152548655867577,
"advantage_mean": -3.5700699618779197e-09,
"advantage_min": -0.17661806754767895,
"advantage_std": 0.16029490064829588,
"completion_length": 2475.854217529297,
"epoch": 0.22857142857142856,
"grad_norm": 0.0255091842263937,
"kl": 0.0064983367919921875,
"learning_rate": 7.75e-07,
"loss": 0.0114,
"reward": 0.12036499596433714,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1602949034422636,
"rewards/cosine_scaled_reward": 0.08125638961791992,
"rewards/format_reward": 0.5416666772216558,
"step": 200
},
{
"advantage_max": 0.23342143837362528,
"advantage_mean": -6.208817404651512e-09,
"advantage_min": -0.21431293059140444,
"advantage_std": 0.18895365437492728,
"completion_length": 2669.520851135254,
"epoch": 0.2297142857142857,
"grad_norm": 0.030897650867700577,
"kl": 0.0077991485595703125,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0155,
"reward": 0.1639406383037567,
"reward_advantage_correlation": 1.0,
"reward_std": 0.18895365856587887,
"rewards/cosine_scaled_reward": 0.18311863904818892,
"rewards/format_reward": 0.6041666772216558,
"step": 201
},
{
"advantage_max": 0.08613977860659361,
"advantage_mean": -4.346172047253738e-09,
"advantage_min": -0.10160630848258734,
"advantage_std": 0.0818297709338367,
"completion_length": 2086.8333587646484,
"epoch": 0.23085714285714284,
"grad_norm": 0.011819848790764809,
"kl": 0.005578041076660156,
"learning_rate": 7.695368466124296e-07,
"loss": -0.0019,
"reward": 0.18190485704690218,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0818297709338367,
"rewards/cosine_scaled_reward": 0.22224061330780387,
"rewards/format_reward": 0.625,
"step": 202
},
{
"advantage_max": 0.19074973743408918,
"advantage_mean": 6.208817349140361e-10,
"advantage_min": -0.09372275089845061,
"advantage_std": 0.11288550030440092,
"completion_length": 3141.375030517578,
"epoch": 0.232,
"grad_norm": 0.021399665623903275,
"kl": 0.0112152099609375,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0082,
"reward": 0.017376512056216598,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11288550635799766,
"rewards/cosine_scaled_reward": -0.0945606417953968,
"rewards/format_reward": 0.2916666716337204,
"step": 203
},
{
"advantage_max": 0.16842536255717278,
"advantage_mean": -2.949188337986186e-09,
"advantage_min": -0.19366966281086206,
"advantage_std": 0.14565699081867933,
"completion_length": 2458.7292098999023,
"epoch": 0.23314285714285715,
"grad_norm": 0.033364247530698776,
"kl": 0.01160430908203125,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0215,
"reward": 0.0991232428496005,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14565699454396963,
"rewards/cosine_scaled_reward": -0.021544933319091797,
"rewards/format_reward": 0.6250000204890966,
"step": 204
},
{
"advantage_max": 0.22404637094587088,
"advantage_mean": -4.035731443474688e-09,
"advantage_min": -0.23686167504638433,
"advantage_std": 0.18611570354551077,
"completion_length": 2529.187530517578,
"epoch": 0.2342857142857143,
"grad_norm": 0.04161351919174194,
"kl": 0.0066680908203125,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0148,
"reward": 0.14069395791739225,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1861157095991075,
"rewards/cosine_scaled_reward": 0.10330121964216232,
"rewards/format_reward": 0.625000013038516,
"step": 205
},
{
"advantage_max": 0.191861386410892,
"advantage_mean": 1.8626451908643205e-09,
"advantage_min": -0.09568049665540457,
"advantage_std": 0.11434536194428802,
"completion_length": 3076.7917404174805,
"epoch": 0.23542857142857143,
"grad_norm": 0.017068374902009964,
"kl": 0.007312774658203125,
"learning_rate": 7.584832158039378e-07,
"loss": 0.005,
"reward": -0.04568293271586299,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11434536380693316,
"rewards/cosine_scaled_reward": -0.2805359214544296,
"rewards/format_reward": 0.2916666716337204,
"step": 206
},
{
"advantage_max": 0.17041416559368372,
"advantage_mean": -2.9491882408416714e-09,
"advantage_min": -0.15167315676808357,
"advantage_std": 0.1317178774625063,
"completion_length": 2986.6875610351562,
"epoch": 0.23657142857142857,
"grad_norm": 0.019738713279366493,
"kl": 0.013507843017578125,
"learning_rate": 7.556940671764124e-07,
"loss": 0.0111,
"reward": 0.02802197606069967,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.13171787792816758,
"rewards/cosine_scaled_reward": -0.1678624264895916,
"rewards/format_reward": 0.5000000093132257,
"step": 207
},
{
"advantage_max": 0.12179180048406124,
"advantage_mean": 1.6298144223103606e-09,
"advantage_min": -0.12574960058555007,
"advantage_std": 0.09972883993759751,
"completion_length": 2369.2708892822266,
"epoch": 0.2377142857142857,
"grad_norm": 0.019418170675635338,
"kl": 0.006829261779785156,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0023,
"reward": 0.10339555609971285,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09972884552553296,
"rewards/cosine_scaled_reward": 0.014620641246438026,
"rewards/format_reward": 0.583333333954215,
"step": 208
},
{
"advantage_max": 0.14903791062533855,
"advantage_mean": -3.880511467713177e-10,
"advantage_min": -0.09558573551476002,
"advantage_std": 0.09572446253150702,
"completion_length": 2642.020835876465,
"epoch": 0.23885714285714285,
"grad_norm": 0.015551680698990822,
"kl": 0.009195327758789062,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0058,
"reward": 0.0819700972060673,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09572446579113603,
"rewards/cosine_scaled_reward": -0.007855089381337166,
"rewards/format_reward": 0.5000000018626451,
"step": 209
},
{
"advantage_max": 0.20507059153169394,
"advantage_mean": -1.2417633865613453e-09,
"advantage_min": -0.14836034085601568,
"advantage_std": 0.12791450042277575,
"completion_length": 2679.5208892822266,
"epoch": 0.24,
"grad_norm": 0.013562957756221294,
"kl": 0.0076656341552734375,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0103,
"reward": 0.06892588455229998,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12791450135409832,
"rewards/cosine_scaled_reward": -0.0150822380091995,
"rewards/format_reward": 0.4375000074505806,
"step": 210
},
{
"advantage_max": 0.08701092284172773,
"advantage_mean": -1.6298144847604057e-09,
"advantage_min": -0.06568480283021927,
"advantage_std": 0.05974301462993026,
"completion_length": 2295.4583587646484,
"epoch": 0.24114285714285713,
"grad_norm": 0.010742315091192722,
"kl": 0.008317947387695312,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0027,
"reward": 0.0884767509996891,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.05974301369860768,
"rewards/cosine_scaled_reward": -0.061297111213207245,
"rewards/format_reward": 0.6458333395421505,
"step": 211
},
{
"advantage_max": 0.1430717734619975,
"advantage_mean": -8.84756487518068e-09,
"advantage_min": -0.15477489028126,
"advantage_std": 0.11844521341845393,
"completion_length": 2305.875015258789,
"epoch": 0.2422857142857143,
"grad_norm": 0.022116854786872864,
"kl": 0.008672714233398438,
"learning_rate": 7.416006812042827e-07,
"loss": 0.0115,
"reward": 0.11736472509801388,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11844521760940552,
"rewards/cosine_scaled_reward": 0.03006414882838726,
"rewards/format_reward": 0.6250000093132257,
"step": 212
},
{
"advantage_max": 0.21030229609459639,
"advantage_mean": -1.0865430638551388e-09,
"advantage_min": -0.1627835095860064,
"advantage_std": 0.14803386572748423,
"completion_length": 2458.354202270508,
"epoch": 0.24342857142857144,
"grad_norm": 0.02114744670689106,
"kl": 0.013973236083984375,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0078,
"reward": 0.09445515216793865,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14803387317806482,
"rewards/cosine_scaled_reward": -0.003314238041639328,
"rewards/format_reward": 0.5625000055879354,
"step": 213
},
{
"advantage_max": 0.16307567991316319,
"advantage_mean": 3.8805106350459084e-10,
"advantage_min": -0.1880338666960597,
"advantage_std": 0.13389361603185534,
"completion_length": 2575.6875762939453,
"epoch": 0.24457142857142858,
"grad_norm": 0.02633727341890335,
"kl": 0.008653640747070312,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0085,
"reward": 0.1364652120973915,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1338936211541295,
"rewards/cosine_scaled_reward": 0.07814164273440838,
"rewards/format_reward": 0.6458333488553762,
"step": 214
},
{
"advantage_max": 0.11850057449191809,
"advantage_mean": 2.6387473733846534e-09,
"advantage_min": -0.1467567002400756,
"advantage_std": 0.10804584342986345,
"completion_length": 2357.2708587646484,
"epoch": 0.24571428571428572,
"grad_norm": 0.016689570620656013,
"kl": 0.0051898956298828125,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0116,
"reward": 0.09326311138829624,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10804584808647633,
"rewards/cosine_scaled_reward": -0.05246574338525534,
"rewards/format_reward": 0.6458333469927311,
"step": 215
},
{
"advantage_max": 0.17286218609660864,
"advantage_mean": -2.0178656801039807e-09,
"advantage_min": -0.2057242812588811,
"advantage_std": 0.15920248720794916,
"completion_length": 2195.8958587646484,
"epoch": 0.24685714285714286,
"grad_norm": 0.028986535966396332,
"kl": 0.009554386138916016,
"learning_rate": 7.301570646506027e-07,
"loss": 0.015,
"reward": 0.08979062891739886,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1592024890705943,
"rewards/cosine_scaled_reward": -0.01833750121295452,
"rewards/format_reward": 0.5625000111758709,
"step": 216
},
{
"advantage_max": 0.2062919419258833,
"advantage_mean": 3.8805107738237865e-10,
"advantage_min": -0.16623112093657255,
"advantage_std": 0.15084476629272103,
"completion_length": 2666.6875915527344,
"epoch": 0.248,
"grad_norm": 0.022731564939022064,
"kl": 0.008672714233398438,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0027,
"reward": 0.06434820429421961,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.15084476908668876,
"rewards/cosine_scaled_reward": -0.05128720495849848,
"rewards/format_reward": 0.47916666977107525,
"step": 217
},
{
"advantage_max": 0.2643870050087571,
"advantage_mean": -1.668619659112025e-09,
"advantage_min": -0.1602165149524808,
"advantage_std": 0.16815320495516062,
"completion_length": 2753.666748046875,
"epoch": 0.24914285714285714,
"grad_norm": 0.023222601041197777,
"kl": 0.01018524169921875,
"learning_rate": 7.243820139034464e-07,
"loss": 0.0127,
"reward": 0.05022087972611189,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16815320681780577,
"rewards/cosine_scaled_reward": -0.07070112135261297,
"rewards/format_reward": 0.4375000074505806,
"step": 218
},
{
"advantage_max": 0.18152069114148617,
"advantage_mean": -1.8626452463754717e-09,
"advantage_min": -0.1738226441666484,
"advantage_std": 0.13549907505512238,
"completion_length": 2355.1042251586914,
"epoch": 0.2502857142857143,
"grad_norm": 0.022829996421933174,
"kl": 0.011911392211914062,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0078,
"reward": 0.11652564397081733,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13549907878041267,
"rewards/cosine_scaled_reward": 0.040676600649021566,
"rewards/format_reward": 0.6041666734963655,
"step": 219
},
{
"advantage_max": 0.0978488284163177,
"advantage_mean": -8.537123716290118e-10,
"advantage_min": -0.1063184947706759,
"advantage_std": 0.07688094722107053,
"completion_length": 2560.625045776367,
"epoch": 0.25142857142857145,
"grad_norm": 0.010847953148186207,
"kl": 0.0134124755859375,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0068,
"reward": 0.002609904622659087,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07688094908371568,
"rewards/cosine_scaled_reward": -0.22321993205696344,
"rewards/format_reward": 0.4583333469927311,
"step": 220
},
{
"advantage_max": 0.12692446261644363,
"advantage_mean": -5.587935586470749e-09,
"advantage_min": -0.1331609645858407,
"advantage_std": 0.09573783411178738,
"completion_length": 1943.9792098999023,
"epoch": 0.25257142857142856,
"grad_norm": 0.01251673512160778,
"kl": 0.005926609039306641,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0098,
"reward": 0.11415702244266868,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09573783841915429,
"rewards/cosine_scaled_reward": 0.003712208941578865,
"rewards/format_reward": 0.6666666734963655,
"step": 221
},
{
"advantage_max": 0.11880875332280993,
"advantage_mean": -7.372970567409709e-10,
"advantage_min": -0.09530603419989347,
"advantage_std": 0.08716670400463045,
"completion_length": 2160.687515258789,
"epoch": 0.2537142857142857,
"grad_norm": 0.015030966140329838,
"kl": 0.005939483642578125,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0025,
"reward": 0.16150327073410153,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08716670540161431,
"rewards/cosine_scaled_reward": 0.15005341079086065,
"rewards/format_reward": 0.645833333954215,
"step": 222
},
{
"advantage_max": 0.12176577933132648,
"advantage_mean": -2.328306450416484e-09,
"advantage_min": -0.1717464942485094,
"advantage_std": 0.11729715252295136,
"completion_length": 2503.500045776367,
"epoch": 0.25485714285714284,
"grad_norm": 0.019054196774959564,
"kl": 0.006908416748046875,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0064,
"reward": 0.11482558376155794,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11729715345427394,
"rewards/cosine_scaled_reward": 0.03707345947623253,
"rewards/format_reward": 0.6041666753590107,
"step": 223
},
{
"advantage_max": 0.15128592168912292,
"advantage_mean": 9.313226578822054e-10,
"advantage_min": -0.12854185421019793,
"advantage_std": 0.11887390678748488,
"completion_length": 2807.4166870117188,
"epoch": 0.256,
"grad_norm": 0.019626790657639503,
"kl": 0.011783599853515625,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0043,
"reward": 0.02313301805406809,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.11887391144409776,
"rewards/cosine_scaled_reward": -0.1292988988570869,
"rewards/format_reward": 0.3958333395421505,
"step": 224
},
{
"advantage_max": 0.14465034054592252,
"advantage_mean": 2.3283067140944524e-10,
"advantage_min": -0.17112653935328126,
"advantage_std": 0.13964845798909664,
"completion_length": 2912.1042404174805,
"epoch": 0.2571428571428571,
"grad_norm": 0.031687695533037186,
"kl": 0.010837554931640625,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0072,
"reward": 0.06882307189516723,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1396484593860805,
"rewards/cosine_scaled_reward": -0.025214084424078465,
"rewards/format_reward": 0.45833334140479565,
"step": 225
},
{
"advantage_max": 0.15410880697891116,
"advantage_mean": -6.208817203423589e-09,
"advantage_min": -0.1973248701542616,
"advantage_std": 0.13895932119339705,
"completion_length": 2613.041717529297,
"epoch": 0.2582857142857143,
"grad_norm": 0.02550850808620453,
"kl": 0.012088775634765625,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0088,
"reward": 0.11753622768446803,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13895932817831635,
"rewards/cosine_scaled_reward": 0.09499787259846926,
"rewards/format_reward": 0.5000000055879354,
"step": 226
},
{
"advantage_max": 0.20522056613117456,
"advantage_mean": -5.432714417219486e-10,
"advantage_min": -0.13819229789078236,
"advantage_std": 0.13077273219823837,
"completion_length": 2225.187545776367,
"epoch": 0.25942857142857145,
"grad_norm": 0.02267509512603283,
"kl": 0.014047622680664062,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0049,
"reward": 0.06590676098130643,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13077273732051253,
"rewards/cosine_scaled_reward": -0.14173524361103773,
"rewards/format_reward": 0.6666666772216558,
"step": 227
},
{
"advantage_max": 0.10316049680113792,
"advantage_mean": -9.002785308909189e-09,
"advantage_min": -0.1567352144047618,
"advantage_std": 0.1106303846463561,
"completion_length": 2026.0833473205566,
"epoch": 0.26057142857142856,
"grad_norm": 0.019735384732484818,
"kl": 0.0073490142822265625,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0142,
"reward": 0.12237067025853321,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11063039023429155,
"rewards/cosine_scaled_reward": 0.06896073557436466,
"rewards/format_reward": 0.5833333376795053,
"step": 228
},
{
"advantage_max": 0.14611426461488008,
"advantage_mean": -1.862645218619896e-09,
"advantage_min": -0.11763771809637547,
"advantage_std": 0.11251156777143478,
"completion_length": 3169.4791717529297,
"epoch": 0.26171428571428573,
"grad_norm": 0.023638132959604263,
"kl": 0.013431549072265625,
"learning_rate": 6.920420666261961e-07,
"loss": 0.003,
"reward": 0.07627941074315459,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11251156777143478,
"rewards/cosine_scaled_reward": 0.06778724677860737,
"rewards/format_reward": 0.31250000558793545,
"step": 229
},
{
"advantage_max": 0.20736990496516228,
"advantage_mean": 1.396983924373263e-09,
"advantage_min": -0.1171214496716857,
"advantage_std": 0.12726245261728764,
"completion_length": 3142.541717529297,
"epoch": 0.26285714285714284,
"grad_norm": 0.02262984775006771,
"kl": 0.01515960693359375,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0136,
"reward": -0.014790376415476203,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12726245447993279,
"rewards/cosine_scaled_reward": -0.2001241371035576,
"rewards/format_reward": 0.31250000931322575,
"step": 230
},
{
"advantage_max": 0.1515338383615017,
"advantage_mean": 2.0178655690816782e-09,
"advantage_min": -0.16625231131911278,
"advantage_std": 0.12852926715277135,
"completion_length": 2640.020866394043,
"epoch": 0.264,
"grad_norm": 0.02330903708934784,
"kl": 0.011930465698242188,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0066,
"reward": 0.10281063965521753,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1285292694810778,
"rewards/cosine_scaled_reward": 0.04173674341291189,
"rewards/format_reward": 0.5208333469927311,
"step": 231
},
{
"advantage_max": 0.20202800119295716,
"advantage_mean": -1.3969838896787934e-09,
"advantage_min": -0.13641920872032642,
"advantage_std": 0.12839107867330313,
"completion_length": 3155.4583892822266,
"epoch": 0.2651428571428571,
"grad_norm": 0.02304697036743164,
"kl": 0.011341094970703125,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0023,
"reward": 0.0004285484756110236,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12839107774198055,
"rewards/cosine_scaled_reward": -0.1556595927104354,
"rewards/format_reward": 0.31250000186264515,
"step": 232
},
{
"advantage_max": 0.207712696865201,
"advantage_mean": -1.2417634698280722e-09,
"advantage_min": -0.1607958972454071,
"advantage_std": 0.13828255608677864,
"completion_length": 2502.68754196167,
"epoch": 0.2662857142857143,
"grad_norm": 0.031160853803157806,
"kl": 0.012420654296875,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0161,
"reward": 0.04692278621951118,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1382825607433915,
"rewards/cosine_scaled_reward": -0.1229497455060482,
"rewards/format_reward": 0.520833345130086,
"step": 233
},
{
"advantage_max": 0.13861850136891007,
"advantage_mean": -2.638747317873502e-09,
"advantage_min": -0.13130635116249323,
"advantage_std": 0.1169007895514369,
"completion_length": 2610.875045776367,
"epoch": 0.2674285714285714,
"grad_norm": 0.01976979523897171,
"kl": 0.009636878967285156,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0069,
"reward": 0.07872189255431294,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11690079234540462,
"rewards/cosine_scaled_reward": -0.007438613101840019,
"rewards/format_reward": 0.4791666716337204,
"step": 234
},
{
"advantage_max": 0.13972541643306613,
"advantage_mean": -5.665545844968367e-09,
"advantage_min": -0.1696104733273387,
"advantage_std": 0.12484075641259551,
"completion_length": 2025.4583892822266,
"epoch": 0.26857142857142857,
"grad_norm": 0.027829406782984734,
"kl": 0.012134552001953125,
"learning_rate": 6.740368101176495e-07,
"loss": 0.012,
"reward": 0.18675878643989563,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12484075920656323,
"rewards/cosine_scaled_reward": 0.2382234064862132,
"rewards/format_reward": 0.6250000074505806,
"step": 235
},
{
"advantage_max": 0.17086784075945616,
"advantage_mean": -6.926711669519303e-09,
"advantage_min": -0.16489312052726746,
"advantage_std": 0.12884586374275386,
"completion_length": 2521.8334045410156,
"epoch": 0.26971428571428574,
"grad_norm": 0.01972981169819832,
"kl": 0.008334159851074219,
"learning_rate": 6.710139192768694e-07,
"loss": 0.0013,
"reward": 0.12938043719623238,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12884586746804416,
"rewards/cosine_scaled_reward": 0.0801224485039711,
"rewards/format_reward": 0.6041666753590107,
"step": 236
},
{
"advantage_max": 0.12039782106876373,
"advantage_mean": -1.7074246044801455e-09,
"advantage_min": -0.12975230207666755,
"advantage_std": 0.09648847626522183,
"completion_length": 2152.375045776367,
"epoch": 0.27085714285714285,
"grad_norm": 0.015118488110601902,
"kl": 0.009752273559570312,
"learning_rate": 6.679851303883891e-07,
"loss": 0.0075,
"reward": 0.1261741843773052,
"reward_advantage_correlation": 1.0,
"reward_std": 0.096488481387496,
"rewards/cosine_scaled_reward": 0.06003169761970639,
"rewards/format_reward": 0.6250000093132257,
"step": 237
},
{
"advantage_max": 0.20382637344300747,
"advantage_mean": -2.483526898022781e-09,
"advantage_min": -0.21393706556409597,
"advantage_std": 0.17641730420291424,
"completion_length": 2390.8958740234375,
"epoch": 0.272,
"grad_norm": 0.02798081561923027,
"kl": 0.01264190673828125,
"learning_rate": 6.649505910711058e-07,
"loss": 0.011,
"reward": 0.17759971413761377,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1764173088595271,
"rewards/cosine_scaled_reward": 0.17150592245161533,
"rewards/format_reward": 0.7083333432674408,
"step": 238
},
{
"advantage_max": 0.11714844591915607,
"advantage_mean": -2.0954758137015084e-09,
"advantage_min": -0.14218166377395391,
"advantage_std": 0.10565405956003815,
"completion_length": 1920.2708435058594,
"epoch": 0.27314285714285713,
"grad_norm": 0.012746420688927174,
"kl": 0.00591278076171875,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0072,
"reward": 0.1769508863799274,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10565406491514295,
"rewards/cosine_scaled_reward": 0.17638282477855682,
"rewards/format_reward": 0.6875000074505806,
"step": 239
},
{
"advantage_max": 0.11744444910436869,
"advantage_mean": -4.113341431355444e-09,
"advantage_min": -0.08588352426886559,
"advantage_std": 0.07977030100300908,
"completion_length": 2931.6458740234375,
"epoch": 0.2742857142857143,
"grad_norm": 0.03343014419078827,
"kl": 0.0148162841796875,
"learning_rate": 6.588648530198504e-07,
"loss": -0.0001,
"reward": -0.016711448086425662,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07977030146867037,
"rewards/cosine_scaled_reward": -0.2383982054889202,
"rewards/format_reward": 0.3750000037252903,
"step": 240
},
{
"advantage_max": 0.15446932334452868,
"advantage_mean": -3.1044083970144243e-10,
"advantage_min": -0.1100171497091651,
"advantage_std": 0.11261474713683128,
"completion_length": 2998.2083892822266,
"epoch": 0.2754285714285714,
"grad_norm": 0.02460714988410473,
"kl": 0.014617919921875,
"learning_rate": 6.558139508961654e-07,
"loss": -0.0017,
"reward": -0.004733615671284497,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11261475319042802,
"rewards/cosine_scaled_reward": -0.20185158215463161,
"rewards/format_reward": 0.3750000037252903,
"step": 241
},
{
"advantage_max": 0.17675767396576703,
"advantage_mean": -3.6476801718032803e-09,
"advantage_min": -0.12605122895911336,
"advantage_std": 0.10846545218373649,
"completion_length": 1993.0625305175781,
"epoch": 0.2765714285714286,
"grad_norm": 0.03174210339784622,
"kl": 0.01557159423828125,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0084,
"reward": 0.07294759101932868,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10846545672393404,
"rewards/cosine_scaled_reward": -0.12876278175099287,
"rewards/format_reward": 0.6875000055879354,
"step": 242
},
{
"advantage_max": 0.16028737928718328,
"advantage_mean": 1.94025534527853e-09,
"advantage_min": -0.13600675389170647,
"advantage_std": 0.11945267952978611,
"completion_length": 2631.3750762939453,
"epoch": 0.2777142857142857,
"grad_norm": 0.019514845684170723,
"kl": 0.00899505615234375,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0074,
"reward": 0.08974379347637296,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11945268930867314,
"rewards/cosine_scaled_reward": 0.0119549349474255,
"rewards/format_reward": 0.5000000111758709,
"step": 243
},
{
"advantage_max": 0.2118885600939393,
"advantage_mean": -7.140140090289293e-09,
"advantage_min": -0.15698527079075575,
"advantage_std": 0.15755242481827736,
"completion_length": 2349.479217529297,
"epoch": 0.27885714285714286,
"grad_norm": 0.04179028794169426,
"kl": 0.010972976684570312,
"learning_rate": 6.466308972251785e-07,
"loss": 0.009,
"reward": 0.11599040031433105,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1575524415820837,
"rewards/cosine_scaled_reward": 0.04908072855323553,
"rewards/format_reward": 0.5833333395421505,
"step": 244
},
{
"advantage_max": 0.25635348074138165,
"advantage_mean": -6.208816794028849e-10,
"advantage_min": -0.17878746148198843,
"advantage_std": 0.17052227910608053,
"completion_length": 2936.187545776367,
"epoch": 0.28,
"grad_norm": 0.032245736569166183,
"kl": 0.010873794555664062,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0165,
"reward": 0.04710392498964211,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17052228096872568,
"rewards/cosine_scaled_reward": -0.049887945875525475,
"rewards/format_reward": 0.37500000931322575,
"step": 245
},
{
"advantage_max": 0.19493758492171764,
"advantage_mean": -6.674478622570312e-09,
"advantage_min": -0.17645201738923788,
"advantage_std": 0.14856922021135688,
"completion_length": 2657.6250610351562,
"epoch": 0.28114285714285714,
"grad_norm": 0.034092389047145844,
"kl": 0.01275634765625,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0126,
"reward": 0.12820027698762715,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1485692197456956,
"rewards/cosine_scaled_reward": 0.08732841722667217,
"rewards/format_reward": 0.5833333414047956,
"step": 246
},
{
"advantage_max": 0.18537013605237007,
"advantage_mean": 3.5700699618779197e-09,
"advantage_min": -0.12155716121196747,
"advantage_std": 0.12443299405276775,
"completion_length": 2937.5833587646484,
"epoch": 0.2822857142857143,
"grad_norm": 0.023569073528051376,
"kl": 0.0147857666015625,
"learning_rate": 6.374054580489873e-07,
"loss": 0.0026,
"reward": 0.011105528741609305,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.12443300196900964,
"rewards/cosine_scaled_reward": -0.13518046215176582,
"rewards/format_reward": 0.33333333395421505,
"step": 247
},
{
"advantage_max": 0.199058273807168,
"advantage_mean": -6.984919739827511e-09,
"advantage_min": -0.16079838667064905,
"advantage_std": 0.13820406701415777,
"completion_length": 2236.7916946411133,
"epoch": 0.2834285714285714,
"grad_norm": 0.02281934767961502,
"kl": 0.012094497680664062,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0039,
"reward": 0.13462523429188877,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13820407399907708,
"rewards/cosine_scaled_reward": 0.10365579696372151,
"rewards/format_reward": 0.5833333376795053,
"step": 248
},
{
"advantage_max": 0.10034069698303938,
"advantage_mean": 6.984919517782906e-10,
"advantage_min": -0.12371373269706964,
"advantage_std": 0.09401640691794455,
"completion_length": 2077.8125534057617,
"epoch": 0.2845714285714286,
"grad_norm": 0.017827067524194717,
"kl": 0.011096954345703125,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0095,
"reward": 0.1499801934696734,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0940164087805897,
"rewards/cosine_scaled_reward": 0.11817886866629124,
"rewards/format_reward": 0.6458333358168602,
"step": 249
},
{
"advantage_max": 0.17879820056259632,
"advantage_mean": -5.975986677730916e-09,
"advantage_min": -0.11353706941008568,
"advantage_std": 0.1170863639563322,
"completion_length": 2327.854202270508,
"epoch": 0.2857142857142857,
"grad_norm": 0.019272523000836372,
"kl": 0.01653289794921875,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0106,
"reward": 0.049585518427193165,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1170863676816225,
"rewards/cosine_scaled_reward": -0.15713005082216114,
"rewards/format_reward": 0.6041666697710752,
"step": 250
},
{
"advantage_max": 0.22082332614809275,
"advantage_mean": -1.7074247710135992e-09,
"advantage_min": -0.15717947948724031,
"advantage_std": 0.14891249779611826,
"completion_length": 1988.0625228881836,
"epoch": 0.28685714285714287,
"grad_norm": 0.022272994741797447,
"kl": 0.012928009033203125,
"learning_rate": 6.25045936022246e-07,
"loss": 0.004,
"reward": 0.10127391491550952,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14891249779611826,
"rewards/cosine_scaled_reward": -0.05721327941864729,
"rewards/format_reward": 0.708333345130086,
"step": 251
},
{
"advantage_max": 0.10648482386022806,
"advantage_mean": 1.6298146165993899e-09,
"advantage_min": -0.12054414115846157,
"advantage_std": 0.09566444996744394,
"completion_length": 2717.187511444092,
"epoch": 0.288,
"grad_norm": 0.016168739646673203,
"kl": 0.011434555053710938,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0024,
"reward": 0.03236317203845829,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09566445369273424,
"rewards/cosine_scaled_reward": -0.10126717574894428,
"rewards/format_reward": 0.3958333432674408,
"step": 252
},
{
"advantage_max": 0.1952264877036214,
"advantage_mean": -2.017865596837254e-09,
"advantage_min": -0.21492321323603392,
"advantage_std": 0.16749495640397072,
"completion_length": 2362.8750228881836,
"epoch": 0.28914285714285715,
"grad_norm": 0.02706560119986534,
"kl": 0.014007568359375,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0082,
"reward": 0.11930957529693842,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16749496012926102,
"rewards/cosine_scaled_reward": 0.03688877751119435,
"rewards/format_reward": 0.6250000149011612,
"step": 253
},
{
"advantage_max": 0.26456522569060326,
"advantage_mean": 1.0865430083439875e-09,
"advantage_min": -0.14143134467303753,
"advantage_std": 0.16227889992296696,
"completion_length": 3049.5000610351562,
"epoch": 0.29028571428571426,
"grad_norm": 0.03010343573987484,
"kl": 0.020702362060546875,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0114,
"reward": -0.001374326879158616,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16227890457957983,
"rewards/cosine_scaled_reward": -0.18095367995556444,
"rewards/format_reward": 0.3541666716337204,
"step": 254
},
{
"advantage_max": 0.18009355012327433,
"advantage_mean": -3.764095424241276e-09,
"advantage_min": -0.15930992551147938,
"advantage_std": 0.13817342184484005,
"completion_length": 2864.5833892822266,
"epoch": 0.2914285714285714,
"grad_norm": 0.023358209058642387,
"kl": 0.0135955810546875,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0038,
"reward": 0.0554531047528144,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13817342836409807,
"rewards/cosine_scaled_reward": -0.06686954014003277,
"rewards/format_reward": 0.45833334140479565,
"step": 255
},
{
"advantage_max": 0.13159022433683276,
"advantage_mean": -3.5700700312668587e-09,
"advantage_min": -0.16444099321961403,
"advantage_std": 0.11331630731001496,
"completion_length": 2758.104263305664,
"epoch": 0.2925714285714286,
"grad_norm": 0.02609197422862053,
"kl": 0.017688751220703125,
"learning_rate": 6.095153756157051e-07,
"loss": 0.0155,
"reward": 0.041455244878306985,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11331631150096655,
"rewards/cosine_scaled_reward": -0.08538339659571648,
"rewards/format_reward": 0.4166666753590107,
"step": 256
},
{
"advantage_max": 0.26734560914337635,
"advantage_mean": 2.949188213086096e-09,
"advantage_min": -0.17771659325808287,
"advantage_std": 0.18554198555648327,
"completion_length": 2963.2500534057617,
"epoch": 0.2937142857142857,
"grad_norm": 0.03234243020415306,
"kl": 0.01363372802734375,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0175,
"reward": 0.11527824534277897,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.18554198974743485,
"rewards/cosine_scaled_reward": 0.12133318744599819,
"rewards/format_reward": 0.4375000074505806,
"step": 257
},
{
"advantage_max": 0.14878903981298208,
"advantage_mean": -3.104408258236546e-10,
"advantage_min": -0.2111966870725155,
"advantage_std": 0.14870566432364285,
"completion_length": 2749.2708892822266,
"epoch": 0.2948571428571429,
"grad_norm": 0.025443458929657936,
"kl": 0.012126922607421875,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0117,
"reward": 0.09438971313647926,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14870566176250577,
"rewards/cosine_scaled_reward": -0.04469215031713247,
"rewards/format_reward": 0.6458333507180214,
"step": 258
},
{
"advantage_max": 0.1835308726876974,
"advantage_mean": 4.579002871318849e-09,
"advantage_min": -0.1416417476721108,
"advantage_std": 0.12419275566935539,
"completion_length": 2493.3541717529297,
"epoch": 0.296,
"grad_norm": 0.02159561589360237,
"kl": 0.016582489013671875,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0053,
"reward": 0.08983549298136495,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12419275799766183,
"rewards/cosine_scaled_reward": 0.003949735313653946,
"rewards/format_reward": 0.5208333376795053,
"step": 259
},
{
"advantage_max": 0.11710935411974788,
"advantage_mean": -3.7252903400952775e-09,
"advantage_min": -0.14876556862145662,
"advantage_std": 0.112224759766832,
"completion_length": 1975.4166831970215,
"epoch": 0.29714285714285715,
"grad_norm": 0.024187171831727028,
"kl": 0.012000083923339844,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0128,
"reward": 0.16194967506453395,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11222475813701749,
"rewards/cosine_scaled_reward": 0.20593629218637943,
"rewards/format_reward": 0.5416666716337204,
"step": 260
},
{
"advantage_max": 0.15087209455668926,
"advantage_mean": -7.761021686425451e-10,
"advantage_min": -0.10865057073533535,
"advantage_std": 0.1102964838501066,
"completion_length": 2710.812545776367,
"epoch": 0.29828571428571427,
"grad_norm": 0.022183962166309357,
"kl": 0.012647628784179688,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0111,
"reward": 0.00019670464098453522,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11029648408293724,
"rewards/cosine_scaled_reward": -0.22872492298483849,
"rewards/format_reward": 0.4583333395421505,
"step": 261
},
{
"advantage_max": 0.1102948416955769,
"advantage_mean": -1.552203920951456e-10,
"advantage_min": -0.1153049236163497,
"advantage_std": 0.09051254531368613,
"completion_length": 2378.937545776367,
"epoch": 0.29942857142857143,
"grad_norm": 0.015410098247230053,
"kl": 0.015895843505859375,
"learning_rate": 5.907846610890011e-07,
"loss": 0.007,
"reward": 0.02980024111457169,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09051254950463772,
"rewards/cosine_scaled_reward": -0.18379387632012367,
"rewards/format_reward": 0.541666679084301,
"step": 262
},
{
"advantage_max": 0.17358731664717197,
"advantage_mean": 1.552203920951456e-10,
"advantage_min": -0.12128621805459261,
"advantage_std": 0.1090443255379796,
"completion_length": 2539.7083892822266,
"epoch": 0.30057142857142854,
"grad_norm": 0.022145798429846764,
"kl": 0.014247894287109375,
"learning_rate": 5.87655029499542e-07,
"loss": 0.0043,
"reward": 0.02160137635655701,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10904432833194733,
"rewards/cosine_scaled_reward": -0.2081189528107643,
"rewards/format_reward": 0.5416666753590107,
"step": 263
},
{
"advantage_max": 0.20790673978626728,
"advantage_mean": -1.6298145194548752e-09,
"advantage_min": -0.1554710865020752,
"advantage_std": 0.14528044033795595,
"completion_length": 2800.1250915527344,
"epoch": 0.3017142857142857,
"grad_norm": 0.028843365609645844,
"kl": 0.01845550537109375,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0197,
"reward": 0.05024130782112479,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1452804459258914,
"rewards/cosine_scaled_reward": -0.0917367022484541,
"rewards/format_reward": 0.47916667349636555,
"step": 264
},
{
"advantage_max": 0.12884274497628212,
"advantage_mean": -8.071462553882469e-09,
"advantage_min": -0.17412229906767607,
"advantage_std": 0.12255425984039903,
"completion_length": 2051.02091217041,
"epoch": 0.3028571428571429,
"grad_norm": 0.02178996056318283,
"kl": 0.0155487060546875,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0136,
"reward": 0.12704905099235475,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12255426635965705,
"rewards/cosine_scaled_reward": 0.03132200799882412,
"rewards/format_reward": 0.6875000037252903,
"step": 265
},
{
"advantage_max": 0.19498047791421413,
"advantage_mean": 3.259629080543114e-09,
"advantage_min": -0.1322614224627614,
"advantage_std": 0.13702732603996992,
"completion_length": 2982.541732788086,
"epoch": 0.304,
"grad_norm": 0.027023041620850563,
"kl": 0.02156829833984375,
"learning_rate": 5.78255733788191e-07,
"loss": 0.0138,
"reward": -0.0014124545268714428,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1370273269712925,
"rewards/cosine_scaled_reward": -0.1706711007282138,
"rewards/format_reward": 0.3333333358168602,
"step": 266
},
{
"advantage_max": 0.146828792989254,
"advantage_mean": -1.396983993762202e-09,
"advantage_min": -0.15964362304657698,
"advantage_std": 0.12036533374339342,
"completion_length": 3103.791702270508,
"epoch": 0.30514285714285716,
"grad_norm": 0.02677420899271965,
"kl": 0.02091217041015625,
"learning_rate": 5.751196772469237e-07,
"loss": 0.0062,
"reward": 0.0031448822701349854,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.12036533374339342,
"rewards/cosine_scaled_reward": -0.1587185263633728,
"rewards/format_reward": 0.33333334140479565,
"step": 267
},
{
"advantage_max": 0.24769605975598097,
"advantage_mean": -3.2935835170277983e-09,
"advantage_min": -0.17003098130226135,
"advantage_std": 0.16715412167832255,
"completion_length": 2381.291717529297,
"epoch": 0.3062857142857143,
"grad_norm": 0.03521136939525604,
"kl": 0.022693634033203125,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0072,
"reward": 0.0530893302639015,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16715412959456444,
"rewards/cosine_scaled_reward": -0.1051311838964466,
"rewards/format_reward": 0.520833345130086,
"step": 268
},
{
"advantage_max": 0.19814197719097137,
"advantage_mean": -2.7755575615628914e-17,
"advantage_min": -0.1455021221190691,
"advantage_std": 0.1363191232085228,
"completion_length": 2688.7708740234375,
"epoch": 0.30742857142857144,
"grad_norm": 0.03205736353993416,
"kl": 0.01491546630859375,
"learning_rate": 5.688440441781398e-07,
"loss": 0.0024,
"reward": 0.04180408164393157,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1363191264681518,
"rewards/cosine_scaled_reward": -0.1398752792738378,
"rewards/format_reward": 0.5208333414047956,
"step": 269
},
{
"advantage_max": 0.28430189471691847,
"advantage_mean": -2.4059166048306935e-09,
"advantage_min": -0.22107769828289747,
"advantage_std": 0.20366744883358479,
"completion_length": 2568.7500762939453,
"epoch": 0.30857142857142855,
"grad_norm": 0.03902197256684303,
"kl": 0.01998138427734375,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0161,
"reward": 0.1241414062678814,
"reward_advantage_correlation": 1.0,
"reward_std": 0.20366745628416538,
"rewards/cosine_scaled_reward": 0.09125652257353067,
"rewards/format_reward": 0.5416666846722364,
"step": 270
},
{
"advantage_max": 0.20672860275954008,
"advantage_mean": -2.9491883102306105e-09,
"advantage_min": -0.2065953854471445,
"advantage_std": 0.16908213449642062,
"completion_length": 2227.3958587646484,
"epoch": 0.3097142857142857,
"grad_norm": 0.030723223462700844,
"kl": 0.013994216918945312,
"learning_rate": 5.625647374256061e-07,
"loss": 0.0136,
"reward": 0.10949450048792642,
"reward_advantage_correlation": 0.9999999999999997,
"reward_std": 0.16908214567229152,
"rewards/cosine_scaled_reward": -0.001862822100520134,
"rewards/format_reward": 0.6458333432674408,
"step": 271
},
{
"advantage_max": 0.23002553265541792,
"advantage_mean": -4.190951558014078e-09,
"advantage_min": -0.14808179438114166,
"advantage_std": 0.15896892128512263,
"completion_length": 2769.5834045410156,
"epoch": 0.31085714285714283,
"grad_norm": 0.05374065041542053,
"kl": 0.0186004638671875,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0201,
"reward": 0.03493615868501365,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.15896892687305808,
"rewards/cosine_scaled_reward": -0.10572225786745548,
"rewards/format_reward": 0.41666667349636555,
"step": 272
},
{
"advantage_max": 0.19195815734565258,
"advantage_mean": -6.36403778286887e-09,
"advantage_min": -0.1525148469954729,
"advantage_std": 0.14543341007083654,
"completion_length": 2723.666748046875,
"epoch": 0.312,
"grad_norm": 0.03333937004208565,
"kl": 0.016330718994140625,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0138,
"reward": 0.08220908371731639,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14543340960517526,
"rewards/cosine_scaled_reward": 0.04326418973505497,
"rewards/format_reward": 0.3958333358168602,
"step": 273
},
{
"advantage_max": 0.2558351717889309,
"advantage_mean": -6.519258133330652e-09,
"advantage_min": -0.22837742511183023,
"advantage_std": 0.19885483849793673,
"completion_length": 1787.9375343322754,
"epoch": 0.31314285714285717,
"grad_norm": 0.03228946030139923,
"kl": 0.013763427734375,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0107,
"reward": 0.17565890843980014,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.19885484222322702,
"rewards/cosine_scaled_reward": 0.14979170076549053,
"rewards/format_reward": 0.7291666753590107,
"step": 274
},
{
"advantage_max": 0.1555377847980708,
"advantage_mean": -9.662471971844111e-09,
"advantage_min": -0.13416611310094595,
"advantage_std": 0.1166462292894721,
"completion_length": 2052.4583740234375,
"epoch": 0.3142857142857143,
"grad_norm": 0.012692847289144993,
"kl": 0.015716552734375,
"learning_rate": 5.5e-07,
"loss": 0.0051,
"reward": 0.169587709242478,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11664623068645597,
"rewards/cosine_scaled_reward": 0.15611401526257396,
"rewards/format_reward": 0.6875,
"step": 275
},
{
"advantage_max": 0.22789240814745426,
"advantage_mean": -4.540197690028336e-09,
"advantage_min": -0.1707323812879622,
"advantage_std": 0.16234863363206387,
"completion_length": 2383.229202270508,
"epoch": 0.31542857142857145,
"grad_norm": 0.03404555842280388,
"kl": 0.023883819580078125,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0148,
"reward": 0.097462791018188,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1623486364260316,
"rewards/cosine_scaled_reward": 0.0030499575659632683,
"rewards/format_reward": 0.5625000055879354,
"step": 276
},
{
"advantage_max": 0.14935290860012174,
"advantage_mean": 2.755162625822649e-09,
"advantage_min": -0.14729766873642802,
"advantage_std": 0.11353026679717004,
"completion_length": 2168.8958740234375,
"epoch": 0.31657142857142856,
"grad_norm": 0.01725960150361061,
"kl": 0.020036697387695312,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0069,
"reward": 0.08052603248506784,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11353026377037168,
"rewards/cosine_scaled_reward": -0.0758397476747632,
"rewards/format_reward": 0.6250000018626451,
"step": 277
},
{
"advantage_max": 0.1460155351087451,
"advantage_mean": -8.459513443914712e-09,
"advantage_min": -0.15751561522483826,
"advantage_std": 0.11409496748819947,
"completion_length": 1924.4792175292969,
"epoch": 0.3177142857142857,
"grad_norm": 0.02192055620253086,
"kl": 0.01718902587890625,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0053,
"reward": 0.17414987459778786,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11409496935084462,
"rewards/cosine_scaled_reward": 0.07652450073510408,
"rewards/format_reward": 0.8541666828095913,
"step": 278
},
{
"advantage_max": 0.18112573074176908,
"advantage_mean": -2.32830644347759e-09,
"advantage_min": -0.1802171589806676,
"advantage_std": 0.1357805049046874,
"completion_length": 2853.2916870117188,
"epoch": 0.31885714285714284,
"grad_norm": 0.045709144324064255,
"kl": 0.02671051025390625,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0191,
"reward": 0.038704983657225966,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1357805086299777,
"rewards/cosine_scaled_reward": -0.09425987303256989,
"rewards/format_reward": 0.41666668094694614,
"step": 279
},
{
"advantage_max": 0.22648306377232075,
"advantage_mean": -3.2596291638098407e-09,
"advantage_min": -0.18116825073957443,
"advantage_std": 0.1603056015446782,
"completion_length": 2309.437545776367,
"epoch": 0.32,
"grad_norm": 0.02816803753376007,
"kl": 0.023956298828125,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0099,
"reward": 0.15197777177672833,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16030560433864594,
"rewards/cosine_scaled_reward": 0.12034583219792694,
"rewards/format_reward": 0.6458333414047956,
"step": 280
},
{
"advantage_max": 0.14990063337609172,
"advantage_mean": -7.761022102759085e-10,
"advantage_min": -0.10904566152021289,
"advantage_std": 0.10730710672214627,
"completion_length": 3093.2708435058594,
"epoch": 0.3211428571428571,
"grad_norm": 0.024048512801527977,
"kl": 0.019947052001953125,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0088,
"reward": -0.002999696182087064,
"reward_advantage_correlation": 1.0,
"reward_std": 0.107307109516114,
"rewards/cosine_scaled_reward": -0.17536162724718451,
"rewards/format_reward": 0.3333333432674408,
"step": 281
},
{
"advantage_max": 0.13856762740761042,
"advantage_mean": -2.7939678071131624e-09,
"advantage_min": -0.16603135224431753,
"advantage_std": 0.11764003010466695,
"completion_length": 2342.416732788086,
"epoch": 0.3222857142857143,
"grad_norm": 0.03633342310786247,
"kl": 0.0218658447265625,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0024,
"reward": 0.1459582296665758,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1176400319673121,
"rewards/cosine_scaled_reward": 0.1172967292368412,
"rewards/format_reward": 0.6250000037252903,
"step": 282
},
{
"advantage_max": 0.19806544668972492,
"advantage_mean": -2.4835269951672956e-09,
"advantage_min": -0.18965621013194323,
"advantage_std": 0.16816747142001987,
"completion_length": 2804.6459045410156,
"epoch": 0.32342857142857145,
"grad_norm": 0.04573136195540428,
"kl": 0.01639556884765625,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0092,
"reward": 0.13224803050979972,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16816747328266501,
"rewards/cosine_scaled_reward": 0.17202033032663167,
"rewards/format_reward": 0.4375000037252903,
"step": 283
},
{
"advantage_max": 0.18402406759560108,
"advantage_mean": -2.9491882894139287e-09,
"advantage_min": -0.22102447994984686,
"advantage_std": 0.15604595269542187,
"completion_length": 2228.729217529297,
"epoch": 0.32457142857142857,
"grad_norm": 0.03294950723648071,
"kl": 0.020061492919921875,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0181,
"reward": 0.10767308878712356,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15604595455806702,
"rewards/cosine_scaled_reward": 0.011314274743199348,
"rewards/format_reward": 0.6041666939854622,
"step": 284
},
{
"advantage_max": 0.16824073251336813,
"advantage_mean": -2.949188192269414e-09,
"advantage_min": -0.12993684597313404,
"advantage_std": 0.12016739509999752,
"completion_length": 1927.5208587646484,
"epoch": 0.32571428571428573,
"grad_norm": 0.02860177308320999,
"kl": 0.021915435791015625,
"learning_rate": 5.186095868151436e-07,
"loss": -0.0004,
"reward": 0.10571011027786881,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12016740208491683,
"rewards/cosine_scaled_reward": -0.06410545017570257,
"rewards/format_reward": 0.7500000055879354,
"step": 285
},
{
"advantage_max": 0.19652512576431036,
"advantage_mean": -2.561137010803627e-09,
"advantage_min": -0.18228960130363703,
"advantage_std": 0.15237156953662634,
"completion_length": 2139.7916946411133,
"epoch": 0.32685714285714285,
"grad_norm": 0.06984082609415054,
"kl": 0.0286865234375,
"learning_rate": 5.154764373429315e-07,
"loss": -0.0004,
"reward": 0.09307598043233156,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1523715741932392,
"rewards/cosine_scaled_reward": -0.043882093974389136,
"rewards/format_reward": 0.6250000093132257,
"step": 286
},
{
"advantage_max": 0.10403442522510886,
"advantage_mean": 2.0954757928848267e-09,
"advantage_min": -0.11645598197355866,
"advantage_std": 0.09277561539784074,
"completion_length": 1798.8958587646484,
"epoch": 0.328,
"grad_norm": 0.025246037170290947,
"kl": 0.018194198608398438,
"learning_rate": 5.123449705004581e-07,
"loss": 0.002,
"reward": 0.08277540933340788,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09277561772614717,
"rewards/cosine_scaled_reward": -0.07126312516629696,
"rewards/format_reward": 0.6250000055879354,
"step": 287
},
{
"advantage_max": 0.1896635489538312,
"advantage_mean": -1.552204503818544e-09,
"advantage_min": -0.14109544549137354,
"advantage_std": 0.12216217769309878,
"completion_length": 2708.500068664551,
"epoch": 0.3291428571428571,
"grad_norm": 0.036087606102228165,
"kl": 0.029327392578125,
"learning_rate": 5.09215338910999e-07,
"loss": 0.0044,
"reward": 0.031016689725220203,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12216217769309878,
"rewards/cosine_scaled_reward": -0.11878702905960381,
"rewards/format_reward": 0.41666666977107525,
"step": 288
},
{
"advantage_max": 0.1401263326406479,
"advantage_mean": 2.4447218693879336e-09,
"advantage_min": -0.14393609017133713,
"advantage_std": 0.1113440697081387,
"completion_length": 2145.3541679382324,
"epoch": 0.3302857142857143,
"grad_norm": 0.0327521413564682,
"kl": 0.02288055419921875,
"learning_rate": 5.060876951083828e-07,
"loss": 0.0016,
"reward": 0.06709581252653152,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11134407250210643,
"rewards/cosine_scaled_reward": -0.07541561592370272,
"rewards/format_reward": 0.5416666679084301,
"step": 289
},
{
"advantage_max": 0.16939363535493612,
"advantage_mean": -5.161079424942372e-09,
"advantage_min": -0.1828483436256647,
"advantage_std": 0.13901030272245407,
"completion_length": 1707.9583854675293,
"epoch": 0.3314285714285714,
"grad_norm": 0.05445309728384018,
"kl": 0.017580032348632812,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0129,
"reward": 0.1843718090094626,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1390103055164218,
"rewards/cosine_scaled_reward": 0.1699133664369583,
"rewards/format_reward": 0.7500000204890966,
"step": 290
},
{
"advantage_max": 0.25220514833927155,
"advantage_mean": -9.158005617737608e-09,
"advantage_min": -0.1723672477528453,
"advantage_std": 0.16362386476248503,
"completion_length": 1981.354248046875,
"epoch": 0.3325714285714286,
"grad_norm": 0.061871450394392014,
"kl": 0.0141448974609375,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0159,
"reward": 0.09107551211491227,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16362386848777533,
"rewards/cosine_scaled_reward": -0.07718436582945287,
"rewards/format_reward": 0.687500013038516,
"step": 291
},
{
"advantage_max": 0.16672541294246912,
"advantage_mean": -1.3193737213867962e-09,
"advantage_min": -0.12999323196709156,
"advantage_std": 0.11027388600632548,
"completion_length": 2551.8750228881836,
"epoch": 0.33371428571428574,
"grad_norm": 0.02972051128745079,
"kl": 0.026575088500976562,
"learning_rate": 4.967182142620745e-07,
"loss": 0.0011,
"reward": 0.02736746583832428,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11027389112859964,
"rewards/cosine_scaled_reward": -0.17034983914345503,
"rewards/format_reward": 0.5000000111758709,
"step": 292
},
{
"advantage_max": 0.1079831887036562,
"advantage_mean": -1.9402554424230445e-09,
"advantage_min": -0.12870269175618887,
"advantage_std": 0.09040121687576175,
"completion_length": 1931.1250839233398,
"epoch": 0.33485714285714285,
"grad_norm": 0.016690244898200035,
"kl": 0.03152656555175781,
"learning_rate": 4.93600044896063e-07,
"loss": 0.0073,
"reward": 0.10857149804360233,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09040121641010046,
"rewards/cosine_scaled_reward": -0.013043075799942017,
"rewards/format_reward": 0.6666666772216558,
"step": 293
},
{
"advantage_max": 0.14342291373759508,
"advantage_mean": -5.551115123125783e-17,
"advantage_min": -0.17630695085972548,
"advantage_std": 0.11805281089618802,
"completion_length": 2847.229217529297,
"epoch": 0.336,
"grad_norm": 0.04706273600459099,
"kl": 0.028173446655273438,
"learning_rate": 4.904846243842949e-07,
"loss": 0.0118,
"reward": 0.08359809592366219,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11805281648412347,
"rewards/cosine_scaled_reward": 0.04589838907122612,
"rewards/format_reward": 0.39583334513008595,
"step": 294
},
{
"advantage_max": 0.10511154402047396,
"advantage_mean": 1.9790605126912553e-09,
"advantage_min": -0.10564734903164208,
"advantage_std": 0.08633032138459384,
"completion_length": 3080.7291870117188,
"epoch": 0.33714285714285713,
"grad_norm": 0.05320084095001221,
"kl": 0.040599822998046875,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0015,
"reward": 0.02853070362471044,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08633032557554543,
"rewards/cosine_scaled_reward": -0.05254795402288437,
"rewards/format_reward": 0.2708333395421505,
"step": 295
},
{
"advantage_max": 0.2000164007768035,
"advantage_mean": 9.313225884932663e-10,
"advantage_min": -0.16814672946929932,
"advantage_std": 0.14298693323507905,
"completion_length": 3020.104217529297,
"epoch": 0.3382857142857143,
"grad_norm": 0.06135251745581627,
"kl": 0.0340576171875,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0194,
"reward": 0.02324377093464136,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14298693649470806,
"rewards/cosine_scaled_reward": -0.10881040431559086,
"rewards/format_reward": 0.35416667722165585,
"step": 296
},
{
"advantage_max": 0.14637420466169715,
"advantage_mean": 2.9491883241083983e-09,
"advantage_min": -0.11458162683993578,
"advantage_std": 0.09986498206853867,
"completion_length": 3078.687545776367,
"epoch": 0.3394285714285714,
"grad_norm": 0.027566736564040184,
"kl": 0.0302581787109375,
"learning_rate": 4.811563736721829e-07,
"loss": 0.011,
"reward": -0.021376774879172444,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09986498579382896,
"rewards/cosine_scaled_reward": -0.13615123182535172,
"rewards/format_reward": 0.1458333395421505,
"step": 297
},
{
"advantage_max": 0.171283058822155,
"advantage_mean": -4.656613150633149e-10,
"advantage_min": -0.10321410931646824,
"advantage_std": 0.10990889801178128,
"completion_length": 2413.7083892822266,
"epoch": 0.3405714285714286,
"grad_norm": 0.03488784655928612,
"kl": 0.023456573486328125,
"learning_rate": 4.780534655386743e-07,
"loss": 0.012,
"reward": 0.07853745546890423,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.10990889696404338,
"rewards/cosine_scaled_reward": -0.02953695846372284,
"rewards/format_reward": 0.5208333469927311,
"step": 298
},
{
"advantage_max": 0.1128107258118689,
"advantage_mean": -3.065603638996439e-09,
"advantage_min": -0.1558314487338066,
"advantage_std": 0.11369897332042456,
"completion_length": 2749.104202270508,
"epoch": 0.3417142857142857,
"grad_norm": 0.030868861824274063,
"kl": 0.029834747314453125,
"learning_rate": 4.749540639777539e-07,
"loss": 0.004,
"reward": 0.09189531486481428,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.113698975648731,
"rewards/cosine_scaled_reward": 0.06086218822747469,
"rewards/format_reward": 0.4166666716337204,
"step": 299
},
{
"advantage_max": 0.1933749821037054,
"advantage_mean": 5.432715666220389e-10,
"advantage_min": -0.1367768244817853,
"advantage_std": 0.1271583898924291,
"completion_length": 3239.916717529297,
"epoch": 0.34285714285714286,
"grad_norm": 0.03370937705039978,
"kl": 0.04051971435546875,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0131,
"reward": -0.02385653683450073,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12715838942676783,
"rewards/cosine_scaled_reward": -0.20604670932516456,
"rewards/format_reward": 0.2708333395421505,
"step": 300
},
{
"advantage_max": 0.17281748075038195,
"advantage_mean": -1.9887617525027323e-09,
"advantage_min": -0.14648894406855106,
"advantage_std": 0.12490739766508341,
"completion_length": 2306.6875610351562,
"epoch": 0.344,
"grad_norm": 0.06356479972600937,
"kl": 0.030529022216796875,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0186,
"reward": 0.06374910019803792,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.12490739580243826,
"rewards/cosine_scaled_reward": -0.12478701584041119,
"rewards/format_reward": 0.6250000055879354,
"step": 301
},
{
"advantage_max": 0.20410604868084192,
"advantage_mean": 6.208817071584605e-10,
"advantage_min": -0.11600066442042589,
"advantage_std": 0.12798969075083733,
"completion_length": 2438.166702270508,
"epoch": 0.34514285714285714,
"grad_norm": 0.050883322954177856,
"kl": 0.026035308837890625,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0184,
"reward": 0.04835976893082261,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12798968655988574,
"rewards/cosine_scaled_reward": -0.059276397922076285,
"rewards/format_reward": 0.3958333395421505,
"step": 302
},
{
"advantage_max": 0.1770703885704279,
"advantage_mean": -5.122274340796373e-09,
"advantage_min": -0.1827222853899002,
"advantage_std": 0.1458009947091341,
"completion_length": 1774.208366394043,
"epoch": 0.3462857142857143,
"grad_norm": 0.062340594828128815,
"kl": 0.02555084228515625,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.0184,
"reward": 0.10038578975945711,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14580100309103727,
"rewards/cosine_scaled_reward": -0.06850272964220494,
"rewards/format_reward": 0.7291666865348816,
"step": 303
},
{
"advantage_max": 0.21447939984500408,
"advantage_mean": -3.2596290527875382e-09,
"advantage_min": -0.15638948930427432,
"advantage_std": 0.13950767274945974,
"completion_length": 2737.937545776367,
"epoch": 0.3474285714285714,
"grad_norm": 0.03145613148808479,
"kl": 0.0426483154296875,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0069,
"reward": 0.06072054826654494,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13950766902416945,
"rewards/cosine_scaled_reward": -0.07232037000358105,
"rewards/format_reward": 0.5000000149011612,
"step": 304
},
{
"advantage_max": 0.16362837608903646,
"advantage_mean": 2.0178656107150417e-09,
"advantage_min": -0.1397431530058384,
"advantage_std": 0.11684367200359702,
"completion_length": 2308.729202270508,
"epoch": 0.3485714285714286,
"grad_norm": 0.02347092516720295,
"kl": 0.024188995361328125,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0055,
"reward": 0.04415482934564352,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1168436761945486,
"rewards/cosine_scaled_reward": -0.12959892745129764,
"rewards/format_reward": 0.5208333488553762,
"step": 305
},
{
"advantage_max": 0.1916728913784027,
"advantage_mean": -1.3969838202898543e-09,
"advantage_min": -0.12143237423151731,
"advantage_std": 0.12374015152454376,
"completion_length": 2760.2500610351562,
"epoch": 0.3497142857142857,
"grad_norm": 0.05556763336062431,
"kl": 0.0508575439453125,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0157,
"reward": 0.04871231457218528,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12374015431851149,
"rewards/cosine_scaled_reward": -0.05752767622470856,
"rewards/format_reward": 0.3958333358168602,
"step": 306
},
{
"advantage_max": 0.22683174721896648,
"advantage_mean": -3.570070017389071e-09,
"advantage_min": -0.19085029885172844,
"advantage_std": 0.16714101657271385,
"completion_length": 2140.104217529297,
"epoch": 0.35085714285714287,
"grad_norm": 0.04561980441212654,
"kl": 0.0324554443359375,
"learning_rate": 4.503031760712397e-07,
"loss": 0.0103,
"reward": 0.10696669295430183,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1671410221606493,
"rewards/cosine_scaled_reward": 0.03263398795388639,
"rewards/format_reward": 0.5625000111758709,
"step": 307
},
{
"advantage_max": 0.17833841359242797,
"advantage_mean": -1.5522043372850902e-09,
"advantage_min": -0.1457248479127884,
"advantage_std": 0.1348523572087288,
"completion_length": 3109.5208740234375,
"epoch": 0.352,
"grad_norm": 0.038504425436258316,
"kl": 0.0427093505859375,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.0046,
"reward": 0.03145545581355691,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13485235767439008,
"rewards/cosine_scaled_reward": -0.07493541622534394,
"rewards/format_reward": 0.3333333395421505,
"step": 308
},
{
"advantage_max": 0.22489432198926806,
"advantage_mean": -2.1730860721991263e-09,
"advantage_min": -0.17644386645406485,
"advantage_std": 0.16854400280863047,
"completion_length": 2918.7083892822266,
"epoch": 0.35314285714285715,
"grad_norm": 0.05082552134990692,
"kl": 0.040985107421875,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0018,
"reward": 0.06658328603953123,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.16854400653392076,
"rewards/cosine_scaled_reward": -0.033274039917159826,
"rewards/format_reward": 0.4583333432674408,
"step": 309
},
{
"advantage_max": 0.14540826249867678,
"advantage_mean": -1.2417634420724966e-09,
"advantage_min": -0.10229035327211022,
"advantage_std": 0.09198084427043796,
"completion_length": 2406.8750228881836,
"epoch": 0.35428571428571426,
"grad_norm": 0.04952380806207657,
"kl": 0.0579376220703125,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0055,
"reward": 0.02953993622213602,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09198084473609924,
"rewards/cosine_scaled_reward": -0.18812143243849277,
"rewards/format_reward": 0.5416666679084301,
"step": 310
},
{
"advantage_max": 0.2078695334494114,
"advantage_mean": -4.8118333900726284e-09,
"advantage_min": -0.1414637891575694,
"advantage_std": 0.13211291236802936,
"completion_length": 2218.9792251586914,
"epoch": 0.3554285714285714,
"grad_norm": 0.0518975704908371,
"kl": 0.038421630859375,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.0172,
"reward": 0.09966501512099057,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13211291562765837,
"rewards/cosine_scaled_reward": 0.021480013616383076,
"rewards/format_reward": 0.541666679084301,
"step": 311
},
{
"advantage_max": 0.11580835562199354,
"advantage_mean": -3.0267983952558808e-09,
"advantage_min": -0.13315439969301224,
"advantage_std": 0.09709975449368358,
"completion_length": 2090.3750381469727,
"epoch": 0.3565714285714286,
"grad_norm": 0.05046186223626137,
"kl": 0.0472564697265625,
"learning_rate": 4.350494089288943e-07,
"loss": 0.0035,
"reward": 0.12841306265909225,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09709975705482066,
"rewards/cosine_scaled_reward": 0.11523362691514194,
"rewards/format_reward": 0.5208333488553762,
"step": 312
},
{
"advantage_max": 0.13410293869674206,
"advantage_mean": -7.062529679136009e-09,
"advantage_min": -0.15682349167764187,
"advantage_std": 0.11782865854911506,
"completion_length": 2920.4167289733887,
"epoch": 0.3577142857142857,
"grad_norm": 0.05611386522650719,
"kl": 0.042430877685546875,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0011,
"reward": 0.10145594039931893,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11782865924760699,
"rewards/cosine_scaled_reward": 0.10086375288665295,
"rewards/format_reward": 0.39583333767950535,
"step": 313
},
{
"advantage_max": 0.14438471477478743,
"advantage_mean": -3.1044086745701804e-09,
"advantage_min": -0.17650274466723204,
"advantage_std": 0.11929162684828043,
"completion_length": 1980.0417137145996,
"epoch": 0.3588571428571429,
"grad_norm": 0.07130403071641922,
"kl": 0.03459930419921875,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0125,
"reward": 0.12104845186695457,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11929162871092558,
"rewards/cosine_scaled_reward": 0.010619329288601875,
"rewards/format_reward": 0.687500013038516,
"step": 314
},
{
"advantage_max": 0.13581964280456305,
"advantage_mean": 3.1044089521259366e-10,
"advantage_min": -0.11171440966427326,
"advantage_std": 0.1006471742875874,
"completion_length": 2960.916732788086,
"epoch": 0.36,
"grad_norm": 0.07105167210102081,
"kl": 0.06281280517578125,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0045,
"reward": 0.10702863708138466,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10064717940986156,
"rewards/cosine_scaled_reward": 0.07692010141909122,
"rewards/format_reward": 0.47916666977107525,
"step": 315
},
{
"advantage_max": 0.22751855989918113,
"advantage_mean": -7.761023768093622e-11,
"advantage_min": -0.15471346024423838,
"advantage_std": 0.15318563301116228,
"completion_length": 2986.5834350585938,
"epoch": 0.36114285714285715,
"grad_norm": 0.08785312622785568,
"kl": 0.0533447265625,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0193,
"reward": -0.0024650731356814504,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.15318563859909773,
"rewards/cosine_scaled_reward": -0.14366810489445925,
"rewards/format_reward": 0.2708333395421505,
"step": 316
},
{
"advantage_max": 0.20047110598534346,
"advantage_mean": -5.122274229774071e-09,
"advantage_min": -0.1447579087689519,
"advantage_std": 0.13735897513106465,
"completion_length": 2916.2917404174805,
"epoch": 0.36228571428571427,
"grad_norm": 0.034367047250270844,
"kl": 0.05880928039550781,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0085,
"reward": 0.03647155943326652,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13735897792503238,
"rewards/cosine_scaled_reward": -0.07012255070731044,
"rewards/format_reward": 0.35416667349636555,
"step": 317
},
{
"advantage_max": 0.1579853631556034,
"advantage_mean": -1.5522041985072121e-10,
"advantage_min": -0.11961670313030481,
"advantage_std": 0.10564623354002833,
"completion_length": 2064.000011444092,
"epoch": 0.36342857142857143,
"grad_norm": 0.04541625827550888,
"kl": 0.0489959716796875,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.0173,
"reward": 0.04783309390768409,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10564623679965734,
"rewards/cosine_scaled_reward": -0.20319920778274536,
"rewards/format_reward": 0.6875000111758709,
"step": 318
},
{
"advantage_max": 0.1397206410765648,
"advantage_mean": 1.5522041985072121e-10,
"advantage_min": -0.11532348208129406,
"advantage_std": 0.10406744293868542,
"completion_length": 2528.000045776367,
"epoch": 0.36457142857142855,
"grad_norm": 0.055004946887493134,
"kl": 0.05081939697265625,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0142,
"reward": 0.02235355321317911,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10406744759529829,
"rewards/cosine_scaled_reward": -0.1417781561613083,
"rewards/format_reward": 0.41666666977107525,
"step": 319
},
{
"advantage_max": 0.1373197054490447,
"advantage_mean": -2.4059166568723978e-09,
"advantage_min": -0.09656485263258219,
"advantage_std": 0.09253839054144919,
"completion_length": 2120.916732788086,
"epoch": 0.3657142857142857,
"grad_norm": 0.08500746637582779,
"kl": 0.058597564697265625,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0016,
"reward": 0.1001118189888075,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09253839147277176,
"rewards/cosine_scaled_reward": -0.06903557199984789,
"rewards/format_reward": 0.7291666679084301,
"step": 320
},
{
"advantage_max": 0.18437588680535555,
"advantage_mean": -7.140139798855749e-09,
"advantage_min": -0.1534122722223401,
"advantage_std": 0.13574167201295495,
"completion_length": 1700.7292175292969,
"epoch": 0.3668571428571429,
"grad_norm": 0.059033576399087906,
"kl": 0.0357208251953125,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0032,
"reward": 0.1664855630369857,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1357416776008904,
"rewards/cosine_scaled_reward": 0.10123182460665703,
"rewards/format_reward": 0.7708333414047956,
"step": 321
},
{
"advantage_max": 0.20797332702204585,
"advantage_mean": 1.9014503027658947e-09,
"advantage_min": -0.13548879977315664,
"advantage_std": 0.145243885461241,
"completion_length": 2375.0833892822266,
"epoch": 0.368,
"grad_norm": 0.058852821588516235,
"kl": 0.062713623046875,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.01,
"reward": 0.014190776884788647,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14524389104917645,
"rewards/cosine_scaled_reward": -0.16725354408845305,
"rewards/format_reward": 0.4166666716337204,
"step": 322
},
{
"advantage_max": 0.1604612385854125,
"advantage_mean": -2.949188129819369e-09,
"advantage_min": -0.152086915448308,
"advantage_std": 0.12315284926444292,
"completion_length": 2394.291717529297,
"epoch": 0.36914285714285716,
"grad_norm": 0.03161586821079254,
"kl": 0.04833984375,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0084,
"reward": 0.03113732289057225,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12315285205841064,
"rewards/cosine_scaled_reward": -0.1679130750708282,
"rewards/format_reward": 0.5208333544433117,
"step": 323
},
{
"advantage_max": 0.1508808215148747,
"advantage_mean": 1.3877787807814457e-17,
"advantage_min": -0.12193193286657333,
"advantage_std": 0.10500428173691034,
"completion_length": 2905.0209045410156,
"epoch": 0.3702857142857143,
"grad_norm": 0.04525084048509598,
"kl": 0.064605712890625,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0093,
"reward": -0.02125760749913752,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10500428825616837,
"rewards/cosine_scaled_reward": -0.1961777014657855,
"rewards/format_reward": 0.2708333358168602,
"step": 324
},
{
"advantage_max": 0.1821247013285756,
"advantage_mean": -3.725290437239792e-09,
"advantage_min": -0.20376018062233925,
"advantage_std": 0.1471891412511468,
"completion_length": 2759.437515258789,
"epoch": 0.37142857142857144,
"grad_norm": 0.06518473476171494,
"kl": 0.0437469482421875,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0159,
"reward": 0.0820797230117023,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14718915149569511,
"rewards/cosine_scaled_reward": 0.03319347696378827,
"rewards/format_reward": 0.41666668094694614,
"step": 325
},
{
"advantage_max": 0.16853288700804114,
"advantage_mean": -8.53712503467996e-10,
"advantage_min": -0.1664345981553197,
"advantage_std": 0.1293939589522779,
"completion_length": 2176.375045776367,
"epoch": 0.37257142857142855,
"grad_norm": 0.050375796854496,
"kl": 0.0517578125,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0125,
"reward": 0.07511803903616965,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1293939701281488,
"rewards/cosine_scaled_reward": -0.030897012911736965,
"rewards/format_reward": 0.5000000167638063,
"step": 326
},
{
"advantage_max": 0.1837756261229515,
"advantage_mean": -3.259629136054265e-09,
"advantage_min": -0.18448743131011724,
"advantage_std": 0.14881908521056175,
"completion_length": 2198.8333740234375,
"epoch": 0.3737142857142857,
"grad_norm": 0.08379311859607697,
"kl": 0.03250885009765625,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0077,
"reward": 0.14384737284854054,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14881909638643265,
"rewards/cosine_scaled_reward": 0.10746606485918164,
"rewards/format_reward": 0.625,
"step": 327
},
{
"advantage_max": 0.16970978770405054,
"advantage_mean": 2.9103831011845216e-09,
"advantage_min": -0.10629498213529587,
"advantage_std": 0.1124201756902039,
"completion_length": 2977.7291870117188,
"epoch": 0.37485714285714283,
"grad_norm": 0.04713457077741623,
"kl": 0.0502471923828125,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0091,
"reward": -0.021038700826466084,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.11242017382755876,
"rewards/cosine_scaled_reward": -0.20708716148510575,
"rewards/format_reward": 0.29166667722165585,
"step": 328
},
{
"advantage_max": 0.15668292297050357,
"advantage_mean": -7.295360440751075e-09,
"advantage_min": -0.20917884726077318,
"advantage_std": 0.15731059899553657,
"completion_length": 1727.7916851043701,
"epoch": 0.376,
"grad_norm": 0.07096515595912933,
"kl": 0.0440673828125,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0088,
"reward": 0.15083087398670614,
"reward_advantage_correlation": 1.0,
"reward_std": 0.157310601323843,
"rewards/cosine_scaled_reward": 0.10906307026743889,
"rewards/format_reward": 0.6666666734963655,
"step": 329
},
{
"advantage_max": 0.16370283998548985,
"advantage_mean": 2.5611371773370806e-09,
"advantage_min": -0.13650465942919254,
"advantage_std": 0.11045987298712134,
"completion_length": 2315.125045776367,
"epoch": 0.37714285714285717,
"grad_norm": 0.10308735072612762,
"kl": 0.056667327880859375,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0108,
"reward": 0.042721322970464826,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11045988043770194,
"rewards/cosine_scaled_reward": -0.16713772248476744,
"rewards/format_reward": 0.5833333395421505,
"step": 330
},
{
"advantage_max": 0.1993811996653676,
"advantage_mean": -1.8626451908643205e-09,
"advantage_min": -0.11707951128482819,
"advantage_std": 0.11946522584185004,
"completion_length": 2759.4166946411133,
"epoch": 0.3782857142857143,
"grad_norm": 0.06283679604530334,
"kl": 0.05450439453125,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0064,
"reward": -0.007956791669130325,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11946522584185004,
"rewards/cosine_scaled_reward": -0.17038972454611212,
"rewards/format_reward": 0.2916666679084301,
"step": 331
},
{
"advantage_max": 0.1879209829494357,
"advantage_mean": -7.761021547647573e-10,
"advantage_min": -0.1391853764653206,
"advantage_std": 0.13691260432824492,
"completion_length": 2184.2083740234375,
"epoch": 0.37942857142857145,
"grad_norm": 0.08008905500173569,
"kl": 0.04682159423828125,
"learning_rate": 3.7561798609655373e-07,
"loss": -0.0015,
"reward": 0.0423368806950748,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13691260758787394,
"rewards/cosine_scaled_reward": -0.14775002468377352,
"rewards/format_reward": 0.5416666734963655,
"step": 332
},
{
"advantage_max": 0.15720007987692952,
"advantage_mean": -1.7850349878778538e-09,
"advantage_min": -0.18071353994309902,
"advantage_std": 0.13339901249855757,
"completion_length": 2106.9792137145996,
"epoch": 0.38057142857142856,
"grad_norm": 0.03596104308962822,
"kl": 0.0339202880859375,
"learning_rate": 3.72726140684072e-07,
"loss": 0.0019,
"reward": 0.12304100673645735,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13339901715517044,
"rewards/cosine_scaled_reward": -0.006082692489144392,
"rewards/format_reward": 0.729166679084301,
"step": 333
},
{
"advantage_max": 0.14941539708524942,
"advantage_mean": 4.656613011855271e-10,
"advantage_min": -0.1259508691728115,
"advantage_std": 0.10819857241585851,
"completion_length": 3429.9375610351562,
"epoch": 0.38171428571428573,
"grad_norm": 0.06314020603895187,
"kl": 0.057037353515625,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0079,
"reward": -0.027071162359789014,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10819857893511653,
"rewards/cosine_scaled_reward": -0.16390738973859698,
"rewards/format_reward": 0.16666667349636555,
"step": 334
},
{
"advantage_max": 0.09907219745218754,
"advantage_mean": -1.1486311984887365e-08,
"advantage_min": -0.15171983744949102,
"advantage_std": 0.09425494028255343,
"completion_length": 2277.7500381469727,
"epoch": 0.38285714285714284,
"grad_norm": 0.038017332553863525,
"kl": 0.035099029541015625,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0047,
"reward": 0.14433281571837142,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09425493981689215,
"rewards/cosine_scaled_reward": 0.12261204607784748,
"rewards/format_reward": 0.6041666679084301,
"step": 335
},
{
"advantage_max": 0.21837344765663147,
"advantage_mean": -6.053596901534064e-09,
"advantage_min": -0.17420747876167297,
"advantage_std": 0.1544438600540161,
"completion_length": 2266.6458892822266,
"epoch": 0.384,
"grad_norm": 0.0935477614402771,
"kl": 0.033916473388671875,
"learning_rate": 3.641030065789562e-07,
"loss": 0.0221,
"reward": 0.11615504696965218,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15444386564195156,
"rewards/cosine_scaled_reward": 0.10184320248663425,
"rewards/format_reward": 0.47916667722165585,
"step": 336
},
{
"advantage_max": 0.19050366338342428,
"advantage_mean": -3.123811228786244e-09,
"advantage_min": -0.19180483371019363,
"advantage_std": 0.1521349996328354,
"completion_length": 2244.2501068115234,
"epoch": 0.3851428571428571,
"grad_norm": 0.11284514516592026,
"kl": 0.034465789794921875,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0136,
"reward": 0.12575469084549695,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1521350033581257,
"rewards/cosine_scaled_reward": 0.005518501624464989,
"rewards/format_reward": 0.7291666828095913,
"step": 337
},
{
"advantage_max": 0.19977085664868355,
"advantage_mean": -2.7755575615628914e-17,
"advantage_min": -0.1505418843589723,
"advantage_std": 0.14051949698477983,
"completion_length": 1987.5000457763672,
"epoch": 0.3862857142857143,
"grad_norm": 0.03124089166522026,
"kl": 0.03697967529296875,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.005,
"reward": 0.13385723589453846,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.140519502107054,
"rewards/cosine_scaled_reward": 0.008122519589960575,
"rewards/format_reward": 0.7708333488553762,
"step": 338
},
{
"advantage_max": 0.11336069833487272,
"advantage_mean": 1.0089328331130965e-09,
"advantage_min": -0.10100190062075853,
"advantage_std": 0.0796776907518506,
"completion_length": 3045.750030517578,
"epoch": 0.38742857142857146,
"grad_norm": 0.03193638101220131,
"kl": 0.044342041015625,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0068,
"reward": -0.012824157951399684,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07967769308015704,
"rewards/cosine_scaled_reward": -0.1936829062178731,
"rewards/format_reward": 0.31250000558793545,
"step": 339
},
{
"advantage_max": 0.20058695320039988,
"advantage_mean": -4.113341493805489e-09,
"advantage_min": -0.18251327332109213,
"advantage_std": 0.15282507333904505,
"completion_length": 2754.979248046875,
"epoch": 0.38857142857142857,
"grad_norm": 0.07920674979686737,
"kl": 0.037872314453125,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0156,
"reward": 0.09624811878893524,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15282507613301277,
"rewards/cosine_scaled_reward": 0.020154272206127644,
"rewards/format_reward": 0.5208333414047956,
"step": 340
},
{
"advantage_max": 0.13869278971105814,
"advantage_mean": -1.4435500156340098e-08,
"advantage_min": -0.15075519727542996,
"advantage_std": 0.12164947250857949,
"completion_length": 2472.0833892822266,
"epoch": 0.38971428571428574,
"grad_norm": 0.040600065141916275,
"kl": 0.038482666015625,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0081,
"reward": 0.18206297140568495,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12164947902783751,
"rewards/cosine_scaled_reward": 0.19754501432180405,
"rewards/format_reward": 0.6666666716337204,
"step": 341
},
{
"advantage_max": 0.18315967917442322,
"advantage_mean": -4.6566127342995145e-10,
"advantage_min": -0.16725237760692835,
"advantage_std": 0.14502713968977332,
"completion_length": 2800.604217529297,
"epoch": 0.39085714285714285,
"grad_norm": 0.06095492094755173,
"kl": 0.05242919921875,
"learning_rate": 3.471051066897562e-07,
"loss": 0.0124,
"reward": 0.03126559848897159,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14502713968977332,
"rewards/cosine_scaled_reward": -0.09631808660924435,
"rewards/format_reward": 0.3750000074505806,
"step": 342
},
{
"advantage_max": 0.22337682452052832,
"advantage_mean": -8.614733849887646e-09,
"advantage_min": -0.18603284191340208,
"advantage_std": 0.16502254595980048,
"completion_length": 2473.2708892822266,
"epoch": 0.392,
"grad_norm": 0.03511941805481911,
"kl": 0.03363037109375,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.0078,
"reward": 0.13036981271579862,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16502255713567138,
"rewards/cosine_scaled_reward": 0.06001020688563585,
"rewards/format_reward": 0.6458333469927311,
"step": 343
},
{
"advantage_max": 0.1367392516694963,
"advantage_mean": -6.51925814720844e-09,
"advantage_min": -0.19531613495200872,
"advantage_std": 0.12819127598777413,
"completion_length": 2279.270881652832,
"epoch": 0.3931428571428571,
"grad_norm": 0.04424729198217392,
"kl": 0.026615142822265625,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.013,
"reward": 0.16871812019962817,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12819127459079027,
"rewards/cosine_scaled_reward": 0.18393309600651264,
"rewards/format_reward": 0.6250000093132257,
"step": 344
},
{
"advantage_max": 0.13400619849562645,
"advantage_mean": 3.88051125954636e-10,
"advantage_min": -0.12482567969709635,
"advantage_std": 0.09939198894426227,
"completion_length": 2628.2709045410156,
"epoch": 0.3942857142857143,
"grad_norm": 0.055625367909669876,
"kl": 0.042591094970703125,
"learning_rate": 3.387377967463493e-07,
"loss": 0.004,
"reward": 0.04211104451678693,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09939198987558484,
"rewards/cosine_scaled_reward": -0.09614112600684166,
"rewards/format_reward": 0.43750000558793545,
"step": 345
},
{
"advantage_max": 0.09226818988099694,
"advantage_mean": -5.587935725248627e-09,
"advantage_min": -0.09677781723439693,
"advantage_std": 0.07406530145090073,
"completion_length": 2657.937545776367,
"epoch": 0.3954285714285714,
"grad_norm": 0.037199534475803375,
"kl": 0.04526519775390625,
"learning_rate": 3.359691059183761e-07,
"loss": 0.007,
"reward": 0.07278500194661319,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.0740653061075136,
"rewards/cosine_scaled_reward": -0.046637315303087234,
"rewards/format_reward": 0.5208333414047956,
"step": 346
},
{
"advantage_max": 0.190785126760602,
"advantage_mean": -1.396983917434369e-09,
"advantage_min": -0.13941976055502892,
"advantage_std": 0.137190165463835,
"completion_length": 3063.8750610351562,
"epoch": 0.3965714285714286,
"grad_norm": 0.052545636892318726,
"kl": 0.04752349853515625,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0178,
"reward": 0.01757329748943448,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13719017151743174,
"rewards/cosine_scaled_reward": -0.14757848158478737,
"rewards/format_reward": 0.39583334140479565,
"step": 347
},
{
"advantage_max": 0.13676777854561806,
"advantage_mean": -9.701277940699082e-10,
"advantage_min": -0.13411255180835724,
"advantage_std": 0.10507096443325281,
"completion_length": 2446.2083740234375,
"epoch": 0.3977142857142857,
"grad_norm": 0.03185461834073067,
"kl": 0.04033660888671875,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0118,
"reward": 0.017525036178994924,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10507096629589796,
"rewards/cosine_scaled_reward": -0.1783681306988001,
"rewards/format_reward": 0.45833333767950535,
"step": 348
},
{
"advantage_max": 0.12805233104154468,
"advantage_mean": -5.820766091346741e-10,
"advantage_min": -0.13923298381268978,
"advantage_std": 0.10560231888666749,
"completion_length": 2235.6458892822266,
"epoch": 0.39885714285714285,
"grad_norm": 0.08265335857868195,
"kl": 0.0382232666015625,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0176,
"reward": 0.107158649538178,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10560232121497393,
"rewards/cosine_scaled_reward": 0.03427761234343052,
"rewards/format_reward": 0.5625000037252903,
"step": 349
},
{
"advantage_max": 0.1867425860837102,
"advantage_mean": -2.095475848395978e-09,
"advantage_min": -0.13708714861422777,
"advantage_std": 0.1264267978258431,
"completion_length": 1768.7500762939453,
"epoch": 0.4,
"grad_norm": 0.054677292704582214,
"kl": 0.02910614013671875,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0072,
"reward": 0.09746129438281059,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12642680434510112,
"rewards/cosine_scaled_reward": -0.1304661799222231,
"rewards/format_reward": 0.8333333395421505,
"step": 350
},
{
"advantage_max": 0.16754774982109666,
"advantage_mean": 6.984919309616089e-10,
"advantage_min": -0.17636614479124546,
"advantage_std": 0.1403536181896925,
"completion_length": 2874.229202270508,
"epoch": 0.40114285714285713,
"grad_norm": 0.04100406542420387,
"kl": 0.057098388671875,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0149,
"reward": 0.08200124464929104,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14035362238064408,
"rewards/cosine_scaled_reward": 0.011260326951742172,
"rewards/format_reward": 0.4583333432674408,
"step": 351
},
{
"advantage_max": 0.19811256416141987,
"advantage_mean": -4.9670539625790155e-09,
"advantage_min": -0.14627101365476847,
"advantage_std": 0.1381764942780137,
"completion_length": 2160.687530517578,
"epoch": 0.4022857142857143,
"grad_norm": 0.07493390142917633,
"kl": 0.02725982666015625,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0139,
"reward": 0.09585971757769585,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13817649986594915,
"rewards/cosine_scaled_reward": 0.03348735300824046,
"rewards/format_reward": 0.5000000167638063,
"step": 352
},
{
"advantage_max": 0.22151278518140316,
"advantage_mean": -3.6476802342533254e-09,
"advantage_min": -0.1870957650244236,
"advantage_std": 0.16393206408247352,
"completion_length": 1902.333381652832,
"epoch": 0.4034285714285714,
"grad_norm": 0.04224457964301109,
"kl": 0.0423431396484375,
"learning_rate": 3.168878457820915e-07,
"loss": 0.0074,
"reward": 0.1670964928343892,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16393207339569926,
"rewards/cosine_scaled_reward": 0.09996501728892326,
"rewards/format_reward": 0.7708333488553762,
"step": 353
},
{
"advantage_max": 0.1349656144157052,
"advantage_mean": -8.226683126388856e-09,
"advantage_min": -0.1224656468257308,
"advantage_std": 0.10073514329269528,
"completion_length": 1743.6042404174805,
"epoch": 0.4045714285714286,
"grad_norm": 0.03602663427591324,
"kl": 0.036907196044921875,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0109,
"reward": 0.14471303531900048,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10073514748364687,
"rewards/cosine_scaled_reward": 0.057613649405539036,
"rewards/format_reward": 0.7291666753590107,
"step": 354
},
{
"advantage_max": 0.199096888769418,
"advantage_mean": -3.802900647165153e-09,
"advantage_min": -0.23116095550358295,
"advantage_std": 0.17236013431102037,
"completion_length": 1989.2292175292969,
"epoch": 0.4057142857142857,
"grad_norm": 0.05271374434232712,
"kl": 0.044979095458984375,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0004,
"reward": 0.1605138722807169,
"reward_advantage_correlation": 1.0,
"reward_std": 0.17236013896763325,
"rewards/cosine_scaled_reward": 0.10444843280129135,
"rewards/format_reward": 0.7291666753590107,
"step": 355
},
{
"advantage_max": 0.21099152276292443,
"advantage_mean": 1.164153294597181e-09,
"advantage_min": -0.17593623790889978,
"advantage_std": 0.16403738921508193,
"completion_length": 2499.9583740234375,
"epoch": 0.40685714285714286,
"grad_norm": 0.0717054158449173,
"kl": 0.043792724609375,
"learning_rate": 3.0887794225945143e-07,
"loss": -0.0039,
"reward": 0.12240747502073646,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16403739666566253,
"rewards/cosine_scaled_reward": 0.027360030449926853,
"rewards/format_reward": 0.6666666772216558,
"step": 356
},
{
"advantage_max": 0.2026348551735282,
"advantage_mean": -2.3283064226609085e-09,
"advantage_min": -0.15383625030517578,
"advantage_std": 0.14354443550109863,
"completion_length": 2546.270881652832,
"epoch": 0.408,
"grad_norm": 0.07251500338315964,
"kl": 0.040679931640625,
"learning_rate": 3.062313053727671e-07,
"loss": 0.0075,
"reward": 0.06861508125439286,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14354444295167923,
"rewards/cosine_scaled_reward": -0.06084787752479315,
"rewards/format_reward": 0.5208333432674408,
"step": 357
},
{
"advantage_max": 0.14043398201465607,
"advantage_mean": -9.778887422040583e-09,
"advantage_min": -0.20001309178769588,
"advantage_std": 0.13735910970717669,
"completion_length": 1679.8750610351562,
"epoch": 0.40914285714285714,
"grad_norm": 0.044725243002176285,
"kl": 0.018812179565429688,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.007,
"reward": 0.19390923529863358,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13735911808907986,
"rewards/cosine_scaled_reward": 0.18609214387834072,
"rewards/format_reward": 0.7708333358168602,
"step": 358
},
{
"advantage_max": 0.1749684326350689,
"advantage_mean": -4.967053976456803e-09,
"advantage_min": -0.16937719751149416,
"advantage_std": 0.14141156151890755,
"completion_length": 2299.6459197998047,
"epoch": 0.4102857142857143,
"grad_norm": 0.06835544854402542,
"kl": 0.0594482421875,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0138,
"reward": 0.08632952661719173,
"reward_advantage_correlation": 1.0,
"reward_std": 0.141411567106843,
"rewards/cosine_scaled_reward": -0.05791237950325012,
"rewards/format_reward": 0.6250000186264515,
"step": 359
},
{
"advantage_max": 0.1915941759943962,
"advantage_mean": 1.3969838619232178e-09,
"advantage_min": -0.1723188515752554,
"advantage_std": 0.14488590229302645,
"completion_length": 2417.229217529297,
"epoch": 0.4114285714285714,
"grad_norm": 0.05922839418053627,
"kl": 0.04900360107421875,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0139,
"reward": 0.08855638474415173,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14488590601831675,
"rewards/cosine_scaled_reward": -0.028948407620191574,
"rewards/format_reward": 0.5833333507180214,
"step": 360
},
{
"advantage_max": 0.2224835902452469,
"advantage_mean": -4.96705380992335e-09,
"advantage_min": -0.24620150960981846,
"advantage_std": 0.1885995902121067,
"completion_length": 2589.166748046875,
"epoch": 0.4125714285714286,
"grad_norm": 0.0959998071193695,
"kl": 0.05340576171875,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0184,
"reward": 0.08691775565966964,
"reward_advantage_correlation": 0.9999999999999996,
"reward_std": 0.18859959580004215,
"rewards/cosine_scaled_reward": -0.005889172665774822,
"rewards/format_reward": 0.5208333432674408,
"step": 361
},
{
"advantage_max": 0.08507681265473366,
"advantage_mean": -3.7252902568285506e-09,
"advantage_min": -0.12208179384469986,
"advantage_std": 0.09099588170647621,
"completion_length": 1447.708381652832,
"epoch": 0.4137142857142857,
"grad_norm": 0.05355558171868324,
"kl": 0.04192352294921875,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0133,
"reward": 0.17968237926834263,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09099588077515364,
"rewards/cosine_scaled_reward": 0.12263097055256367,
"rewards/format_reward": 0.8125000111758709,
"step": 362
},
{
"advantage_max": 0.1842280002310872,
"advantage_mean": -6.9073092315297124e-09,
"advantage_min": -0.15253216493874788,
"advantage_std": 0.1245655040256679,
"completion_length": 1517.458396911621,
"epoch": 0.41485714285714287,
"grad_norm": 0.04148838296532631,
"kl": 0.043212890625,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.0098,
"reward": 0.17585814488120377,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12456551007926464,
"rewards/cosine_scaled_reward": 0.11942176637239754,
"rewards/format_reward": 0.791666679084301,
"step": 363
},
{
"advantage_max": 0.13093959633260965,
"advantage_mean": 2.9103831254706503e-09,
"advantage_min": -0.1537663722410798,
"advantage_std": 0.11034792196005583,
"completion_length": 2274.2500610351562,
"epoch": 0.416,
"grad_norm": 0.07935275882482529,
"kl": 0.04637908935546875,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0065,
"reward": 0.038603525958023965,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1103479228913784,
"rewards/cosine_scaled_reward": -0.17720217071473598,
"rewards/format_reward": 0.5833333432674408,
"step": 364
},
{
"advantage_max": 0.1946524642407894,
"advantage_mean": -3.802900563898426e-09,
"advantage_min": -0.17981757363304496,
"advantage_std": 0.1640670644119382,
"completion_length": 2797.6667098999023,
"epoch": 0.41714285714285715,
"grad_norm": 0.10382858663797379,
"kl": 0.06769561767578125,
"learning_rate": 2.854966364683872e-07,
"loss": 0.016,
"reward": 0.07118311786325648,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16406708303838968,
"rewards/cosine_scaled_reward": -0.0007578474469482899,
"rewards/format_reward": 0.41666667349636555,
"step": 365
},
{
"advantage_max": 0.13503147196024656,
"advantage_mean": -5.432715111108877e-10,
"advantage_min": -0.13952044351026416,
"advantage_std": 0.1080038407817483,
"completion_length": 1690.4375267028809,
"epoch": 0.41828571428571426,
"grad_norm": 0.03620801120996475,
"kl": 0.02330780029296875,
"learning_rate": 2.829615010283344e-07,
"loss": -0.001,
"reward": 0.1814038148149848,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1080038477666676,
"rewards/cosine_scaled_reward": 0.09267531940713525,
"rewards/format_reward": 0.875,
"step": 366
},
{
"advantage_max": 0.26444832887500525,
"advantage_mean": -3.4148495420271985e-09,
"advantage_min": -0.1704773958772421,
"advantage_std": 0.16923782834783196,
"completion_length": 2643.2292098999023,
"epoch": 0.41942857142857143,
"grad_norm": 0.07513663917779922,
"kl": 0.046604156494140625,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0162,
"reward": 0.06340029306011274,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16923783347010612,
"rewards/cosine_scaled_reward": -0.08535546949133277,
"rewards/format_reward": 0.5416666734963655,
"step": 367
},
{
"advantage_max": 0.15277679590508342,
"advantage_mean": 3.725290367850853e-09,
"advantage_min": -0.1591153903864324,
"advantage_std": 0.12118267058394849,
"completion_length": 2796.979217529297,
"epoch": 0.4205714285714286,
"grad_norm": 0.06917224824428558,
"kl": 0.0589141845703125,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0198,
"reward": 0.015495523664867505,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12118267104960978,
"rewards/cosine_scaled_reward": -0.14332137536257505,
"rewards/format_reward": 0.37500000931322575,
"step": 368
},
{
"advantage_max": 0.15400357451289892,
"advantage_mean": -2.7939677238464355e-09,
"advantage_min": -0.2148400265723467,
"advantage_std": 0.1464338074438274,
"completion_length": 2664.6250610351562,
"epoch": 0.4217142857142857,
"grad_norm": 0.12221620231866837,
"kl": 0.0754241943359375,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0104,
"reward": 0.10048398980870843,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1464338069781661,
"rewards/cosine_scaled_reward": 0.05717929266393185,
"rewards/format_reward": 0.4791666828095913,
"step": 369
},
{
"advantage_max": 0.14356589503586292,
"advantage_mean": -4.190951551075184e-09,
"advantage_min": -0.14298985060304403,
"advantage_std": 0.11907922197133303,
"completion_length": 2785.416702270508,
"epoch": 0.4228571428571429,
"grad_norm": 0.05613729730248451,
"kl": 0.06988525390625,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0162,
"reward": 0.07710259314626455,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.11907922383397818,
"rewards/cosine_scaled_reward": 0.00833977572619915,
"rewards/format_reward": 0.43750000931322575,
"step": 370
},
{
"advantage_max": 0.15934794954955578,
"advantage_mean": -5.044664012909816e-09,
"advantage_min": -0.14198165433481336,
"advantage_std": 0.11952129052951932,
"completion_length": 1526.4583587646484,
"epoch": 0.424,
"grad_norm": 0.05127997323870659,
"kl": 0.04236602783203125,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0102,
"reward": 0.16448132740333676,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11952129052951932,
"rewards/cosine_scaled_reward": 0.11803595721721649,
"rewards/format_reward": 0.7291666753590107,
"step": 371
},
{
"advantage_max": 0.22729028388857841,
"advantage_mean": 1.2417634420724966e-09,
"advantage_min": -0.15133884362876415,
"advantage_std": 0.14886979572474957,
"completion_length": 2749.7917404174805,
"epoch": 0.42514285714285716,
"grad_norm": 0.10154866427183151,
"kl": 0.0570220947265625,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.0161,
"reward": 0.0784059870056808,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14886980084702373,
"rewards/cosine_scaled_reward": 0.010133292176760733,
"rewards/format_reward": 0.4375000037252903,
"step": 372
},
{
"advantage_max": 0.15681373560801148,
"advantage_mean": -1.0399768823887712e-08,
"advantage_min": -0.1237130188383162,
"advantage_std": 0.11755135306157172,
"completion_length": 1514.3958625793457,
"epoch": 0.42628571428571427,
"grad_norm": 0.032398343086242676,
"kl": 0.034709930419921875,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0063,
"reward": 0.08641793858259916,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11755135981366038,
"rewards/cosine_scaled_reward": -0.11321200663223863,
"rewards/format_reward": 0.7291666772216558,
"step": 373
},
{
"advantage_max": 0.1916589979082346,
"advantage_mean": -4.967053740534411e-09,
"advantage_min": -0.24126344360411167,
"advantage_std": 0.17274773959070444,
"completion_length": 2046.4375762939453,
"epoch": 0.42742857142857144,
"grad_norm": 0.07882247865200043,
"kl": 0.0540771484375,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0185,
"reward": 0.12403235118836164,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1727477479726076,
"rewards/cosine_scaled_reward": 0.09565928019583225,
"rewards/format_reward": 0.5416666828095913,
"step": 374
},
{
"advantage_max": 0.13977034855633974,
"advantage_mean": -8.459513499425864e-09,
"advantage_min": -0.21618898399174213,
"advantage_std": 0.14144186303019524,
"completion_length": 2736.4375610351562,
"epoch": 0.42857142857142855,
"grad_norm": 0.13247309625148773,
"kl": 0.079193115234375,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0266,
"reward": 0.07485455530695617,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14144186628982425,
"rewards/cosine_scaled_reward": 0.022881068289279938,
"rewards/format_reward": 0.3958333507180214,
"step": 375
},
{
"advantage_max": 0.2252150783315301,
"advantage_mean": 1.249000902703301e-16,
"advantage_min": -0.15643711481243372,
"advantage_std": 0.15819351840764284,
"completion_length": 2360.354248046875,
"epoch": 0.4297142857142857,
"grad_norm": 0.08375173062086105,
"kl": 0.06674385070800781,
"learning_rate": 2.583460445215911e-07,
"loss": 0.0122,
"reward": 0.061880006454885006,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15819351840764284,
"rewards/cosine_scaled_reward": -0.11892816657200456,
"rewards/format_reward": 0.6041666734963655,
"step": 376
},
{
"advantage_max": 0.2330569690093398,
"advantage_mean": 2.6387473733846534e-09,
"advantage_min": -0.16961478628218174,
"advantage_std": 0.15390967670828104,
"completion_length": 2938.9584350585938,
"epoch": 0.4308571428571429,
"grad_norm": 0.08977462351322174,
"kl": 0.1001129150390625,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0238,
"reward": 0.04838086655945517,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1539096813648939,
"rewards/cosine_scaled_reward": -0.03433333709836006,
"rewards/format_reward": 0.3541666753590107,
"step": 377
},
{
"advantage_max": 0.23671884834766388,
"advantage_mean": -1.396983917434369e-09,
"advantage_min": -0.1725560138002038,
"advantage_std": 0.15941207576543093,
"completion_length": 2238.645881652832,
"epoch": 0.432,
"grad_norm": 0.054851531982421875,
"kl": 0.06591415405273438,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.0141,
"reward": 0.09857236547395587,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.15941207949072123,
"rewards/cosine_scaled_reward": 0.007719084620475769,
"rewards/format_reward": 0.5625000055879354,
"step": 378
},
{
"advantage_max": 0.2065592324361205,
"advantage_mean": 6.208817210362483e-10,
"advantage_min": -0.1892344020307064,
"advantage_std": 0.14801982045173645,
"completion_length": 2359.416717529297,
"epoch": 0.43314285714285716,
"grad_norm": 0.0550236701965332,
"kl": 0.06499481201171875,
"learning_rate": 2.512332043064913e-07,
"loss": 0.0143,
"reward": 0.08630741806700826,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14801982697099447,
"rewards/cosine_scaled_reward": -0.05953240022063255,
"rewards/format_reward": 0.625000013038516,
"step": 379
},
{
"advantage_max": 0.11885185819119215,
"advantage_mean": -3.2596290944209017e-09,
"advantage_min": -0.15346426516771317,
"advantage_std": 0.11005022726021707,
"completion_length": 2270.250045776367,
"epoch": 0.4342857142857143,
"grad_norm": 0.06920424103736877,
"kl": 0.06329345703125,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0113,
"reward": 0.09566288208588958,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11005022516474128,
"rewards/cosine_scaled_reward": 0.010071046184748411,
"rewards/format_reward": 0.5416666772216558,
"step": 380
},
{
"advantage_max": 0.16205658204853535,
"advantage_mean": 2.0372682898311956e-09,
"advantage_min": -0.10958120739087462,
"advantage_std": 0.10907484404742718,
"completion_length": 2616.7500610351562,
"epoch": 0.43542857142857144,
"grad_norm": 0.051583774387836456,
"kl": 0.081939697265625,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0113,
"reward": -0.01113765970512759,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10907484823837876,
"rewards/cosine_scaled_reward": -0.2404602374881506,
"rewards/format_reward": 0.4166666753590107,
"step": 381
},
{
"advantage_max": 0.18154283426702023,
"advantage_mean": -2.716357583310014e-09,
"advantage_min": -0.14717091992497444,
"advantage_std": 0.12136137438938022,
"completion_length": 1677.3125610351562,
"epoch": 0.43657142857142855,
"grad_norm": 0.04167007654905319,
"kl": 0.0534820556640625,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.0093,
"reward": 0.0900630738469772,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12136137904599309,
"rewards/cosine_scaled_reward": -0.14255204549408518,
"rewards/format_reward": 0.8125000204890966,
"step": 382
},
{
"advantage_max": 0.16546936659142375,
"advantage_mean": -2.056670778127767e-09,
"advantage_min": -0.1402215976268053,
"advantage_std": 0.12318957853130996,
"completion_length": 2511.020881652832,
"epoch": 0.4377142857142857,
"grad_norm": 0.10153740644454956,
"kl": 0.08424758911132812,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.012,
"reward": 0.07365681836381555,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1231895792298019,
"rewards/cosine_scaled_reward": -0.03396464860998094,
"rewards/format_reward": 0.500000013038516,
"step": 383
},
{
"advantage_max": 0.26926819048821926,
"advantage_mean": -6.519258133330652e-09,
"advantage_min": -0.18919609300792217,
"advantage_std": 0.18077142210677266,
"completion_length": 1919.895896911621,
"epoch": 0.43885714285714283,
"grad_norm": 0.0882238820195198,
"kl": 0.073150634765625,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0167,
"reward": 0.20658226870000362,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1807714276947081,
"rewards/cosine_scaled_reward": 0.285074929241091,
"rewards/format_reward": 0.6458333414047956,
"step": 384
},
{
"advantage_max": 0.17060526320710778,
"advantage_mean": -1.202958357926498e-09,
"advantage_min": -0.12168563972227275,
"advantage_std": 0.12083742453251034,
"completion_length": 2190.6667251586914,
"epoch": 0.44,
"grad_norm": 0.03899059444665909,
"kl": 0.06510162353515625,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0125,
"reward": 0.06560673611238599,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12083742849063128,
"rewards/cosine_scaled_reward": -0.1208215095102787,
"rewards/format_reward": 0.6250000037252903,
"step": 385
},
{
"advantage_max": 0.19935980439186096,
"advantage_mean": -4.5013925920045494e-09,
"advantage_min": -0.2209212351590395,
"advantage_std": 0.16272395756095648,
"completion_length": 2633.6250534057617,
"epoch": 0.44114285714285717,
"grad_norm": 0.115007184445858,
"kl": 0.0880279541015625,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0139,
"reward": 0.09591761871706694,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1627239640802145,
"rewards/cosine_scaled_reward": -0.011030611349269748,
"rewards/format_reward": 0.5833333488553762,
"step": 386
},
{
"advantage_max": 0.15378239285200834,
"advantage_mean": -2.2506962543689113e-09,
"advantage_min": -0.14221325889229774,
"advantage_std": 0.123248225543648,
"completion_length": 2798.7708740234375,
"epoch": 0.4422857142857143,
"grad_norm": 0.05777512118220329,
"kl": 0.1002044677734375,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.02,
"reward": 0.05685883387923241,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12324822414666414,
"rewards/cosine_scaled_reward": -0.03132744878530502,
"rewards/format_reward": 0.3958333358168602,
"step": 387
},
{
"advantage_max": 0.17052914388477802,
"advantage_mean": -2.173086155465853e-09,
"advantage_min": -0.13332293508574367,
"advantage_std": 0.12420817371457815,
"completion_length": 2340.6250610351562,
"epoch": 0.44342857142857145,
"grad_norm": 0.04883037880063057,
"kl": 0.093536376953125,
"learning_rate": 2.306931685585657e-07,
"loss": 0.0165,
"reward": 0.11339898826554418,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1242081755772233,
"rewards/cosine_scaled_reward": 0.06429434940218925,
"rewards/format_reward": 0.5416666679084301,
"step": 388
},
{
"advantage_max": 0.120490204077214,
"advantage_mean": -2.7939677169075416e-09,
"advantage_min": -0.16002231976017356,
"advantage_std": 0.10619163559749722,
"completion_length": 1919.5833740234375,
"epoch": 0.44457142857142856,
"grad_norm": 0.06259764730930328,
"kl": 0.06043243408203125,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0032,
"reward": 0.08577554707881063,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.10619163932278752,
"rewards/cosine_scaled_reward": -0.10086721181869507,
"rewards/format_reward": 0.7083333507180214,
"step": 389
},
{
"advantage_max": 0.18258788716048002,
"advantage_mean": -2.3283064365386963e-09,
"advantage_min": -0.13553531654179096,
"advantage_std": 0.1262248894199729,
"completion_length": 2658.9375534057617,
"epoch": 0.44571428571428573,
"grad_norm": 0.12601332366466522,
"kl": 0.0760498046875,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0039,
"reward": 0.05920348968356848,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.12622489035129547,
"rewards/cosine_scaled_reward": -0.03534679673612118,
"rewards/format_reward": 0.4166666679084301,
"step": 390
},
{
"advantage_max": 0.17521672742441297,
"advantage_mean": -5.122274354674161e-09,
"advantage_min": -0.19580721389502287,
"advantage_std": 0.15704865427687764,
"completion_length": 2380.625030517578,
"epoch": 0.44685714285714284,
"grad_norm": 0.12029723823070526,
"kl": 0.07598876953125,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.0076,
"reward": 0.13599346484988928,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15704865800216794,
"rewards/cosine_scaled_reward": 0.12639703415334225,
"rewards/format_reward": 0.5416666697710752,
"step": 391
},
{
"advantage_max": 0.12857118248939514,
"advantage_mean": -6.984920281061235e-10,
"advantage_min": -0.1260515470057726,
"advantage_std": 0.10433831717818975,
"completion_length": 1794.0000457763672,
"epoch": 0.448,
"grad_norm": 0.03922954574227333,
"kl": 0.0439300537109375,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0043,
"reward": 0.09477179404348135,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10433832183480263,
"rewards/cosine_scaled_reward": -0.12779118958860636,
"rewards/format_reward": 0.8125000074505806,
"step": 392
},
{
"advantage_max": 0.23233003355562687,
"advantage_mean": 2.0178656245928295e-09,
"advantage_min": -0.18882656935602427,
"advantage_std": 0.16627767169848084,
"completion_length": 2182.2708587646484,
"epoch": 0.4491428571428571,
"grad_norm": 0.07548714429140091,
"kl": 0.0761871337890625,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.0054,
"reward": 0.08290141774341464,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16627767169848084,
"rewards/cosine_scaled_reward": -0.05863434381899424,
"rewards/format_reward": 0.604166679084301,
"step": 393
},
{
"advantage_max": 0.1498968666419387,
"advantage_mean": 2.250696282124487e-09,
"advantage_min": -0.10414962749928236,
"advantage_std": 0.09691937500610948,
"completion_length": 2409.666763305664,
"epoch": 0.4502857142857143,
"grad_norm": 0.057605672627687454,
"kl": 0.0919647216796875,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0121,
"reward": -0.004766212543472648,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09691937174648046,
"rewards/cosine_scaled_reward": -0.2431314792484045,
"rewards/format_reward": 0.45833333767950535,
"step": 394
},
{
"advantage_max": 0.11893777782097459,
"advantage_mean": 1.4745940718485784e-09,
"advantage_min": -0.06753726582974195,
"advantage_std": 0.07489508436992764,
"completion_length": 2225.0208435058594,
"epoch": 0.4514285714285714,
"grad_norm": 0.07177528738975525,
"kl": 0.082550048828125,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.007,
"reward": 0.05327743641100824,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.07489508809521794,
"rewards/cosine_scaled_reward": -0.10739827249199152,
"rewards/format_reward": 0.520833333954215,
"step": 395
},
{
"advantage_max": 0.17730457615107298,
"advantage_mean": 0.0,
"advantage_min": -0.20803257264196873,
"advantage_std": 0.1546328696422279,
"completion_length": 2330.9166946411133,
"epoch": 0.45257142857142857,
"grad_norm": 0.06790260970592499,
"kl": 0.09099960327148438,
"learning_rate": 2.134908592756607e-07,
"loss": 0.018,
"reward": 0.07746678782859817,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15463287895545363,
"rewards/cosine_scaled_reward": -0.023853310383856297,
"rewards/format_reward": 0.5000000111758709,
"step": 396
},
{
"advantage_max": 0.2035736571997404,
"advantage_mean": -1.5522043442239841e-09,
"advantage_min": -0.13746324321255088,
"advantage_std": 0.13648039917461574,
"completion_length": 2338.104263305664,
"epoch": 0.45371428571428574,
"grad_norm": 0.06633574515581131,
"kl": 0.0894622802734375,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.0108,
"reward": 0.06513352657202631,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13648040336556733,
"rewards/cosine_scaled_reward": -0.0791270025074482,
"rewards/format_reward": 0.5416666716337204,
"step": 397
},
{
"advantage_max": 0.19265004387125373,
"advantage_mean": -2.3283065475609988e-09,
"advantage_min": -0.15929417125880718,
"advantage_std": 0.13376712054014206,
"completion_length": 2583.8750534057617,
"epoch": 0.45485714285714285,
"grad_norm": 0.06692863255739212,
"kl": 0.09836196899414062,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0099,
"reward": 0.07285916851833463,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1337671286892146,
"rewards/cosine_scaled_reward": -0.06691717123612761,
"rewards/format_reward": 0.562500013038516,
"step": 398
},
{
"advantage_max": 0.1806424199603498,
"advantage_mean": 3.414849639171713e-09,
"advantage_min": -0.16816816572099924,
"advantage_std": 0.154349563177675,
"completion_length": 2301.1458892822266,
"epoch": 0.456,
"grad_norm": 0.1409112811088562,
"kl": 0.06836700439453125,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0188,
"reward": 0.12668270588619635,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.15434956131502986,
"rewards/cosine_scaled_reward": 0.029449453577399254,
"rewards/format_reward": 0.6875000149011612,
"step": 399
},
{
"advantage_max": 0.18242146540433168,
"advantage_mean": 3.570069934122344e-09,
"advantage_min": -0.15212072804570198,
"advantage_std": 0.1341659629251808,
"completion_length": 1446.437515258789,
"epoch": 0.45714285714285713,
"grad_norm": 0.04043472185730934,
"kl": 0.0493316650390625,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0073,
"reward": 0.16493975650519133,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13416596641764045,
"rewards/cosine_scaled_reward": 0.08686932059936225,
"rewards/format_reward": 0.7916666716337204,
"step": 400
},
{
"advantage_max": 0.11342965112999082,
"advantage_mean": -7.140140007022566e-09,
"advantage_min": -0.1604298735037446,
"advantage_std": 0.10300690727308393,
"completion_length": 2901.4791717529297,
"epoch": 0.4582857142857143,
"grad_norm": 0.08971995860338211,
"kl": 0.09316825866699219,
"learning_rate": 2.032690407508949e-07,
"loss": 0.0122,
"reward": 0.05048931506462395,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1030069119296968,
"rewards/cosine_scaled_reward": -0.05228261277079582,
"rewards/format_reward": 0.39583333767950535,
"step": 401
},
{
"advantage_max": 0.12064083246514201,
"advantage_mean": -1.319373665875645e-09,
"advantage_min": -0.10163196362555027,
"advantage_std": 0.09164711134508252,
"completion_length": 2343.270866394043,
"epoch": 0.4594285714285714,
"grad_norm": 0.08057854324579239,
"kl": 0.0913543701171875,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.009,
"reward": 0.06032170820981264,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09164711693301797,
"rewards/cosine_scaled_reward": -0.08369093760848045,
"rewards/format_reward": 0.5208333376795053,
"step": 402
},
{
"advantage_max": 0.08787010563537478,
"advantage_mean": 1.3969838896787934e-09,
"advantage_min": -0.1497042439877987,
"advantage_std": 0.09532873774878681,
"completion_length": 1638.4791793823242,
"epoch": 0.4605714285714286,
"grad_norm": 0.03606126457452774,
"kl": 0.042949676513671875,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.0072,
"reward": 0.11972164455801249,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09532873728312552,
"rewards/cosine_scaled_reward": -0.0013210475444793701,
"rewards/format_reward": 0.7083333432674408,
"step": 403
},
{
"advantage_max": 0.12971502542495728,
"advantage_mean": -4.8118334594615675e-09,
"advantage_min": -0.1479872101917863,
"advantage_std": 0.10552329616621137,
"completion_length": 1950.8542175292969,
"epoch": 0.4617142857142857,
"grad_norm": 0.13382993638515472,
"kl": 0.07288742065429688,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.0012,
"reward": 0.1100343014113605,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10552330128848553,
"rewards/cosine_scaled_reward": -0.013227662071585655,
"rewards/format_reward": 0.6666666697710752,
"step": 404
},
{
"advantage_max": 0.1491077565588057,
"advantage_mean": -6.364038018791263e-09,
"advantage_min": -0.13719645235687494,
"advantage_std": 0.12250480009242892,
"completion_length": 1873.125015258789,
"epoch": 0.46285714285714286,
"grad_norm": 0.16895225644111633,
"kl": 0.057651519775390625,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0009,
"reward": 0.17472200049087405,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12250480428338051,
"rewards/cosine_scaled_reward": 0.1878266427665949,
"rewards/format_reward": 0.6458333395421505,
"step": 405
},
{
"advantage_max": 0.24610369745641947,
"advantage_mean": -1.474594175931987e-09,
"advantage_min": -0.16500617330893874,
"advantage_std": 0.17077338322997093,
"completion_length": 2001.6875457763672,
"epoch": 0.464,
"grad_norm": 0.037415359169244766,
"kl": 0.05828857421875,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0095,
"reward": 0.13929871143773198,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1707733916118741,
"rewards/cosine_scaled_reward": 0.01136303087696433,
"rewards/format_reward": 0.7916666753590107,
"step": 406
},
{
"advantage_max": 0.10421117953956127,
"advantage_mean": -5.432716707054475e-10,
"advantage_min": -0.10994730168022215,
"advantage_std": 0.08500955649651587,
"completion_length": 2364.541702270508,
"epoch": 0.46514285714285714,
"grad_norm": 0.06763742864131927,
"kl": 0.05281829833984375,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0026,
"reward": 0.1408998296046775,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0850095555651933,
"rewards/cosine_scaled_reward": 0.10000266879796982,
"rewards/format_reward": 0.6250000055879354,
"step": 407
},
{
"advantage_max": 0.13665086403489113,
"advantage_mean": -6.364037768991082e-09,
"advantage_min": -0.12718658475205302,
"advantage_std": 0.10699888691306114,
"completion_length": 1904.4583587646484,
"epoch": 0.4662857142857143,
"grad_norm": 0.04600901156663895,
"kl": 0.04422760009765625,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0046,
"reward": 0.15510824089869857,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10699889156967402,
"rewards/cosine_scaled_reward": 0.12226809374988079,
"rewards/format_reward": 0.666666679084301,
"step": 408
},
{
"advantage_max": 0.21366258803755045,
"advantage_mean": -1.5522042678961512e-09,
"advantage_min": -0.1656627543270588,
"advantage_std": 0.1524591613560915,
"completion_length": 3079.3750915527344,
"epoch": 0.4674285714285714,
"grad_norm": 0.1300925612449646,
"kl": 0.0786285400390625,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.0225,
"reward": 0.049404874444007874,
"reward_advantage_correlation": 1.0,
"reward_std": 0.15245915856212378,
"rewards/cosine_scaled_reward": -0.031727675814181566,
"rewards/format_reward": 0.354166679084301,
"step": 409
},
{
"advantage_max": 0.17127191461622715,
"advantage_mean": -4.811833501094931e-09,
"advantage_min": -0.1715862648561597,
"advantage_std": 0.1427832981571555,
"completion_length": 2284.395835876465,
"epoch": 0.4685714285714286,
"grad_norm": 0.05336384102702141,
"kl": 0.051250457763671875,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0085,
"reward": 0.11855864059180021,
"reward_advantage_correlation": 1.0,
"reward_std": 0.14278329606167972,
"rewards/cosine_scaled_reward": 0.03388424590229988,
"rewards/format_reward": 0.6250000093132257,
"step": 410
},
{
"advantage_max": 0.15930527402088046,
"advantage_mean": 3.1820188706177532e-09,
"advantage_min": -0.14635033579543233,
"advantage_std": 0.12614521011710167,
"completion_length": 2849.9166870117188,
"epoch": 0.4697142857142857,
"grad_norm": 0.060255929827690125,
"kl": 0.06512451171875,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.007,
"reward": 0.06567088048905134,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12614521756768227,
"rewards/cosine_scaled_reward": -0.026130111888051033,
"rewards/format_reward": 0.43750000558793545,
"step": 411
},
{
"advantage_max": 0.134061299264431,
"advantage_mean": -4.5013923699599445e-09,
"advantage_min": -0.1902531199157238,
"advantage_std": 0.13698356272652745,
"completion_length": 2615.437545776367,
"epoch": 0.47085714285714286,
"grad_norm": 0.13102658092975616,
"kl": 0.066558837890625,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0256,
"reward": 0.08477710420265794,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13698356179520488,
"rewards/cosine_scaled_reward": 0.008670665323734283,
"rewards/format_reward": 0.47916667722165585,
"step": 412
},
{
"advantage_max": 0.11806221166625619,
"advantage_mean": -4.462587521736339e-09,
"advantage_min": -0.1252172514796257,
"advantage_std": 0.10003542294725776,
"completion_length": 1962.1875534057617,
"epoch": 0.472,
"grad_norm": 0.04213166981935501,
"kl": 0.04528236389160156,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0115,
"reward": 0.10885413386859,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10003542667254806,
"rewards/cosine_scaled_reward": -0.013450214173644781,
"rewards/format_reward": 0.6666666734963655,
"step": 413
},
{
"advantage_max": 0.126890292391181,
"advantage_mean": 9.701277836615674e-10,
"advantage_min": -0.10505080316215754,
"advantage_std": 0.0860840454697609,
"completion_length": 2914.354217529297,
"epoch": 0.47314285714285714,
"grad_norm": 0.06571124494075775,
"kl": 0.061893463134765625,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0088,
"reward": -0.004312141099944711,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08608405268751085,
"rewards/cosine_scaled_reward": -0.19047635607421398,
"rewards/format_reward": 0.35416667349636555,
"step": 414
},
{
"advantage_max": 0.23087134351953864,
"advantage_mean": -1.0089327637241574e-09,
"advantage_min": -0.14913060469552875,
"advantage_std": 0.14599163830280304,
"completion_length": 2982.791732788086,
"epoch": 0.4742857142857143,
"grad_norm": 0.12604767084121704,
"kl": 0.083526611328125,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0284,
"reward": 0.006746219587512314,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14599164854735136,
"rewards/cosine_scaled_reward": -0.10747438576072454,
"rewards/format_reward": 0.2500000074505806,
"step": 415
},
{
"advantage_max": 0.17671362031251192,
"advantage_mean": -5.122274285285222e-09,
"advantage_min": -0.16972810495644808,
"advantage_std": 0.13787524495273829,
"completion_length": 2139.8541831970215,
"epoch": 0.4754285714285714,
"grad_norm": 0.1460166573524475,
"kl": 0.05098724365234375,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0163,
"reward": 0.08973793289624155,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13787524681538343,
"rewards/cosine_scaled_reward": -0.07056776992976665,
"rewards/format_reward": 0.6666666753590107,
"step": 416
},
{
"advantage_max": 0.18380063539370894,
"advantage_mean": -2.638747387262441e-09,
"advantage_min": -0.12609656807035208,
"advantage_std": 0.11879561748355627,
"completion_length": 3139.375030517578,
"epoch": 0.4765714285714286,
"grad_norm": 0.11847390979528427,
"kl": 0.088165283203125,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0137,
"reward": -0.02197717159288004,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11879561934620142,
"rewards/cosine_scaled_reward": -0.2001864407211542,
"rewards/format_reward": 0.27083334140479565,
"step": 417
},
{
"advantage_max": 0.15402009896934032,
"advantage_mean": 1.5522044760629683e-10,
"advantage_min": -0.16916337795555592,
"advantage_std": 0.12051357375457883,
"completion_length": 2231.395854949951,
"epoch": 0.4777142857142857,
"grad_norm": 0.04836947098374367,
"kl": 0.0673980712890625,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.014,
"reward": 0.09378412109799683,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12051357794553041,
"rewards/cosine_scaled_reward": -0.01670103892683983,
"rewards/format_reward": 0.5833333414047956,
"step": 418
},
{
"advantage_max": 0.1730736023746431,
"advantage_mean": -2.5223320619660594e-09,
"advantage_min": -0.16914140712469816,
"advantage_std": 0.13918489590287209,
"completion_length": 2572.7708587646484,
"epoch": 0.47885714285714287,
"grad_norm": 0.07898696511983871,
"kl": 0.07297134399414062,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0067,
"reward": 0.08029933553189039,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1391849028877914,
"rewards/cosine_scaled_reward": -0.023447027429938316,
"rewards/format_reward": 0.5208333432674408,
"step": 419
},
{
"advantage_max": 0.1278257886879146,
"advantage_mean": 1.3969840145788837e-09,
"advantage_min": -0.16028660000301898,
"advantage_std": 0.11588235246017575,
"completion_length": 1729.8750228881836,
"epoch": 0.48,
"grad_norm": 0.045995082706213,
"kl": 0.061191558837890625,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0122,
"reward": 0.09284290811046958,
"reward_advantage_correlation": 0.9999999999999997,
"reward_std": 0.11588235734961927,
"rewards/cosine_scaled_reward": -0.09275021031498909,
"rewards/format_reward": 0.7291666753590107,
"step": 420
},
{
"advantage_max": 0.17663031490519643,
"advantage_mean": 2.48352696741172e-09,
"advantage_min": -0.12583108618855476,
"advantage_std": 0.11714980588294566,
"completion_length": 2878.0625534057617,
"epoch": 0.48114285714285715,
"grad_norm": 0.04500269517302513,
"kl": 0.08325958251953125,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0154,
"reward": 0.0023640617728233337,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1171498135663569,
"rewards/cosine_scaled_reward": -0.15943891881033778,
"rewards/format_reward": 0.3333333358168602,
"step": 421
},
{
"advantage_max": 0.15642781276255846,
"advantage_mean": -2.3283066447055134e-10,
"advantage_min": -0.16361185582354665,
"advantage_std": 0.12902699504047632,
"completion_length": 2730.6458892822266,
"epoch": 0.48228571428571426,
"grad_norm": 0.14173462986946106,
"kl": 0.0648040771484375,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0212,
"reward": 0.03911724709905684,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1290269959717989,
"rewards/cosine_scaled_reward": -0.10371733736246824,
"rewards/format_reward": 0.43750000931322575,
"step": 422
},
{
"advantage_max": 0.1365548074245453,
"advantage_mean": 1.1641532390860299e-09,
"advantage_min": -0.16458274144679308,
"advantage_std": 0.12328715343028307,
"completion_length": 2499.8958740234375,
"epoch": 0.48342857142857143,
"grad_norm": 0.06037713214755058,
"kl": 0.0506591796875,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0137,
"reward": 0.06676678382791579,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12328716181218624,
"rewards/cosine_scaled_reward": -0.05246353894472122,
"rewards/format_reward": 0.5000000074505806,
"step": 423
},
{
"advantage_max": 0.22071715723723173,
"advantage_mean": 2.3283065059276353e-10,
"advantage_min": -0.12588559091091156,
"advantage_std": 0.1304079587571323,
"completion_length": 2821.6250762939453,
"epoch": 0.4845714285714286,
"grad_norm": 0.08061812072992325,
"kl": 0.068572998046875,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0176,
"reward": 0.003499031998217106,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13040796015411615,
"rewards/cosine_scaled_reward": -0.16878705425187945,
"rewards/format_reward": 0.3541666716337204,
"step": 424
},
{
"advantage_max": 0.15021021151915193,
"advantage_mean": 2.7939678071131624e-09,
"advantage_min": -0.20629673171788454,
"advantage_std": 0.14314838591963053,
"completion_length": 1907.8125457763672,
"epoch": 0.4857142857142857,
"grad_norm": 0.09490145742893219,
"kl": 0.0328369140625,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0125,
"reward": 0.26927735190838575,
"reward_advantage_correlation": 1.0,
"reward_std": 0.143148398026824,
"rewards/cosine_scaled_reward": 0.3964337124489248,
"rewards/format_reward": 0.7916666772216558,
"step": 425
},
{
"advantage_max": 0.17377169243991375,
"advantage_mean": -1.3969839035565812e-09,
"advantage_min": -0.13985866121947765,
"advantage_std": 0.12318799132481217,
"completion_length": 2227.520881652832,
"epoch": 0.4868571428571429,
"grad_norm": 0.06765562295913696,
"kl": 0.0846099853515625,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.0153,
"reward": 0.06637881277129054,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1231879978440702,
"rewards/cosine_scaled_reward": -0.11847709584981203,
"rewards/format_reward": 0.625000013038516,
"step": 426
},
{
"advantage_max": 0.2253029616549611,
"advantage_mean": -3.104408619059029e-09,
"advantage_min": -0.17749943817034364,
"advantage_std": 0.15275797015056014,
"completion_length": 2793.416748046875,
"epoch": 0.488,
"grad_norm": 0.12349321693181992,
"kl": 0.087188720703125,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0226,
"reward": 0.11759098537731916,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1527579859830439,
"rewards/cosine_scaled_reward": 0.10679585766047239,
"rewards/format_reward": 0.4791666753590107,
"step": 427
},
{
"advantage_max": 0.15732468385249376,
"advantage_mean": -4.268561941411786e-09,
"advantage_min": -0.1455386085435748,
"advantage_std": 0.12252023816108704,
"completion_length": 2145.104232788086,
"epoch": 0.48914285714285716,
"grad_norm": 0.0802699625492096,
"kl": 0.0648345947265625,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0137,
"reward": 0.09161906410008669,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1225202432833612,
"rewards/cosine_scaled_reward": -0.07740333583205938,
"rewards/format_reward": 0.687500013038516,
"step": 428
},
{
"advantage_max": 0.19291006959974766,
"advantage_mean": -1.3969839451899446e-09,
"advantage_min": -0.14975288417190313,
"advantage_std": 0.13649496110156178,
"completion_length": 2090.354202270508,
"epoch": 0.49028571428571427,
"grad_norm": 0.04922656714916229,
"kl": 0.06599807739257812,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0087,
"reward": 0.09877313417382538,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.13649496855214238,
"rewards/cosine_scaled_reward": -0.10497084120288491,
"rewards/format_reward": 0.7916666734963655,
"step": 429
},
{
"advantage_max": 0.13820017455145717,
"advantage_mean": -4.152146633462639e-09,
"advantage_min": -0.10727433580905199,
"advantage_std": 0.0930909130256623,
"completion_length": 2264.000045776367,
"epoch": 0.49142857142857144,
"grad_norm": 0.15907180309295654,
"kl": 0.07495880126953125,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0011,
"reward": 0.12007943191565573,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09309091651812196,
"rewards/cosine_scaled_reward": 0.03994573000818491,
"rewards/format_reward": 0.6250000037252903,
"step": 430
},
{
"advantage_max": 0.10399939585477114,
"advantage_mean": -1.1641531211248335e-09,
"advantage_min": -0.13584416639059782,
"advantage_std": 0.09472417016513646,
"completion_length": 2309.6250534057617,
"epoch": 0.49257142857142855,
"grad_norm": 0.06702486425638199,
"kl": 0.07297134399414062,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.012,
"reward": 0.04253681842237711,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09472417179495096,
"rewards/cosine_scaled_reward": -0.1354589331895113,
"rewards/format_reward": 0.5208333395421505,
"step": 431
},
{
"advantage_max": 0.1814054111018777,
"advantage_mean": -3.492460209919557e-10,
"advantage_min": -0.16293080430477858,
"advantage_std": 0.13140337774530053,
"completion_length": 2683.3333740234375,
"epoch": 0.4937142857142857,
"grad_norm": 0.12716400623321533,
"kl": 0.09224700927734375,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0095,
"reward": 0.07091285544447601,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13140338193625212,
"rewards/cosine_scaled_reward": -0.011083395686000586,
"rewards/format_reward": 0.43750000558793545,
"step": 432
},
{
"advantage_max": 0.12933768006041646,
"advantage_mean": 1.3193737630201596e-09,
"advantage_min": -0.15304936189204454,
"advantage_std": 0.1121934037655592,
"completion_length": 2891.5416870117188,
"epoch": 0.4948571428571429,
"grad_norm": 0.05526670441031456,
"kl": 0.08255767822265625,
"learning_rate": 1.483363816965435e-07,
"loss": 0.013,
"reward": 0.0877141747623682,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11219340935349464,
"rewards/cosine_scaled_reward": 0.06979287602007389,
"rewards/format_reward": 0.37500000931322575,
"step": 433
},
{
"advantage_max": 0.09472581138834357,
"advantage_mean": -9.701277038642875e-10,
"advantage_min": -0.09733299724757671,
"advantage_std": 0.07522567734122276,
"completion_length": 2730.625030517578,
"epoch": 0.496,
"grad_norm": 0.061184678226709366,
"kl": 0.10284423828125,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0124,
"reward": -0.020490076043643057,
"reward_advantage_correlation": 1.0,
"reward_std": 0.07522568292915821,
"rewards/cosine_scaled_reward": -0.2582273744046688,
"rewards/format_reward": 0.3958333395421505,
"step": 434
},
{
"advantage_max": 0.14937169384211302,
"advantage_mean": 2.3283065059276353e-10,
"advantage_min": -0.11381850577890873,
"advantage_std": 0.10129550378769636,
"completion_length": 2136.229202270508,
"epoch": 0.49714285714285716,
"grad_norm": 0.08722381293773651,
"kl": 0.07655715942382812,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.012,
"reward": 0.02877517172601074,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10129550378769636,
"rewards/cosine_scaled_reward": -0.1954280026257038,
"rewards/format_reward": 0.5625000037252903,
"step": 435
},
{
"advantage_max": 0.10816556308418512,
"advantage_mean": -5.665545969868457e-09,
"advantage_min": -0.11551058106124401,
"advantage_std": 0.09109327476471663,
"completion_length": 1993.3333587646484,
"epoch": 0.4982857142857143,
"grad_norm": 0.040935587137937546,
"kl": 0.03719520568847656,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.0024,
"reward": 0.1375275724567473,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0910932756960392,
"rewards/cosine_scaled_reward": 0.12174435332417488,
"rewards/format_reward": 0.5625000018626451,
"step": 436
},
{
"advantage_max": 0.15487646870315075,
"advantage_mean": 4.163336342344337e-17,
"advantage_min": -0.1355657959356904,
"advantage_std": 0.12161733116954565,
"completion_length": 2353.0834197998047,
"epoch": 0.49942857142857144,
"grad_norm": 0.15399853885173798,
"kl": 0.08425140380859375,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0238,
"reward": 0.03445580159313977,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12161733210086823,
"rewards/cosine_scaled_reward": -0.15955427661538124,
"rewards/format_reward": 0.5208333376795053,
"step": 437
},
{
"advantage_max": 0.17049633665010333,
"advantage_mean": -1.9402554216063628e-09,
"advantage_min": -0.15271185897290707,
"advantage_std": 0.13004512721090578,
"completion_length": 2923.1875534057617,
"epoch": 0.5005714285714286,
"grad_norm": 0.1422165185213089,
"kl": 0.054004669189453125,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.0172,
"reward": 0.04482788871973753,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13004513212945312,
"rewards/cosine_scaled_reward": -0.08538412535563111,
"rewards/format_reward": 0.4375000111758709,
"step": 438
},
{
"advantage_max": 0.12892017513513565,
"advantage_mean": -1.9402553730341054e-09,
"advantage_min": -0.102175647392869,
"advantage_std": 0.09014736721292138,
"completion_length": 2296.0000228881836,
"epoch": 0.5017142857142857,
"grad_norm": 0.035092201083898544,
"kl": 0.061981201171875,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.0096,
"reward": 0.03524679830297828,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09014736441895366,
"rewards/cosine_scaled_reward": -0.08231546822935343,
"rewards/format_reward": 0.37500000186264515,
"step": 439
},
{
"advantage_max": 0.13992469711229205,
"advantage_mean": -2.1730860583213385e-09,
"advantage_min": -0.1054043099284172,
"advantage_std": 0.09207697911188006,
"completion_length": 2813.041717529297,
"epoch": 0.5028571428571429,
"grad_norm": 0.14594881236553192,
"kl": 0.09942626953125,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0099,
"reward": -0.01191971474327147,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.09207698097452521,
"rewards/cosine_scaled_reward": -0.21304061822593212,
"rewards/format_reward": 0.3541666716337204,
"step": 440
},
{
"advantage_max": 0.12328352779150009,
"advantage_mean": -3.4148496946828644e-09,
"advantage_min": -0.17189356870949268,
"advantage_std": 0.11699239769950509,
"completion_length": 2687.5208740234375,
"epoch": 0.504,
"grad_norm": 0.11024197936058044,
"kl": 0.07801437377929688,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0047,
"reward": 0.09399675484746695,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11699240235611796,
"rewards/cosine_scaled_reward": 0.03823857568204403,
"rewards/format_reward": 0.4791666753590107,
"step": 441
},
{
"advantage_max": 0.1927649211138487,
"advantage_mean": -7.761021547647573e-10,
"advantage_min": -0.14677509432658553,
"advantage_std": 0.1355924210511148,
"completion_length": 2098.208366394043,
"epoch": 0.5051428571428571,
"grad_norm": 0.14278782904148102,
"kl": 0.06755447387695312,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.024,
"reward": 0.04010949970688671,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1355924243107438,
"rewards/cosine_scaled_reward": -0.18566999700851738,
"rewards/format_reward": 0.604166679084301,
"step": 442
},
{
"advantage_max": 0.18520016921684146,
"advantage_mean": -1.7850350295112172e-09,
"advantage_min": -0.21217109076678753,
"advantage_std": 0.16153573151677847,
"completion_length": 2472.000045776367,
"epoch": 0.5062857142857143,
"grad_norm": 0.03744587302207947,
"kl": 0.033538818359375,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0077,
"reward": 0.09077572450041771,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.16153573850169778,
"rewards/cosine_scaled_reward": -0.012071860954165459,
"rewards/format_reward": 0.5625000074505806,
"step": 443
},
{
"advantage_max": 0.16735272575169802,
"advantage_mean": -1.2417634004391331e-09,
"advantage_min": -0.10703678708523512,
"advantage_std": 0.10983354318886995,
"completion_length": 2711.916702270508,
"epoch": 0.5074285714285715,
"grad_norm": 0.1395997852087021,
"kl": 0.083709716796875,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0051,
"reward": 0.0046759541146457195,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.10983354831114411,
"rewards/cosine_scaled_reward": -0.1657371548935771,
"rewards/format_reward": 0.35416666977107525,
"step": 444
},
{
"advantage_max": 0.14539906289428473,
"advantage_mean": -4.423782298812462e-09,
"advantage_min": -0.1462356518022716,
"advantage_std": 0.11147704510949552,
"completion_length": 2879.6250534057617,
"epoch": 0.5085714285714286,
"grad_norm": 0.10607258975505829,
"kl": 0.10732269287109375,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0139,
"reward": 0.038264825008809566,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11147704510949552,
"rewards/cosine_scaled_reward": -0.10536502301692963,
"rewards/format_reward": 0.4375000111758709,
"step": 445
},
{
"advantage_max": 0.18112129345536232,
"advantage_mean": -1.6298145749660264e-09,
"advantage_min": -0.1495385468006134,
"advantage_std": 0.12615549704059958,
"completion_length": 2642.5208892822266,
"epoch": 0.5097142857142857,
"grad_norm": 0.18339461088180542,
"kl": 0.07524871826171875,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0173,
"reward": 0.041458213003352284,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.12615550868213177,
"rewards/cosine_scaled_reward": -0.13958797266241163,
"rewards/format_reward": 0.5208333376795053,
"step": 446
},
{
"advantage_max": 0.20427228696644306,
"advantage_mean": 3.1044089521259366e-10,
"advantage_min": -0.1569793475791812,
"advantage_std": 0.14149027224630117,
"completion_length": 2453.458427429199,
"epoch": 0.5108571428571429,
"grad_norm": 0.05572226271033287,
"kl": 0.0703887939453125,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0117,
"reward": 0.09940684377215803,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.14149027597159147,
"rewards/cosine_scaled_reward": -0.06487349630333483,
"rewards/format_reward": 0.7083333376795053,
"step": 447
},
{
"advantage_max": 0.14345520036295056,
"advantage_mean": -4.656613053488634e-09,
"advantage_min": -0.14839566173031926,
"advantage_std": 0.11676807375624776,
"completion_length": 2164.645881652832,
"epoch": 0.512,
"grad_norm": 0.11576156318187714,
"kl": 0.07922554016113281,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.0173,
"reward": 0.11762388469651341,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1167680760845542,
"rewards/cosine_scaled_reward": 0.03612196817994118,
"rewards/format_reward": 0.6041666753590107,
"step": 448
},
{
"advantage_max": 0.12314849765971303,
"advantage_mean": -2.8715780586718864e-09,
"advantage_min": -0.09987784596160054,
"advantage_std": 0.09292110335081816,
"completion_length": 1891.7500534057617,
"epoch": 0.5131428571428571,
"grad_norm": 0.036677490919828415,
"kl": 0.051727294921875,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0053,
"reward": 0.05903090629726648,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.09292111033573747,
"rewards/cosine_scaled_reward": -0.1294665727764368,
"rewards/format_reward": 0.6041666753590107,
"step": 449
},
{
"advantage_max": 0.12233589449897408,
"advantage_mean": -2.483526884144993e-09,
"advantage_min": -0.12476734491065145,
"advantage_std": 0.09501770418137312,
"completion_length": 2092.4791946411133,
"epoch": 0.5142857142857142,
"grad_norm": 0.08849738538265228,
"kl": 0.0531463623046875,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0012,
"reward": 0.07593667833134532,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09501770790666342,
"rewards/cosine_scaled_reward": -0.08989906311035156,
"rewards/format_reward": 0.6250000055879354,
"step": 450
},
{
"advantage_max": 0.17539388965815306,
"advantage_mean": -6.519258161086228e-09,
"advantage_min": -0.13914753962308168,
"advantage_std": 0.1160832904279232,
"completion_length": 2289.95841217041,
"epoch": 0.5154285714285715,
"grad_norm": 0.08232926577329636,
"kl": 0.0707550048828125,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0146,
"reward": 0.1173245128011331,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11608328856527805,
"rewards/cosine_scaled_reward": 0.010675758589059114,
"rewards/format_reward": 0.6666666828095913,
"step": 451
},
{
"advantage_max": 0.2101496052928269,
"advantage_mean": -4.190951877203197e-09,
"advantage_min": -0.11972010880708694,
"advantage_std": 0.12568515678867698,
"completion_length": 2983.8750610351562,
"epoch": 0.5165714285714286,
"grad_norm": 0.05641166865825653,
"kl": 0.0640869140625,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0053,
"reward": 0.09685743995942175,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12568516144528985,
"rewards/cosine_scaled_reward": 0.06668644212186337,
"rewards/format_reward": 0.43750000558793545,
"step": 452
},
{
"advantage_max": 0.205594627186656,
"advantage_mean": -3.182018815106602e-09,
"advantage_min": -0.14165670238435268,
"advantage_std": 0.14093559887260199,
"completion_length": 2465.9792556762695,
"epoch": 0.5177142857142857,
"grad_norm": 0.06710201501846313,
"kl": 0.05599212646484375,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.0055,
"reward": 0.06225473037920892,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.14093559375032783,
"rewards/cosine_scaled_reward": -0.10087771527469158,
"rewards/format_reward": 0.5625000018626451,
"step": 453
},
{
"advantage_max": 0.16545915231108665,
"advantage_mean": 3.2596291082986895e-09,
"advantage_min": -0.1489626714028418,
"advantage_std": 0.13455732073634863,
"completion_length": 2598.0833892822266,
"epoch": 0.5188571428571429,
"grad_norm": 0.041934214532375336,
"kl": 0.04929351806640625,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0007,
"reward": 0.05813994584605098,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.13455732772126794,
"rewards/cosine_scaled_reward": -0.13080565445125103,
"rewards/format_reward": 0.6041666716337204,
"step": 454
},
{
"advantage_max": 0.14655470103025436,
"advantage_mean": 1.1641532182693481e-10,
"advantage_min": -0.12023748084902763,
"advantage_std": 0.1034624595195055,
"completion_length": 2623.9375610351562,
"epoch": 0.52,
"grad_norm": 0.09280390292406082,
"kl": 0.0705718994140625,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0236,
"reward": -0.008920757623855025,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10346246417611837,
"rewards/cosine_scaled_reward": -0.22455782443284988,
"rewards/format_reward": 0.3958333432674408,
"step": 455
},
{
"advantage_max": 0.1612388575449586,
"advantage_mean": 3.8805108432127255e-11,
"advantage_min": -0.08964319387450814,
"advantage_std": 0.09334565093740821,
"completion_length": 2911.5208740234375,
"epoch": 0.5211428571428571,
"grad_norm": 0.045997679233551025,
"kl": 0.07303619384765625,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0133,
"reward": -0.03551657311618328,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.09334565559402108,
"rewards/cosine_scaled_reward": -0.24067565985023975,
"rewards/format_reward": 0.2708333432674408,
"step": 456
},
{
"advantage_max": 0.13807542622089386,
"advantage_mean": -1.940255463239726e-09,
"advantage_min": -0.12592454068362713,
"advantage_std": 0.10808086302131414,
"completion_length": 2840.4375610351562,
"epoch": 0.5222857142857142,
"grad_norm": 0.06265939027070999,
"kl": 0.0859375,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0117,
"reward": 0.005134745966643095,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1080808648839593,
"rewards/cosine_scaled_reward": -0.1515765618532896,
"rewards/format_reward": 0.3333333358168602,
"step": 457
},
{
"advantage_max": 0.18150201439857483,
"advantage_mean": 2.793967751602011e-09,
"advantage_min": -0.13712891470640898,
"advantage_std": 0.13010421255603433,
"completion_length": 2471.312515258789,
"epoch": 0.5234285714285715,
"grad_norm": 0.07266143709421158,
"kl": 0.0667877197265625,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.0112,
"reward": 0.03780588391236961,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13010421255603433,
"rewards/cosine_scaled_reward": -0.13179424591362476,
"rewards/format_reward": 0.4791666753590107,
"step": 458
},
{
"advantage_max": 0.2504174951463938,
"advantage_mean": -7.450580846724009e-09,
"advantage_min": -0.23487072996795177,
"advantage_std": 0.19904521945863962,
"completion_length": 2292.5833740234375,
"epoch": 0.5245714285714286,
"grad_norm": 0.07614655047655106,
"kl": 0.0789947509765625,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0051,
"reward": 0.14349280833266675,
"reward_advantage_correlation": 1.0,
"reward_std": 0.19904522132128477,
"rewards/cosine_scaled_reward": 0.10745827108621597,
"rewards/format_reward": 0.6250000037252903,
"step": 459
},
{
"advantage_max": 0.1652562553063035,
"advantage_mean": -8.149072666663315e-10,
"advantage_min": -0.15781007520854473,
"advantage_std": 0.13692284328863025,
"completion_length": 2972.916748046875,
"epoch": 0.5257142857142857,
"grad_norm": 0.256010502576828,
"kl": 0.0777587890625,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0243,
"reward": 0.012114565295632929,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13692284747958183,
"rewards/cosine_scaled_reward": -0.16331932321190834,
"rewards/format_reward": 0.3958333432674408,
"step": 460
},
{
"advantage_max": 0.1954550128430128,
"advantage_mean": -3.8805108432127255e-11,
"advantage_min": -0.16478789877146482,
"advantage_std": 0.1376098650507629,
"completion_length": 2490.5833587646484,
"epoch": 0.5268571428571428,
"grad_norm": 0.04703768342733383,
"kl": 0.053020477294921875,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0128,
"reward": 0.04842737386934459,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13760986551642418,
"rewards/cosine_scaled_reward": -0.10884158127009869,
"rewards/format_reward": 0.5000000074505806,
"step": 461
},
{
"advantage_max": 0.10769666731357574,
"advantage_mean": -9.895302181817112e-10,
"advantage_min": -0.09830914624035358,
"advantage_std": 0.07689572288654745,
"completion_length": 2611.68754196167,
"epoch": 0.528,
"grad_norm": 0.06967299431562424,
"kl": 0.0802764892578125,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.014,
"reward": -0.02583047526422888,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.07689572405070066,
"rewards/cosine_scaled_reward": -0.2742087193764746,
"rewards/format_reward": 0.39583334513008595,
"step": 462
},
{
"advantage_max": 0.11235859198495746,
"advantage_mean": -7.761020853758183e-10,
"advantage_min": -0.10793718742206693,
"advantage_std": 0.08298033010214567,
"completion_length": 2999.4583587646484,
"epoch": 0.5291428571428571,
"grad_norm": 0.0980202853679657,
"kl": 0.07388687133789062,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.008,
"reward": 0.028239358702194295,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08298033056780696,
"rewards/cosine_scaled_reward": -0.061539738439023495,
"rewards/format_reward": 0.29166666977107525,
"step": 463
},
{
"advantage_max": 0.07747854851186275,
"advantage_mean": -2.173086016687975e-09,
"advantage_min": -0.10103305988013744,
"advantage_std": 0.07666776818223298,
"completion_length": 2035.020851135254,
"epoch": 0.5302857142857142,
"grad_norm": 0.11843991279602051,
"kl": 0.06780242919921875,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.0076,
"reward": 0.12095656874589622,
"reward_advantage_correlation": 1.0,
"reward_std": 0.0766677693463862,
"rewards/cosine_scaled_reward": 0.06199156865477562,
"rewards/format_reward": 0.5833333432674408,
"step": 464
},
{
"advantage_max": 0.19982548616826534,
"advantage_mean": -2.2506963168189564e-09,
"advantage_min": -0.16617505624890327,
"advantage_std": 0.1316032218746841,
"completion_length": 2798.500030517578,
"epoch": 0.5314285714285715,
"grad_norm": 0.10557418316602707,
"kl": 0.0775604248046875,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0201,
"reward": 0.0002755961613729596,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1316032288596034,
"rewards/cosine_scaled_reward": -0.19768394669517875,
"rewards/format_reward": 0.39583334885537624,
"step": 465
},
{
"advantage_max": 0.14168964140117168,
"advantage_mean": 3.6864852975826423e-09,
"advantage_min": -0.13088442478328943,
"advantage_std": 0.12304930435493588,
"completion_length": 2720.0833740234375,
"epoch": 0.5325714285714286,
"grad_norm": 0.06551773101091385,
"kl": 0.08535003662109375,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0194,
"reward": 0.05027168616652489,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12304930854588747,
"rewards/cosine_scaled_reward": -0.018773799762129784,
"rewards/format_reward": 0.3333333395421505,
"step": 466
},
{
"advantage_max": 0.14841501927003264,
"advantage_mean": -4.346172137459359e-09,
"advantage_min": -0.13719767704606056,
"advantage_std": 0.10813061986118555,
"completion_length": 2950.7708740234375,
"epoch": 0.5337142857142857,
"grad_norm": 0.0421525277197361,
"kl": 0.078033447265625,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0125,
"reward": 0.018481011386029422,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.10813062265515327,
"rewards/cosine_scaled_reward": -0.12253948114812374,
"rewards/format_reward": 0.354166679084301,
"step": 467
},
{
"advantage_max": 0.14324644766747952,
"advantage_mean": -2.0372682273811504e-10,
"advantage_min": -0.0815016619162634,
"advantage_std": 0.08769956149626523,
"completion_length": 2793.041679382324,
"epoch": 0.5348571428571428,
"grad_norm": 0.06327426433563232,
"kl": 0.0679473876953125,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0069,
"reward": -0.028568633482791483,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.08769956947071478,
"rewards/cosine_scaled_reward": -0.26157646207138896,
"rewards/format_reward": 0.3541666679084301,
"step": 468
},
{
"advantage_max": 0.10998845845460892,
"advantage_mean": 3.0656035349130306e-09,
"advantage_min": -0.09244827646762133,
"advantage_std": 0.07723978580906987,
"completion_length": 2816.6666946411133,
"epoch": 0.536,
"grad_norm": 0.07730638980865479,
"kl": 0.0949859619140625,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.0132,
"reward": 0.026579681783914566,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.07723978534340858,
"rewards/cosine_scaled_reward": -0.07851309701800346,
"rewards/format_reward": 0.31250000186264515,
"step": 469
},
{
"advantage_max": 0.17276223795488477,
"advantage_mean": 4.035731332452386e-09,
"advantage_min": -0.15901347948238254,
"advantage_std": 0.1274056350812316,
"completion_length": 2975.437545776367,
"epoch": 0.5371428571428571,
"grad_norm": 0.05028533563017845,
"kl": 0.07293701171875,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0147,
"reward": 0.06911044649314135,
"reward_advantage_correlation": 1.0,
"reward_std": 0.12740564392879605,
"rewards/cosine_scaled_reward": -0.037810999900102615,
"rewards/format_reward": 0.47916667349636555,
"step": 470
},
{
"advantage_max": 0.14420464355498552,
"advantage_mean": -5.3551047971001076e-09,
"advantage_min": -0.09883822966367006,
"advantage_std": 0.09858017042279243,
"completion_length": 2936.1458740234375,
"epoch": 0.5382857142857143,
"grad_norm": 0.07671011984348297,
"kl": 0.071929931640625,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0169,
"reward": 0.04580727685242891,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09858017321676016,
"rewards/cosine_scaled_reward": -0.03325042815413326,
"rewards/format_reward": 0.3333333358168602,
"step": 471
},
{
"advantage_max": 0.12537370715290308,
"advantage_mean": -1.9014503166436825e-09,
"advantage_min": -0.12702995259314775,
"advantage_std": 0.105596958193928,
"completion_length": 2686.125030517578,
"epoch": 0.5394285714285715,
"grad_norm": 0.052131302654743195,
"kl": 0.05517578125,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0072,
"reward": 0.03351810248568654,
"reward_advantage_correlation": 0.9999999999999997,
"reward_std": 0.10559695912525058,
"rewards/cosine_scaled_reward": -0.15133550064638257,
"rewards/format_reward": 0.5000000111758709,
"step": 472
},
{
"advantage_max": 0.16438235435634851,
"advantage_mean": 1.7462298690373856e-09,
"advantage_min": -0.1541678113862872,
"advantage_std": 0.12999227130785584,
"completion_length": 2772.229202270508,
"epoch": 0.5405714285714286,
"grad_norm": 0.20341692864894867,
"kl": 0.0763702392578125,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0181,
"reward": 0.011351976543664932,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1299922768957913,
"rewards/cosine_scaled_reward": -0.15522437915205956,
"rewards/format_reward": 0.3750000111758709,
"step": 473
},
{
"advantage_max": 0.12965345289558172,
"advantage_mean": -1.0477379214224314e-08,
"advantage_min": -0.13344207033514977,
"advantage_std": 0.1010923438007012,
"completion_length": 2297.3333740234375,
"epoch": 0.5417142857142857,
"grad_norm": 0.03711497038602829,
"kl": 0.05191802978515625,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0078,
"reward": 0.1403076218557544,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10109234321862459,
"rewards/cosine_scaled_reward": 0.16363799665123224,
"rewards/format_reward": 0.5000000093132257,
"step": 474
},
{
"advantage_max": 0.24887295626103878,
"advantage_mean": -1.1641532182693481e-09,
"advantage_min": -0.16283766739070415,
"advantage_std": 0.16864926600828767,
"completion_length": 2069.479217529297,
"epoch": 0.5428571428571428,
"grad_norm": 0.20244264602661133,
"kl": 0.05932807922363281,
"learning_rate": 1.068365111445064e-07,
"loss": 0.02,
"reward": 0.10613728279713541,
"reward_advantage_correlation": 1.0,
"reward_std": 0.16864926647394896,
"rewards/cosine_scaled_reward": -0.0021053925156593323,
"rewards/format_reward": 0.6250000037252903,
"step": 475
},
{
"advantage_max": 0.2350059635937214,
"advantage_mean": -3.1044085357923024e-09,
"advantage_min": -0.2852534279227257,
"advantage_std": 0.22215755190700293,
"completion_length": 2249.7500762939453,
"epoch": 0.544,
"grad_norm": 0.27493569254875183,
"kl": 0.056476593017578125,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0296,
"reward": 0.18973252084106207,
"reward_advantage_correlation": 1.0,
"reward_std": 0.22215756494551897,
"rewards/cosine_scaled_reward": 0.2016936163417995,
"rewards/format_reward": 0.7083333563059568,
"step": 476
},
{
"advantage_max": 0.20565590541809797,
"advantage_mean": 2.173086016687975e-09,
"advantage_min": -0.20990248955786228,
"advantage_std": 0.16222712211310863,
"completion_length": 1889.9792137145996,
"epoch": 0.5451428571428572,
"grad_norm": 0.08550294488668442,
"kl": 0.049556732177734375,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.007,
"reward": 0.1523238776717335,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1622271267697215,
"rewards/cosine_scaled_reward": 0.10249492339789867,
"rewards/format_reward": 0.6875000223517418,
"step": 477
},
{
"advantage_max": 0.19569236552342772,
"advantage_mean": -5.355105053839182e-09,
"advantage_min": -0.170789347961545,
"advantage_std": 0.14821833465248346,
"completion_length": 2707.7917251586914,
"epoch": 0.5462857142857143,
"grad_norm": 0.0707746297121048,
"kl": 0.04302215576171875,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0062,
"reward": 0.08682430069893599,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.14821833930909634,
"rewards/cosine_scaled_reward": 0.005293058231472969,
"rewards/format_reward": 0.5000000111758709,
"step": 478
},
{
"advantage_max": 0.20708859246224165,
"advantage_mean": 3.49245968256362e-09,
"advantage_min": -0.0934189772233367,
"advantage_std": 0.11944379657506943,
"completion_length": 2784.8750610351562,
"epoch": 0.5474285714285714,
"grad_norm": 0.06040840968489647,
"kl": 0.058135986328125,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0089,
"reward": 0.025700575672090054,
"reward_advantage_correlation": 1.0,
"reward_std": 0.11944379936903715,
"rewards/cosine_scaled_reward": -0.18563230335712433,
"rewards/format_reward": 0.5208333376795053,
"step": 479
},
{
"advantage_max": 0.09567822678945959,
"advantage_mean": -1.358178718796621e-09,
"advantage_min": -0.12376060243695974,
"advantage_std": 0.08115771319717169,
"completion_length": 2284.291732788086,
"epoch": 0.5485714285714286,
"grad_norm": 0.04082602262496948,
"kl": 0.0604248046875,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0068,
"reward": 0.0292730022338219,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08115771505981684,
"rewards/cosine_scaled_reward": -0.17489056783961132,
"rewards/format_reward": 0.5208333469927311,
"step": 480
},
{
"advantage_max": 0.15009752474725246,
"advantage_mean": -1.2417634628891783e-09,
"advantage_min": -0.10813257563859224,
"advantage_std": 0.0981076592579484,
"completion_length": 2793.0625762939453,
"epoch": 0.5497142857142857,
"grad_norm": 0.039628688246011734,
"kl": 0.0657196044921875,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.012,
"reward": 0.006433199101593345,
"reward_advantage_correlation": 1.0,
"reward_std": 0.09810765460133553,
"rewards/cosine_scaled_reward": -0.24164481833577156,
"rewards/format_reward": 0.5208333469927311,
"step": 481
},
{
"advantage_max": 0.18115054722875357,
"advantage_mean": 3.3372393737352013e-09,
"advantage_min": -0.11325318366289139,
"advantage_std": 0.11549648176878691,
"completion_length": 2272.8958892822266,
"epoch": 0.5508571428571428,
"grad_norm": 0.05143104866147041,
"kl": 0.0427398681640625,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0101,
"reward": 0.08894058922305703,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1154964854940772,
"rewards/cosine_scaled_reward": -0.01966316059406381,
"rewards/format_reward": 0.5625000055879354,
"step": 482
},
{
"advantage_max": 0.15216759871691465,
"advantage_mean": -4.190951752303107e-09,
"advantage_min": -0.1571501288563013,
"advantage_std": 0.11147296661511064,
"completion_length": 2788.2708740234375,
"epoch": 0.552,
"grad_norm": 0.07072841376066208,
"kl": 0.05242156982421875,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0069,
"reward": 0.047933751717209816,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.11147296894341707,
"rewards/cosine_scaled_reward": -0.10883669415488839,
"rewards/format_reward": 0.5000000223517418,
"step": 483
},
{
"advantage_max": 0.17311536194756627,
"advantage_mean": -3.880510884846089e-09,
"advantage_min": -0.14954284392297268,
"advantage_std": 0.1357504017651081,
"completion_length": 2194.6458625793457,
"epoch": 0.5531428571428572,
"grad_norm": 0.06790361553430557,
"kl": 0.04769134521484375,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.0047,
"reward": 0.08819579007104039,
"reward_advantage_correlation": 1.0,
"reward_std": 0.13575040455907583,
"rewards/cosine_scaled_reward": -0.04436913412064314,
"rewards/format_reward": 0.6041666679084301,
"step": 484
},
{
"advantage_max": 0.2007480701431632,
"advantage_mean": -1.7074247016246602e-09,
"advantage_min": -0.21596927661448717,
"advantage_std": 0.15590744372457266,
"completion_length": 2008.2708892822266,
"epoch": 0.5542857142857143,
"grad_norm": 0.1535339206457138,
"kl": 0.056400299072265625,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0204,
"reward": 0.11056611873209476,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1559074493125081,
"rewards/cosine_scaled_reward": -0.017422407865524292,
"rewards/format_reward": 0.6875000204890966,
"step": 485
},
{
"advantage_max": 0.13929086178541183,
"advantage_mean": -5.665545983746245e-09,
"advantage_min": -0.08767893025651574,
"advantage_std": 0.08762249257415533,
"completion_length": 2356.1667098999023,
"epoch": 0.5554285714285714,
"grad_norm": 0.1545788198709488,
"kl": 0.08535957336425781,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0109,
"reward": 0.0351280951872468,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08762249490246177,
"rewards/cosine_scaled_reward": -0.11772407731041312,
"rewards/format_reward": 0.43750000186264515,
"step": 486
},
{
"advantage_max": 0.1587705770507455,
"advantage_mean": -7.101335020021082e-09,
"advantage_min": -0.15303611755371094,
"advantage_std": 0.1276407791301608,
"completion_length": 1725.5208358764648,
"epoch": 0.5565714285714286,
"grad_norm": 0.025498030707240105,
"kl": 0.0302734375,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.0019,
"reward": 0.18238393031060696,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.12764078192412853,
"rewards/cosine_scaled_reward": 0.16194906132295728,
"rewards/format_reward": 0.7500000111758709,
"step": 487
},
{
"advantage_max": 0.09379342943429947,
"advantage_mean": -1.0865430222217753e-09,
"advantage_min": -0.14438163582235575,
"advantage_std": 0.08854867145419121,
"completion_length": 1983.2500381469727,
"epoch": 0.5577142857142857,
"grad_norm": 0.07698236405849457,
"kl": 0.0438690185546875,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.0027,
"reward": 0.04897049597639125,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08854867285117507,
"rewards/cosine_scaled_reward": -0.16868338361382484,
"rewards/format_reward": 0.6250000223517418,
"step": 488
},
{
"advantage_max": 0.15139921591617167,
"advantage_mean": 1.7074247571358114e-09,
"advantage_min": -0.1027588089928031,
"advantage_std": 0.10175243532285094,
"completion_length": 3048.5833892822266,
"epoch": 0.5588571428571428,
"grad_norm": 0.08280462771654129,
"kl": 0.09661865234375,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0149,
"reward": -0.036369886714965105,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10175243951380253,
"rewards/cosine_scaled_reward": -0.23218814376741648,
"rewards/format_reward": 0.2500000074505806,
"step": 489
},
{
"advantage_max": 0.16806945484131575,
"advantage_mean": -5.587935794637566e-09,
"advantage_min": -0.10560749378055334,
"advantage_std": 0.10307761421427131,
"completion_length": 2269.4375610351562,
"epoch": 0.56,
"grad_norm": 0.04493989422917366,
"kl": 0.047054290771484375,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0087,
"reward": 0.0595971189904958,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.1030776179395616,
"rewards/cosine_scaled_reward": -0.1576940380036831,
"rewards/format_reward": 0.6666666772216558,
"step": 490
},
{
"advantage_max": 0.16114369360730052,
"advantage_mean": -2.910383146287332e-09,
"advantage_min": -0.17943292623385787,
"advantage_std": 0.13655775994993746,
"completion_length": 2471.375068664551,
"epoch": 0.5611428571428572,
"grad_norm": 0.07799820601940155,
"kl": 0.08380126953125,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0112,
"reward": 0.15404712711460888,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13655776320956647,
"rewards/cosine_scaled_reward": 0.16089667566120625,
"rewards/format_reward": 0.5833333376795053,
"step": 491
},
{
"advantage_max": 0.13228206429630518,
"advantage_mean": -1.785034897672233e-09,
"advantage_min": -0.1156699366401881,
"advantage_std": 0.10165782272815704,
"completion_length": 2402.5833892822266,
"epoch": 0.5622857142857143,
"grad_norm": 0.11941049993038177,
"kl": 0.1121826171875,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.0104,
"reward": 0.04288489208556712,
"reward_advantage_correlation": 1.0,
"reward_std": 0.10165782598778605,
"rewards/cosine_scaled_reward": -0.02407931163907051,
"rewards/format_reward": 0.2916666716337204,
"step": 492
},
{
"advantage_max": 0.2312077321112156,
"advantage_mean": -5.5103253071564495e-09,
"advantage_min": -0.20651198737323284,
"advantage_std": 0.17073472030460835,
"completion_length": 2009.2500534057617,
"epoch": 0.5634285714285714,
"grad_norm": 0.08874164521694183,
"kl": 0.06610870361328125,
"learning_rate": 1.005372381963547e-07,
"loss": 0.015,
"reward": 0.12461904282099567,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.17073472402989864,
"rewards/cosine_scaled_reward": 0.02050326857715845,
"rewards/format_reward": 0.6875000074505806,
"step": 493
},
{
"advantage_max": 0.16276492271572351,
"advantage_mean": 6.984919101449272e-10,
"advantage_min": -0.107215684838593,
"advantage_std": 0.10512557066977024,
"completion_length": 2005.8333587646484,
"epoch": 0.5645714285714286,
"grad_norm": 0.026444094255566597,
"kl": 0.0601348876953125,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.009,
"reward": 0.1630040816962719,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.10512557113543153,
"rewards/cosine_scaled_reward": 0.03179990313947201,
"rewards/format_reward": 0.8958333395421505,
"step": 494
},
{
"advantage_max": 0.15834691934287548,
"advantage_mean": -3.9581211780381764e-09,
"advantage_min": -0.12651699222624302,
"advantage_std": 0.12769749155268073,
"completion_length": 2761.166679382324,
"epoch": 0.5657142857142857,
"grad_norm": 0.06954298168420792,
"kl": 0.0700836181640625,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0113,
"reward": 0.020432849414646626,
"reward_advantage_correlation": 1.0,
"reward_std": 0.1276974929496646,
"rewards/cosine_scaled_reward": -0.11860458739101887,
"rewards/format_reward": 0.3541666753590107,
"step": 495
},
{
"advantage_max": 0.1300267931073904,
"advantage_mean": -7.761022102759085e-10,
"advantage_min": -0.10878966562449932,
"advantage_std": 0.08884454821236432,
"completion_length": 1965.7708740234375,
"epoch": 0.5668571428571428,
"grad_norm": 0.03823034092783928,
"kl": 0.043365478515625,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0057,
"reward": 0.13961502793245018,
"reward_advantage_correlation": 1.0,
"reward_std": 0.08884455054067075,
"rewards/cosine_scaled_reward": 0.09778555016964674,
"rewards/format_reward": 0.6250000055879354,
"step": 496
},
{
"advantage_max": 0.16680945828557014,
"advantage_mean": -1.552204281773939e-09,
"advantage_min": -0.16890859883278608,
"advantage_std": 0.13287213910371065,
"completion_length": 2398.020866394043,
"epoch": 0.568,
"grad_norm": 0.05378331243991852,
"kl": 0.062191009521484375,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0095,
"reward": 0.1791801903546002,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13287214329466224,
"rewards/cosine_scaled_reward": 0.2254606424830854,
"rewards/format_reward": 0.6041666734963655,
"step": 497
},
{
"advantage_max": 0.24626314919441938,
"advantage_mean": -7.761021270091817e-10,
"advantage_min": -0.15786647517234087,
"advantage_std": 0.1566294403746724,
"completion_length": 2678.2292404174805,
"epoch": 0.5691428571428572,
"grad_norm": 0.10654427111148834,
"kl": 0.0792694091796875,
"learning_rate": 1.000438641958131e-07,
"loss": 0.0229,
"reward": 0.043071957159554586,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.15662944316864014,
"rewards/cosine_scaled_reward": -0.10418755980208516,
"rewards/format_reward": 0.4583333432674408,
"step": 498
},
{
"advantage_max": 0.1925945421680808,
"advantage_mean": -3.6476802134366437e-09,
"advantage_min": -0.2209562873467803,
"advantage_std": 0.1784151755273342,
"completion_length": 2237.104217529297,
"epoch": 0.5702857142857143,
"grad_norm": 0.06882494688034058,
"kl": 0.047565460205078125,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.0128,
"reward": 0.15613868786022067,
"reward_advantage_correlation": 0.9999999999999998,
"reward_std": 0.1784151801839471,
"rewards/cosine_scaled_reward": 0.10448653064668179,
"rewards/format_reward": 0.7083333432674408,
"step": 499
},
{
"advantage_max": 0.1740710544399917,
"advantage_mean": -2.793967751602011e-09,
"advantage_min": -0.15050521213561296,
"advantage_std": 0.135882212780416,
"completion_length": 2955.3959045410156,
"epoch": 0.5714285714285714,
"grad_norm": 0.10638931393623352,
"kl": 0.091705322265625,
"learning_rate": 1e-07,
"loss": 0.016,
"reward": 0.022939922448131256,
"reward_advantage_correlation": 0.9999999999999999,
"reward_std": 0.13588221883401275,
"rewards/cosine_scaled_reward": -0.14141679741442204,
"rewards/format_reward": 0.4166666753590107,
"step": 500
},
{
"epoch": 0.5714285714285714,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.007517670260873274,
"train_runtime": 53153.3665,
"train_samples_per_second": 0.452,
"train_steps_per_second": 0.009
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}