MMR-Adaptive-Smooth-GRPO / trainer_state.json
kangdawei's picture
Model save
3c62533 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5714285714285714,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 2571.2083587646484,
"epoch": 0.001142857142857143,
"grad_norm": 0.19696776568889618,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.0,
"reward": 0.4897647276520729,
"reward_std": 0.8290339708328247,
"rewards/cosine_scaled_reward": -0.015534311532974243,
"rewards/format_reward": 0.5208333488553762,
"step": 1
},
{
"completion_length": 2804.395881652832,
"epoch": 0.002285714285714286,
"grad_norm": 0.1806372106075287,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": 0.0,
"reward": 0.27539755776524544,
"reward_std": 0.42092563211917877,
"rewards/cosine_scaled_reward": -0.04980122856795788,
"rewards/format_reward": 0.37500000558793545,
"step": 2
},
{
"completion_length": 3339.625015258789,
"epoch": 0.0034285714285714284,
"grad_norm": 0.1699189692735672,
"kl": 4.197657108306885e-05,
"learning_rate": 6e-08,
"loss": 0.0,
"reward": -0.24649023730307817,
"reward_std": 0.7038179924711585,
"rewards/cosine_scaled_reward": -0.18574512389022857,
"rewards/format_reward": 0.1250000037252903,
"step": 3
},
{
"completion_length": 2276.2708892822266,
"epoch": 0.004571428571428572,
"grad_norm": 0.2699170410633087,
"kl": 3.144703805446625e-05,
"learning_rate": 8e-08,
"loss": 0.0,
"reward": 0.37421327200718224,
"reward_std": 0.6797358561307192,
"rewards/cosine_scaled_reward": -0.09414338041096926,
"rewards/format_reward": 0.5625000055879354,
"step": 4
},
{
"completion_length": 3310.1041870117188,
"epoch": 0.005714285714285714,
"grad_norm": 0.17548619210720062,
"kl": 4.331022500991821e-05,
"learning_rate": 1e-07,
"loss": 0.0,
"reward": -0.07299477732158266,
"reward_std": 0.7602944187819958,
"rewards/cosine_scaled_reward": -0.19274738454259932,
"rewards/format_reward": 0.31250000558793545,
"step": 5
},
{
"completion_length": 3136.104217529297,
"epoch": 0.006857142857142857,
"grad_norm": 0.20332929491996765,
"kl": 4.7653913497924805e-05,
"learning_rate": 1.2e-07,
"loss": 0.0,
"reward": -0.02561904746107757,
"reward_std": 1.0297068133950233,
"rewards/cosine_scaled_reward": -0.13780953222885728,
"rewards/format_reward": 0.25000000931322575,
"step": 6
},
{
"completion_length": 3268.2500915527344,
"epoch": 0.008,
"grad_norm": 0.14488613605499268,
"kl": 2.9772520065307617e-05,
"learning_rate": 1.4e-07,
"loss": 0.0,
"reward": 0.23762857168912888,
"reward_std": 1.0299683846533298,
"rewards/cosine_scaled_reward": -0.09993573231622577,
"rewards/format_reward": 0.43750000558793545,
"step": 7
},
{
"completion_length": 2672.9791870117188,
"epoch": 0.009142857142857144,
"grad_norm": 0.16940173506736755,
"kl": 2.1375715732574463e-05,
"learning_rate": 1.6e-07,
"loss": 0.0,
"reward": 0.6125958878546953,
"reward_std": 0.5378385670483112,
"rewards/cosine_scaled_reward": 0.07713126111775637,
"rewards/format_reward": 0.4583333358168602,
"step": 8
},
{
"completion_length": 3104.3334045410156,
"epoch": 0.010285714285714285,
"grad_norm": 0.2231517881155014,
"kl": 3.183633089065552e-05,
"learning_rate": 1.8e-07,
"loss": 0.0,
"reward": 0.09738215431571007,
"reward_std": 0.6578879225999117,
"rewards/cosine_scaled_reward": -0.128392253711354,
"rewards/format_reward": 0.3541666716337204,
"step": 9
},
{
"completion_length": 2790.8541717529297,
"epoch": 0.011428571428571429,
"grad_norm": 0.19208469986915588,
"kl": 3.1501054763793945e-05,
"learning_rate": 2e-07,
"loss": 0.0,
"reward": 0.008928850293159485,
"reward_std": 0.5447255074977875,
"rewards/cosine_scaled_reward": -0.18303558183833957,
"rewards/format_reward": 0.3750000074505806,
"step": 10
},
{
"completion_length": 3341.437530517578,
"epoch": 0.012571428571428572,
"grad_norm": 0.19327814877033234,
"kl": 2.907589077949524e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0,
"reward": -0.39456131402403116,
"reward_std": 0.5775978416204453,
"rewards/cosine_scaled_reward": -0.24936398956924677,
"rewards/format_reward": 0.1041666679084301,
"step": 11
},
{
"completion_length": 2766.7083740234375,
"epoch": 0.013714285714285714,
"grad_norm": 0.23883244395256042,
"kl": 3.895256668329239e-05,
"learning_rate": 2.4e-07,
"loss": 0.0,
"reward": 0.3529932126402855,
"reward_std": 0.861735014244914,
"rewards/cosine_scaled_reward": -0.08392009884119034,
"rewards/format_reward": 0.5208333488553762,
"step": 12
},
{
"completion_length": 2890.750030517578,
"epoch": 0.014857142857142857,
"grad_norm": 0.20623275637626648,
"kl": 2.8409063816070557e-05,
"learning_rate": 2.6e-07,
"loss": 0.0,
"reward": 0.39041033387184143,
"reward_std": 0.7538400888442993,
"rewards/cosine_scaled_reward": -0.023544855881482363,
"rewards/format_reward": 0.4375000111758709,
"step": 13
},
{
"completion_length": 2856.187530517578,
"epoch": 0.016,
"grad_norm": 0.16158564388751984,
"kl": 2.251937985420227e-05,
"learning_rate": 2.8e-07,
"loss": 0.0,
"reward": 0.11729078739881516,
"reward_std": 0.7728225328028202,
"rewards/cosine_scaled_reward": -0.13927128538489342,
"rewards/format_reward": 0.3958333358168602,
"step": 14
},
{
"completion_length": 2816.333354949951,
"epoch": 0.017142857142857144,
"grad_norm": 0.1757153868675232,
"kl": 3.409385681152344e-05,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.5039311908185482,
"reward_std": 0.6081470809876919,
"rewards/cosine_scaled_reward": 0.03321555629372597,
"rewards/format_reward": 0.43750000558793545,
"step": 15
},
{
"completion_length": 3518.9166870117188,
"epoch": 0.018285714285714287,
"grad_norm": 0.16117794811725616,
"kl": 4.215538501739502e-05,
"learning_rate": 3.2e-07,
"loss": 0.0,
"reward": -0.3141332839149982,
"reward_std": 0.5307199694216251,
"rewards/cosine_scaled_reward": -0.1987333269789815,
"rewards/format_reward": 0.0833333358168602,
"step": 16
},
{
"completion_length": 2329.0208587646484,
"epoch": 0.019428571428571427,
"grad_norm": 0.2838650941848755,
"kl": 4.272162914276123e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0,
"reward": 0.39212552830576897,
"reward_std": 0.791872002184391,
"rewards/cosine_scaled_reward": -0.07477057632058859,
"rewards/format_reward": 0.5416666679084301,
"step": 17
},
{
"completion_length": 2841.3125534057617,
"epoch": 0.02057142857142857,
"grad_norm": 0.15529987215995789,
"kl": 2.863258123397827e-05,
"learning_rate": 3.6e-07,
"loss": 0.0,
"reward": 0.2834795080125332,
"reward_std": 0.7086473144590855,
"rewards/cosine_scaled_reward": -0.10826026648283005,
"rewards/format_reward": 0.5000000111758709,
"step": 18
},
{
"completion_length": 3026.9583740234375,
"epoch": 0.021714285714285714,
"grad_norm": 0.16150601208209991,
"kl": 2.719089388847351e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0,
"reward": 0.6184139084070921,
"reward_std": 0.9973310008645058,
"rewards/cosine_scaled_reward": 0.11129027768038213,
"rewards/format_reward": 0.3958333395421505,
"step": 19
},
{
"completion_length": 2519.5000228881836,
"epoch": 0.022857142857142857,
"grad_norm": 0.22576642036437988,
"kl": 2.012774348258972e-05,
"learning_rate": 4e-07,
"loss": 0.0,
"reward": 0.6895501036196947,
"reward_std": 0.7981752147898078,
"rewards/cosine_scaled_reward": 0.021858368068933487,
"rewards/format_reward": 0.6458333469927311,
"step": 20
},
{
"completion_length": 2594.7916870117188,
"epoch": 0.024,
"grad_norm": 0.23808935284614563,
"kl": 3.5569071769714355e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0,
"reward": 0.33955152705311775,
"reward_std": 0.5852350238710642,
"rewards/cosine_scaled_reward": -0.048974241130054,
"rewards/format_reward": 0.43750000186264515,
"step": 21
},
{
"completion_length": 1815.8125457763672,
"epoch": 0.025142857142857144,
"grad_norm": 0.297273725271225,
"kl": 3.524869680404663e-05,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0,
"reward": 0.9210781529545784,
"reward_std": 0.6531081832945347,
"rewards/cosine_scaled_reward": 0.03345571830868721,
"rewards/format_reward": 0.8541666716337204,
"step": 22
},
{
"completion_length": 2265.041702270508,
"epoch": 0.026285714285714287,
"grad_norm": 0.2135019600391388,
"kl": 2.358853816986084e-05,
"learning_rate": 4.6e-07,
"loss": 0.0,
"reward": 0.46448952704668045,
"reward_std": 0.6991389617323875,
"rewards/cosine_scaled_reward": -0.06983858160674572,
"rewards/format_reward": 0.6041666734963655,
"step": 23
},
{
"completion_length": 2737.4584197998047,
"epoch": 0.027428571428571427,
"grad_norm": 0.20275644958019257,
"kl": 2.9724091291427612e-05,
"learning_rate": 4.8e-07,
"loss": 0.0,
"reward": 0.5923267342150211,
"reward_std": 0.9929416831582785,
"rewards/cosine_scaled_reward": 0.025330022268462926,
"rewards/format_reward": 0.5416666809469461,
"step": 24
},
{
"completion_length": 2758.7083740234375,
"epoch": 0.02857142857142857,
"grad_norm": 0.2127925157546997,
"kl": 3.1773000955581665e-05,
"learning_rate": 5e-07,
"loss": 0.0,
"reward": 0.32758328318595886,
"reward_std": 0.7631605602800846,
"rewards/cosine_scaled_reward": -0.06537504983134568,
"rewards/format_reward": 0.45833334140479565,
"step": 25
},
{
"completion_length": 3101.4166870117188,
"epoch": 0.029714285714285714,
"grad_norm": 0.1526872217655182,
"kl": 3.3758580684661865e-05,
"learning_rate": 5.2e-07,
"loss": 0.0,
"reward": 0.2650221809744835,
"reward_std": 0.6115698590874672,
"rewards/cosine_scaled_reward": -0.07582224532961845,
"rewards/format_reward": 0.41666667722165585,
"step": 26
},
{
"completion_length": 2927.1458740234375,
"epoch": 0.030857142857142857,
"grad_norm": 0.22624598443508148,
"kl": 4.612654447555542e-05,
"learning_rate": 5.4e-07,
"loss": 0.0,
"reward": 0.18323302548378706,
"reward_std": 0.7548771295696497,
"rewards/cosine_scaled_reward": -0.09588348306715488,
"rewards/format_reward": 0.3750000037252903,
"step": 27
},
{
"completion_length": 2783.541679382324,
"epoch": 0.032,
"grad_norm": 0.19066381454467773,
"kl": 3.3406540751457214e-05,
"learning_rate": 5.6e-07,
"loss": 0.0,
"reward": 0.3194316625595093,
"reward_std": 0.5980064831674099,
"rewards/cosine_scaled_reward": -0.048617489635944366,
"rewards/format_reward": 0.416666679084301,
"step": 28
},
{
"completion_length": 3331.8125610351562,
"epoch": 0.03314285714285714,
"grad_norm": 0.24491053819656372,
"kl": 2.5831162929534912e-05,
"learning_rate": 5.8e-07,
"loss": 0.0,
"reward": -0.3617453798651695,
"reward_std": 0.5833318009972572,
"rewards/cosine_scaled_reward": -0.25378935784101486,
"rewards/format_reward": 0.14583333767950535,
"step": 29
},
{
"completion_length": 2860.062545776367,
"epoch": 0.03428571428571429,
"grad_norm": 0.17712661623954773,
"kl": 1.9449740648269653e-05,
"learning_rate": 6e-07,
"loss": 0.0,
"reward": 0.5695777088403702,
"reward_std": 0.9141397215425968,
"rewards/cosine_scaled_reward": 0.03478884696960449,
"rewards/format_reward": 0.5000000093132257,
"step": 30
},
{
"completion_length": 2967.625045776367,
"epoch": 0.03542857142857143,
"grad_norm": 0.19428333640098572,
"kl": 3.3482909202575684e-05,
"learning_rate": 6.2e-07,
"loss": 0.0,
"reward": 0.05395581666380167,
"reward_std": 0.726109255105257,
"rewards/cosine_scaled_reward": -0.12927209632471204,
"rewards/format_reward": 0.3125000074505806,
"step": 31
},
{
"completion_length": 3092.8750610351562,
"epoch": 0.036571428571428574,
"grad_norm": 0.18985038995742798,
"kl": 2.6285648345947266e-05,
"learning_rate": 6.4e-07,
"loss": 0.0,
"reward": 0.18355572840664536,
"reward_std": 0.8745833523571491,
"rewards/cosine_scaled_reward": -0.0853054765611887,
"rewards/format_reward": 0.3541666753590107,
"step": 32
},
{
"completion_length": 3396.3750610351562,
"epoch": 0.037714285714285714,
"grad_norm": 0.1381026655435562,
"kl": 3.3989548683166504e-05,
"learning_rate": 6.6e-07,
"loss": 0.0,
"reward": 0.2385600535199046,
"reward_std": 0.840669609606266,
"rewards/cosine_scaled_reward": -0.03696998115628958,
"rewards/format_reward": 0.3125000074505806,
"step": 33
},
{
"completion_length": 2426.937545776367,
"epoch": 0.038857142857142854,
"grad_norm": 0.3463960886001587,
"kl": 2.298876643180847e-05,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0,
"reward": 0.8328317422419786,
"reward_std": 0.8242906630039215,
"rewards/cosine_scaled_reward": 0.12474916968494654,
"rewards/format_reward": 0.5833333414047956,
"step": 34
},
{
"completion_length": 3079.8750534057617,
"epoch": 0.04,
"grad_norm": 0.22398288547992706,
"kl": 4.157423973083496e-05,
"learning_rate": 7e-07,
"loss": 0.0,
"reward": -0.05081530287861824,
"reward_std": 0.9191469214856625,
"rewards/cosine_scaled_reward": -0.16082432121038437,
"rewards/format_reward": 0.2708333358168602,
"step": 35
},
{
"completion_length": 3372.6458740234375,
"epoch": 0.04114285714285714,
"grad_norm": 0.17480096220970154,
"kl": 3.078579902648926e-05,
"learning_rate": 7.2e-07,
"loss": 0.0,
"reward": -0.3681298622395843,
"reward_std": 0.6958093121647835,
"rewards/cosine_scaled_reward": -0.2673982698470354,
"rewards/format_reward": 0.1666666679084301,
"step": 36
},
{
"completion_length": 3249.5208435058594,
"epoch": 0.04228571428571429,
"grad_norm": 0.16790439188480377,
"kl": 2.954155206680298e-05,
"learning_rate": 7.4e-07,
"loss": 0.0,
"reward": -0.22375414264388382,
"reward_std": 0.5829485245049,
"rewards/cosine_scaled_reward": -0.22646040935069323,
"rewards/format_reward": 0.2291666679084301,
"step": 37
},
{
"completion_length": 3267.0208435058594,
"epoch": 0.04342857142857143,
"grad_norm": 0.15718603134155273,
"kl": 2.9595568776130676e-05,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0,
"reward": -0.2248917780816555,
"reward_std": 0.3576042940840125,
"rewards/cosine_scaled_reward": -0.17494588904082775,
"rewards/format_reward": 0.125,
"step": 38
},
{
"completion_length": 2847.5208587646484,
"epoch": 0.044571428571428574,
"grad_norm": 0.24797889590263367,
"kl": 1.7508864402770996e-05,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0,
"reward": 0.3429147396236658,
"reward_std": 0.48750871582888067,
"rewards/cosine_scaled_reward": -0.03687598556280136,
"rewards/format_reward": 0.41666666977107525,
"step": 39
},
{
"completion_length": 2490.791702270508,
"epoch": 0.045714285714285714,
"grad_norm": 0.18541350960731506,
"kl": 1.5174038708209991e-05,
"learning_rate": 8e-07,
"loss": 0.0,
"reward": 0.3607976003549993,
"reward_std": 0.40943362936377525,
"rewards/cosine_scaled_reward": -0.11126788146793842,
"rewards/format_reward": 0.5833333432674408,
"step": 40
},
{
"completion_length": 3017.6250610351562,
"epoch": 0.046857142857142854,
"grad_norm": 0.1669527143239975,
"kl": 2.2130087018013e-05,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0,
"reward": -0.09058963833376765,
"reward_std": 0.6663156133145094,
"rewards/cosine_scaled_reward": -0.22237816639244556,
"rewards/format_reward": 0.3541666679084301,
"step": 41
},
{
"completion_length": 2780.5208587646484,
"epoch": 0.048,
"grad_norm": 0.2479742020368576,
"kl": 5.4270029067993164e-05,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0,
"reward": -0.20546918595209718,
"reward_std": 0.47696714848279953,
"rewards/cosine_scaled_reward": -0.26940126344561577,
"rewards/format_reward": 0.33333333395421505,
"step": 42
},
{
"completion_length": 2762.8541870117188,
"epoch": 0.04914285714285714,
"grad_norm": 0.1957981437444687,
"kl": 2.5795772671699524e-05,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0,
"reward": 0.1344442442059517,
"reward_std": 0.6109267733991146,
"rewards/cosine_scaled_reward": -0.1098612155765295,
"rewards/format_reward": 0.35416667722165585,
"step": 43
},
{
"completion_length": 2733.7708740234375,
"epoch": 0.05028571428571429,
"grad_norm": 0.3306754231452942,
"kl": 5.932897329330444e-05,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0,
"reward": 0.36940332502126694,
"reward_std": 0.7888948805630207,
"rewards/cosine_scaled_reward": -0.044465010054409504,
"rewards/format_reward": 0.45833334513008595,
"step": 44
},
{
"completion_length": 3472.312530517578,
"epoch": 0.05142857142857143,
"grad_norm": 0.14603589475154877,
"kl": 3.156810998916626e-05,
"learning_rate": 9e-07,
"loss": 0.0,
"reward": -0.042608221992850304,
"reward_std": 0.5772924721240997,
"rewards/cosine_scaled_reward": -0.10463745961897075,
"rewards/format_reward": 0.16666667349636555,
"step": 45
},
{
"completion_length": 3173.9791870117188,
"epoch": 0.052571428571428575,
"grad_norm": 0.21064509451389313,
"kl": 3.273040056228638e-05,
"learning_rate": 9.2e-07,
"loss": 0.0,
"reward": -0.29001128301024437,
"reward_std": 0.4347268417477608,
"rewards/cosine_scaled_reward": -0.22833896055817604,
"rewards/format_reward": 0.1666666679084301,
"step": 46
},
{
"completion_length": 2814.500030517578,
"epoch": 0.053714285714285714,
"grad_norm": 0.2205515205860138,
"kl": 1.9498169422149658e-05,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0,
"reward": 0.5555428601801395,
"reward_std": 0.9152592048048973,
"rewards/cosine_scaled_reward": 0.027771430788561702,
"rewards/format_reward": 0.5000000074505806,
"step": 47
},
{
"completion_length": 2843.6250610351562,
"epoch": 0.054857142857142854,
"grad_norm": 0.1848597675561905,
"kl": 8.176267147064209e-05,
"learning_rate": 9.6e-07,
"loss": 0.0,
"reward": 0.15840810351073742,
"reward_std": 0.8359496407210827,
"rewards/cosine_scaled_reward": -0.09787929493177217,
"rewards/format_reward": 0.35416666977107525,
"step": 48
},
{
"completion_length": 2283.8541946411133,
"epoch": 0.056,
"grad_norm": 0.20812320709228516,
"kl": 3.5099685192108154e-05,
"learning_rate": 9.8e-07,
"loss": 0.0,
"reward": 0.5328511632978916,
"reward_std": 0.8218232821673155,
"rewards/cosine_scaled_reward": -0.025241072289645672,
"rewards/format_reward": 0.5833333414047956,
"step": 49
},
{
"completion_length": 2871.3125534057617,
"epoch": 0.05714285714285714,
"grad_norm": 0.17633675038814545,
"kl": 5.5462121963500977e-05,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 0.425536647439003,
"reward_std": 0.6618794202804565,
"rewards/cosine_scaled_reward": 0.025268293917179108,
"rewards/format_reward": 0.37500000186264515,
"step": 50
},
{
"completion_length": 2325.1458740234375,
"epoch": 0.05828571428571429,
"grad_norm": 0.24762925505638123,
"kl": 0.00016194581985473633,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0,
"reward": 0.2191239856183529,
"reward_std": 0.5495061241090298,
"rewards/cosine_scaled_reward": -0.16127135697752237,
"rewards/format_reward": 0.5416666716337204,
"step": 51
},
{
"completion_length": 2865.750045776367,
"epoch": 0.05942857142857143,
"grad_norm": 0.2184467613697052,
"kl": 8.131936192512512e-05,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0,
"reward": 0.49965737760066986,
"reward_std": 1.128704745322466,
"rewards/cosine_scaled_reward": 0.031078664120286703,
"rewards/format_reward": 0.4375000111758709,
"step": 52
},
{
"completion_length": 2812.062530517578,
"epoch": 0.060571428571428575,
"grad_norm": 0.23506155610084534,
"kl": 0.0001082010567188263,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0,
"reward": 0.17906612996011972,
"reward_std": 0.767060749232769,
"rewards/cosine_scaled_reward": -0.13963361305650324,
"rewards/format_reward": 0.45833333767950535,
"step": 53
},
{
"completion_length": 2882.104232788086,
"epoch": 0.061714285714285715,
"grad_norm": 0.20438477396965027,
"kl": 4.882924258708954e-05,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0,
"reward": 0.8570227089803666,
"reward_std": 1.0325051695108414,
"rewards/cosine_scaled_reward": 0.1576780043542385,
"rewards/format_reward": 0.541666679084301,
"step": 54
},
{
"completion_length": 2910.7291870117188,
"epoch": 0.06285714285714286,
"grad_norm": 0.19835922122001648,
"kl": 6.431713700294495e-05,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0,
"reward": 0.25683262944221497,
"reward_std": 0.8695680163800716,
"rewards/cosine_scaled_reward": -0.04866702202707529,
"rewards/format_reward": 0.35416667349636555,
"step": 55
},
{
"completion_length": 2741.3750534057617,
"epoch": 0.064,
"grad_norm": 0.1814289689064026,
"kl": 3.975629806518555e-05,
"learning_rate": 9.996052735444862e-07,
"loss": 0.0,
"reward": 0.47193842101842165,
"reward_std": 0.5631829425692558,
"rewards/cosine_scaled_reward": -0.003614123910665512,
"rewards/format_reward": 0.47916667349636555,
"step": 56
},
{
"completion_length": 3241.8334045410156,
"epoch": 0.06514285714285714,
"grad_norm": 0.1280871331691742,
"kl": 2.622697502374649e-05,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0,
"reward": 0.1633035959675908,
"reward_std": 0.93094053119421,
"rewards/cosine_scaled_reward": -0.09543154633138329,
"rewards/format_reward": 0.35416667349636555,
"step": 57
},
{
"completion_length": 2383.1667251586914,
"epoch": 0.06628571428571428,
"grad_norm": 0.2606426179409027,
"kl": 0.000448569655418396,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0,
"reward": 0.46104544680565596,
"reward_std": 0.8501972369849682,
"rewards/cosine_scaled_reward": -0.07156064454466105,
"rewards/format_reward": 0.6041666772216558,
"step": 58
},
{
"completion_length": 2776.9167098999023,
"epoch": 0.06742857142857143,
"grad_norm": 0.1767275035381317,
"kl": 4.192814230918884e-05,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0,
"reward": 0.1411968027241528,
"reward_std": 0.6273871827870607,
"rewards/cosine_scaled_reward": -0.08565161540172994,
"rewards/format_reward": 0.31250000186264515,
"step": 59
},
{
"completion_length": 3010.2708435058594,
"epoch": 0.06857142857142857,
"grad_norm": 0.16747058928012848,
"kl": 4.213489592075348e-05,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0,
"reward": -0.10470366384834051,
"reward_std": 0.5668917782604694,
"rewards/cosine_scaled_reward": -0.20860183122567832,
"rewards/format_reward": 0.3125000074505806,
"step": 60
},
{
"completion_length": 2930.125045776367,
"epoch": 0.06971428571428571,
"grad_norm": 0.18895801901817322,
"kl": 0.00015696324408054352,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0,
"reward": 0.31786643620580435,
"reward_std": 0.8822249062359333,
"rewards/cosine_scaled_reward": -0.08065013960003853,
"rewards/format_reward": 0.4791666753590107,
"step": 61
},
{
"completion_length": 2585.104232788086,
"epoch": 0.07085714285714285,
"grad_norm": 0.19783088564872742,
"kl": 0.00045157596468925476,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0,
"reward": 0.5959182996302843,
"reward_std": 0.8203712180256844,
"rewards/cosine_scaled_reward": -0.014540859963744879,
"rewards/format_reward": 0.6250000093132257,
"step": 62
},
{
"completion_length": 2321.2708587646484,
"epoch": 0.072,
"grad_norm": 0.19266927242279053,
"kl": 0.00032773613929748535,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0,
"reward": 0.9606147482991219,
"reward_std": 0.7731334287673235,
"rewards/cosine_scaled_reward": 0.13655737973749638,
"rewards/format_reward": 0.6875000149011612,
"step": 63
},
{
"completion_length": 2989.8958587646484,
"epoch": 0.07314285714285715,
"grad_norm": 0.21102704107761383,
"kl": 0.0001429772237315774,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0,
"reward": 0.30724555626511574,
"reward_std": 0.8989091999828815,
"rewards/cosine_scaled_reward": -0.054710563155822456,
"rewards/format_reward": 0.41666668467223644,
"step": 64
},
{
"completion_length": 2789.541690826416,
"epoch": 0.07428571428571429,
"grad_norm": 0.2767220735549927,
"kl": 0.0001298440620303154,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0,
"reward": 0.10150328651070595,
"reward_std": 0.5617774492129683,
"rewards/cosine_scaled_reward": -0.14716504141688347,
"rewards/format_reward": 0.39583334140479565,
"step": 65
},
{
"completion_length": 2099.3333435058594,
"epoch": 0.07542857142857143,
"grad_norm": 0.24364213645458221,
"kl": 0.00016443058848381042,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0,
"reward": 0.5610158704221249,
"reward_std": 0.46347122080624104,
"rewards/cosine_scaled_reward": 0.03050791099667549,
"rewards/format_reward": 0.5,
"step": 66
},
{
"completion_length": 3478.562530517578,
"epoch": 0.07657142857142857,
"grad_norm": 0.14123453199863434,
"kl": 0.00021456927061080933,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0,
"reward": -0.4186356011778116,
"reward_std": 0.4762955382466316,
"rewards/cosine_scaled_reward": -0.26140114292502403,
"rewards/format_reward": 0.1041666679084301,
"step": 67
},
{
"completion_length": 1994.0000534057617,
"epoch": 0.07771428571428571,
"grad_norm": 0.24095165729522705,
"kl": 0.0009656250476837158,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0,
"reward": 0.5732477158308029,
"reward_std": 0.779868096113205,
"rewards/cosine_scaled_reward": -0.04670950397849083,
"rewards/format_reward": 0.666666679084301,
"step": 68
},
{
"completion_length": 2451.6875228881836,
"epoch": 0.07885714285714286,
"grad_norm": 0.2385924905538559,
"kl": 0.0006657838821411133,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0,
"reward": 0.06796575582120568,
"reward_std": 0.5406197272241116,
"rewards/cosine_scaled_reward": -0.19518379587680101,
"rewards/format_reward": 0.45833334140479565,
"step": 69
},
{
"completion_length": 3113.541702270508,
"epoch": 0.08,
"grad_norm": 0.1962832510471344,
"kl": 0.0009769648313522339,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0,
"reward": 0.0411946764215827,
"reward_std": 0.5727978125214577,
"rewards/cosine_scaled_reward": -0.1773193427361548,
"rewards/format_reward": 0.3958333432674408,
"step": 70
},
{
"completion_length": 2725.270851135254,
"epoch": 0.08114285714285714,
"grad_norm": 0.18360581994056702,
"kl": 0.0005005262792110443,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0,
"reward": 0.38103893026709557,
"reward_std": 0.6639891043305397,
"rewards/cosine_scaled_reward": 0.003019465133547783,
"rewards/format_reward": 0.37500000558793545,
"step": 71
},
{
"completion_length": 2805.8333587646484,
"epoch": 0.08228571428571428,
"grad_norm": 0.20988361537456512,
"kl": 0.0007044821977615356,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0,
"reward": 0.25359107134863734,
"reward_std": 0.811398807913065,
"rewards/cosine_scaled_reward": -0.10237114643678069,
"rewards/format_reward": 0.45833335258066654,
"step": 72
},
{
"completion_length": 3307.854217529297,
"epoch": 0.08342857142857144,
"grad_norm": 0.1682441234588623,
"kl": 0.0001322627067565918,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0,
"reward": 0.15133656188845634,
"reward_std": 0.8552793562412262,
"rewards/cosine_scaled_reward": -0.04933173581957817,
"rewards/format_reward": 0.25000000931322575,
"step": 73
},
{
"completion_length": 3133.104248046875,
"epoch": 0.08457142857142858,
"grad_norm": 0.15636852383613586,
"kl": 0.00043823570013046265,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0,
"reward": 0.3009439923334867,
"reward_std": 0.9593954384326935,
"rewards/cosine_scaled_reward": -0.005778005812317133,
"rewards/format_reward": 0.3125000037252903,
"step": 74
},
{
"completion_length": 2883.166717529297,
"epoch": 0.08571428571428572,
"grad_norm": 0.15218932926654816,
"kl": 0.0008656233549118042,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0,
"reward": 0.32558897137641907,
"reward_std": 0.5437379367649555,
"rewards/cosine_scaled_reward": -0.03512220270931721,
"rewards/format_reward": 0.39583334140479565,
"step": 75
},
{
"completion_length": 2863.3125228881836,
"epoch": 0.08685714285714285,
"grad_norm": 0.15872140228748322,
"kl": 9.638071060180664e-05,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0,
"reward": 0.18390853703022003,
"reward_std": 0.5108423344790936,
"rewards/cosine_scaled_reward": -0.13721241243183613,
"rewards/format_reward": 0.45833334140479565,
"step": 76
},
{
"completion_length": 3153.354217529297,
"epoch": 0.088,
"grad_norm": 0.15251125395298004,
"kl": 0.00015661679208278656,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0,
"reward": 0.06627489440143108,
"reward_std": 0.4633176252245903,
"rewards/cosine_scaled_reward": -0.14394589699804783,
"rewards/format_reward": 0.35416666977107525,
"step": 77
},
{
"completion_length": 3143.916702270508,
"epoch": 0.08914285714285715,
"grad_norm": 0.16262713074684143,
"kl": 0.0005014901980757713,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0,
"reward": 0.38807435147464275,
"reward_std": 0.8590060994029045,
"rewards/cosine_scaled_reward": 0.016953811049461365,
"rewards/format_reward": 0.354166679084301,
"step": 78
},
{
"completion_length": 2234.041675567627,
"epoch": 0.09028571428571429,
"grad_norm": 0.22237282991409302,
"kl": 0.0009625256061553955,
"learning_rate": 9.908088623197048e-07,
"loss": 0.0,
"reward": 0.4828355088829994,
"reward_std": 0.6572117768228054,
"rewards/cosine_scaled_reward": -0.06066559627652168,
"rewards/format_reward": 0.6041666697710752,
"step": 79
},
{
"completion_length": 3146.1250610351562,
"epoch": 0.09142857142857143,
"grad_norm": 0.16743233799934387,
"kl": 0.00040898716542869806,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0,
"reward": 0.3415686395019293,
"reward_std": 0.9433976151049137,
"rewards/cosine_scaled_reward": -0.016715684439986944,
"rewards/format_reward": 0.37500000558793545,
"step": 80
},
{
"completion_length": 2960.333366394043,
"epoch": 0.09257142857142857,
"grad_norm": 0.31833556294441223,
"kl": 0.0011633634567260742,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0,
"reward": 0.0825501810759306,
"reward_std": 0.7551630176603794,
"rewards/cosine_scaled_reward": -0.1253915773704648,
"rewards/format_reward": 0.33333334513008595,
"step": 81
},
{
"completion_length": 2789.666679382324,
"epoch": 0.09371428571428571,
"grad_norm": 0.1964961588382721,
"kl": 0.0009930580854415894,
"learning_rate": 9.888172094375033e-07,
"loss": 0.0,
"reward": 0.32206014543771744,
"reward_std": 0.8189963661134243,
"rewards/cosine_scaled_reward": -0.05771994253154844,
"rewards/format_reward": 0.4375000074505806,
"step": 82
},
{
"completion_length": 2697.8541870117188,
"epoch": 0.09485714285714286,
"grad_norm": 0.23213206231594086,
"kl": 0.0010943468660116196,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0,
"reward": 0.06216884404420853,
"reward_std": 0.5933049730956554,
"rewards/cosine_scaled_reward": -0.14599892310798168,
"rewards/format_reward": 0.35416666977107525,
"step": 83
},
{
"completion_length": 3034.8958587646484,
"epoch": 0.096,
"grad_norm": 0.16874635219573975,
"kl": 0.0005750656127929688,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0,
"reward": 0.3496646843268536,
"reward_std": 0.9294291753321886,
"rewards/cosine_scaled_reward": -0.02308431826531887,
"rewards/format_reward": 0.39583334513008595,
"step": 84
},
{
"completion_length": 3031.625045776367,
"epoch": 0.09714285714285714,
"grad_norm": 0.14502210915088654,
"kl": 0.0002237856388092041,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0,
"reward": 0.11518191546201706,
"reward_std": 0.844723105430603,
"rewards/cosine_scaled_reward": -0.15074237808585167,
"rewards/format_reward": 0.4166666753590107,
"step": 85
},
{
"completion_length": 2953.500045776367,
"epoch": 0.09828571428571428,
"grad_norm": 0.17785117030143738,
"kl": 0.0008893311023712158,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0,
"reward": 0.19509585201740265,
"reward_std": 0.6929262951016426,
"rewards/cosine_scaled_reward": -0.08995212335139513,
"rewards/format_reward": 0.3750000074505806,
"step": 86
},
{
"completion_length": 2735.4375762939453,
"epoch": 0.09942857142857142,
"grad_norm": 0.23141460120677948,
"kl": 0.0007982850074768066,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0,
"reward": 0.3154503256082535,
"reward_std": 0.7814465276896954,
"rewards/cosine_scaled_reward": -0.10269152000546455,
"rewards/format_reward": 0.5208333395421505,
"step": 87
},
{
"completion_length": 2611.250068664551,
"epoch": 0.10057142857142858,
"grad_norm": 0.21885579824447632,
"kl": 0.004000961780548096,
"learning_rate": 9.8425742251254e-07,
"loss": 0.0002,
"reward": 0.700496843084693,
"reward_std": 0.977291576564312,
"rewards/cosine_scaled_reward": 0.04816507982468465,
"rewards/format_reward": 0.6041666828095913,
"step": 88
},
{
"completion_length": 3090.6250610351562,
"epoch": 0.10171428571428572,
"grad_norm": 0.18484243750572205,
"kl": 0.0012155771255493164,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0,
"reward": 0.1298842504620552,
"reward_std": 0.7731972448527813,
"rewards/cosine_scaled_reward": -0.1121412068605423,
"rewards/format_reward": 0.3541666753590107,
"step": 89
},
{
"completion_length": 2396.8125228881836,
"epoch": 0.10285714285714286,
"grad_norm": 0.26242735981941223,
"kl": 0.0012450218200683594,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0,
"reward": -0.022300932556390762,
"reward_std": 0.4295784495770931,
"rewards/cosine_scaled_reward": -0.2715671341866255,
"rewards/format_reward": 0.520833333954215,
"step": 90
},
{
"completion_length": 3109.687530517578,
"epoch": 0.104,
"grad_norm": 0.15659227967262268,
"kl": 0.0008578300476074219,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0,
"reward": 0.3678978532552719,
"reward_std": 0.8310167863965034,
"rewards/cosine_scaled_reward": -0.024384415162785444,
"rewards/format_reward": 0.41666667722165585,
"step": 91
},
{
"completion_length": 2683.479202270508,
"epoch": 0.10514285714285715,
"grad_norm": 0.2791576683521271,
"kl": 0.0041623711585998535,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0002,
"reward": 0.17558906483463943,
"reward_std": 0.667485423386097,
"rewards/cosine_scaled_reward": -0.1622054846957326,
"rewards/format_reward": 0.5000000037252903,
"step": 92
},
{
"completion_length": 3554.875,
"epoch": 0.10628571428571429,
"grad_norm": 0.17273500561714172,
"kl": 0.001106351613998413,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0,
"reward": -0.46894520218484104,
"reward_std": 0.36610983312129974,
"rewards/cosine_scaled_reward": -0.2553059346973896,
"rewards/format_reward": 0.0416666679084301,
"step": 93
},
{
"completion_length": 2966.3958435058594,
"epoch": 0.10742857142857143,
"grad_norm": 0.19159966707229614,
"kl": 0.0015701055526733398,
"learning_rate": 9.78935800506826e-07,
"loss": 0.0001,
"reward": -0.010549061640631407,
"reward_std": 0.39640795812010765,
"rewards/cosine_scaled_reward": -0.16152450628578663,
"rewards/format_reward": 0.3125000037252903,
"step": 94
},
{
"completion_length": 3374.4375610351562,
"epoch": 0.10857142857142857,
"grad_norm": 0.15439394116401672,
"kl": 0.00039564818143844604,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0,
"reward": -0.28152387839509174,
"reward_std": 0.6863285079598427,
"rewards/cosine_scaled_reward": -0.23451193794608116,
"rewards/format_reward": 0.1875000074505806,
"step": 95
},
{
"completion_length": 2657.4791870117188,
"epoch": 0.10971428571428571,
"grad_norm": 0.17443199455738068,
"kl": 0.00177721306681633,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0001,
"reward": 0.09489981457591057,
"reward_std": 0.5061229234561324,
"rewards/cosine_scaled_reward": -0.14005008898675442,
"rewards/format_reward": 0.37500000558793545,
"step": 96
},
{
"completion_length": 3091.729248046875,
"epoch": 0.11085714285714286,
"grad_norm": 0.18888260424137115,
"kl": 0.0008979141712188721,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0,
"reward": 0.05433476809412241,
"reward_std": 0.6424587089568377,
"rewards/cosine_scaled_reward": -0.14991594851016998,
"rewards/format_reward": 0.35416667349636555,
"step": 97
},
{
"completion_length": 2351.4375381469727,
"epoch": 0.112,
"grad_norm": 0.2002606987953186,
"kl": 0.0006032586097717285,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0,
"reward": 0.4714247789233923,
"reward_std": 0.7434845846146345,
"rewards/cosine_scaled_reward": -0.04553762264549732,
"rewards/format_reward": 0.5625000149011612,
"step": 98
},
{
"completion_length": 2837.7291717529297,
"epoch": 0.11314285714285714,
"grad_norm": 0.21085548400878906,
"kl": 0.000676274299621582,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0,
"reward": 0.2813050076365471,
"reward_std": 0.6411089226603508,
"rewards/cosine_scaled_reward": -0.03643083991482854,
"rewards/format_reward": 0.3541666679084301,
"step": 99
},
{
"completion_length": 2754.875045776367,
"epoch": 0.11428571428571428,
"grad_norm": 0.17745624482631683,
"kl": 0.0011057853698730469,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0,
"reward": 0.6976646836847067,
"reward_std": 0.9069891199469566,
"rewards/cosine_scaled_reward": 0.11966563505120575,
"rewards/format_reward": 0.45833334513008595,
"step": 100
},
{
"completion_length": 2905.625045776367,
"epoch": 0.11542857142857142,
"grad_norm": 0.22573921084403992,
"kl": 0.0009785890579223633,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0,
"reward": 0.06032659858465195,
"reward_std": 0.503840334713459,
"rewards/cosine_scaled_reward": -0.13650337606668472,
"rewards/format_reward": 0.3333333395421505,
"step": 101
},
{
"completion_length": 2533.3959045410156,
"epoch": 0.11657142857142858,
"grad_norm": 0.21088238060474396,
"kl": 0.002618730068206787,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0001,
"reward": 0.5906962184235454,
"reward_std": 1.0304273441433907,
"rewards/cosine_scaled_reward": -0.017151910811662674,
"rewards/format_reward": 0.6250000223517418,
"step": 102
},
{
"completion_length": 2815.187530517578,
"epoch": 0.11771428571428572,
"grad_norm": 0.23541951179504395,
"kl": 0.0009691715240478516,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0,
"reward": 0.09393319487571716,
"reward_std": 0.8980946093797684,
"rewards/cosine_scaled_reward": -0.16136674140579998,
"rewards/format_reward": 0.41666666977107525,
"step": 103
},
{
"completion_length": 2650.6250228881836,
"epoch": 0.11885714285714286,
"grad_norm": 0.3019506335258484,
"kl": 0.007655918598175049,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0003,
"reward": 0.03867958951741457,
"reward_std": 0.49825899116694927,
"rewards/cosine_scaled_reward": -0.16816022247076035,
"rewards/format_reward": 0.37500000558793545,
"step": 104
},
{
"completion_length": 2774.0833892822266,
"epoch": 0.12,
"grad_norm": 0.18828085064888,
"kl": 0.0009101927280426025,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0,
"reward": 0.413003945723176,
"reward_std": 0.9300272315740585,
"rewards/cosine_scaled_reward": 0.008585309609770775,
"rewards/format_reward": 0.3958333358168602,
"step": 105
},
{
"completion_length": 2155.4584045410156,
"epoch": 0.12114285714285715,
"grad_norm": 0.1837824583053589,
"kl": 0.002248704433441162,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0001,
"reward": 0.945452444255352,
"reward_std": 0.6177145391702652,
"rewards/cosine_scaled_reward": 0.0977262444794178,
"rewards/format_reward": 0.7500000037252903,
"step": 106
},
{
"completion_length": 2785.7708740234375,
"epoch": 0.12228571428571429,
"grad_norm": 0.20877282321453094,
"kl": 0.001208662986755371,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0,
"reward": 0.41627691127359867,
"reward_std": 0.7068986408412457,
"rewards/cosine_scaled_reward": -0.05227821506559849,
"rewards/format_reward": 0.5208333507180214,
"step": 107
},
{
"completion_length": 2584.791702270508,
"epoch": 0.12342857142857143,
"grad_norm": 0.2157466560602188,
"kl": 0.0009077191352844238,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0,
"reward": 0.5254465593025088,
"reward_std": 0.6625101678073406,
"rewards/cosine_scaled_reward": 0.0023066122084856033,
"rewards/format_reward": 0.5208333395421505,
"step": 108
},
{
"completion_length": 3017.6041870117188,
"epoch": 0.12457142857142857,
"grad_norm": 0.16122227907180786,
"kl": 0.000370025634765625,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0,
"reward": 0.18975364477955736,
"reward_std": 0.49849717505276203,
"rewards/cosine_scaled_reward": -0.061373173259198666,
"rewards/format_reward": 0.31250000186264515,
"step": 109
},
{
"completion_length": 2612.8542404174805,
"epoch": 0.12571428571428572,
"grad_norm": 0.21259663999080658,
"kl": 0.000641578808426857,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0,
"reward": 0.32054631412029266,
"reward_std": 0.7353306971490383,
"rewards/cosine_scaled_reward": -0.10014353273436427,
"rewards/format_reward": 0.5208333432674408,
"step": 110
},
{
"completion_length": 3100.354217529297,
"epoch": 0.12685714285714286,
"grad_norm": 0.18133312463760376,
"kl": 0.0013672113418579102,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0001,
"reward": 0.07442984823137522,
"reward_std": 0.9228987656533718,
"rewards/cosine_scaled_reward": -0.10861842148005962,
"rewards/format_reward": 0.29166667349636555,
"step": 111
},
{
"completion_length": 3039.8959045410156,
"epoch": 0.128,
"grad_norm": 0.15633469820022583,
"kl": 0.0005895942449569702,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0,
"reward": 0.5644901357591152,
"reward_std": 0.8792787380516529,
"rewards/cosine_scaled_reward": 0.021828406490385532,
"rewards/format_reward": 0.5208333414047956,
"step": 112
},
{
"completion_length": 2473.229202270508,
"epoch": 0.12914285714285714,
"grad_norm": 0.2744481861591339,
"kl": 0.0017359256744384766,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0001,
"reward": 0.17924559116363525,
"reward_std": 0.607318002730608,
"rewards/cosine_scaled_reward": -0.1603772146627307,
"rewards/format_reward": 0.5000000037252903,
"step": 113
},
{
"completion_length": 2619.4167289733887,
"epoch": 0.13028571428571428,
"grad_norm": 0.2416525036096573,
"kl": 0.0021146535873413086,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0001,
"reward": 0.3448843713849783,
"reward_std": 0.5230822302401066,
"rewards/cosine_scaled_reward": -0.12964116781949997,
"rewards/format_reward": 0.6041666772216558,
"step": 114
},
{
"completion_length": 2841.2916870117188,
"epoch": 0.13142857142857142,
"grad_norm": 0.20283274352550507,
"kl": 0.0020351409912109375,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0001,
"reward": 0.21423707529902458,
"reward_std": 0.7165789231657982,
"rewards/cosine_scaled_reward": -0.08038146048784256,
"rewards/format_reward": 0.3750000037252903,
"step": 115
},
{
"completion_length": 3370.125030517578,
"epoch": 0.13257142857142856,
"grad_norm": 0.15726514160633087,
"kl": 0.001503288745880127,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0001,
"reward": 0.014336531981825829,
"reward_std": 0.5990661717951298,
"rewards/cosine_scaled_reward": -0.07616507587954402,
"rewards/format_reward": 0.1666666679084301,
"step": 116
},
{
"completion_length": 3045.937530517578,
"epoch": 0.1337142857142857,
"grad_norm": 0.1993873417377472,
"kl": 0.0015254020690917969,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0001,
"reward": -0.07220430299639702,
"reward_std": 0.667652253061533,
"rewards/cosine_scaled_reward": -0.20276882825419307,
"rewards/format_reward": 0.3333333469927311,
"step": 117
},
{
"completion_length": 2959.562530517578,
"epoch": 0.13485714285714287,
"grad_norm": 0.18038718402385712,
"kl": 0.0012336969375610352,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0,
"reward": 0.5795598048716784,
"reward_std": 1.1043038107454777,
"rewards/cosine_scaled_reward": 0.07102989172562957,
"rewards/format_reward": 0.4375000149011612,
"step": 118
},
{
"completion_length": 2178.750030517578,
"epoch": 0.136,
"grad_norm": 0.24167504906654358,
"kl": 0.002427428960800171,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0001,
"reward": 0.812275217846036,
"reward_std": 0.6674736551940441,
"rewards/cosine_scaled_reward": 0.08322094567120075,
"rewards/format_reward": 0.6458333432674408,
"step": 119
},
{
"completion_length": 2230.562568664551,
"epoch": 0.13714285714285715,
"grad_norm": 0.26925644278526306,
"kl": 0.0018974542617797852,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0001,
"reward": 0.43469466688111424,
"reward_std": 0.5474803410470486,
"rewards/cosine_scaled_reward": -0.0951526677235961,
"rewards/format_reward": 0.6250000037252903,
"step": 120
},
{
"completion_length": 1692.5000534057617,
"epoch": 0.1382857142857143,
"grad_norm": 0.28395897150039673,
"kl": 0.005714893341064453,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0002,
"reward": 0.7201991314068437,
"reward_std": 0.6400695294141769,
"rewards/cosine_scaled_reward": -0.02531713293865323,
"rewards/format_reward": 0.7708333507180214,
"step": 121
},
{
"completion_length": 2879.75004196167,
"epoch": 0.13942857142857143,
"grad_norm": 0.18640156090259552,
"kl": 0.0010684728622436523,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0,
"reward": 0.4576802644878626,
"reward_std": 0.934948768466711,
"rewards/cosine_scaled_reward": 0.020506808534264565,
"rewards/format_reward": 0.4166666716337204,
"step": 122
},
{
"completion_length": 2707.9375610351562,
"epoch": 0.14057142857142857,
"grad_norm": 0.17253972589969635,
"kl": 0.0014038681983947754,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0001,
"reward": 0.35453951358795166,
"reward_std": 0.7643875367939472,
"rewards/cosine_scaled_reward": -0.09356357716023922,
"rewards/format_reward": 0.5416666734963655,
"step": 123
},
{
"completion_length": 2144.5416946411133,
"epoch": 0.1417142857142857,
"grad_norm": 0.2417304664850235,
"kl": 0.0027747154235839844,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0001,
"reward": 0.32306639989838004,
"reward_std": 0.8119018785655499,
"rewards/cosine_scaled_reward": -0.11971682743751444,
"rewards/format_reward": 0.5625000111758709,
"step": 124
},
{
"completion_length": 2816.479202270508,
"epoch": 0.14285714285714285,
"grad_norm": 0.20347152650356293,
"kl": 0.0017933845520019531,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0001,
"reward": 0.15680650994181633,
"reward_std": 0.4326724670827389,
"rewards/cosine_scaled_reward": -0.06743010319769382,
"rewards/format_reward": 0.2916666679084301,
"step": 125
},
{
"completion_length": 2907.7500534057617,
"epoch": 0.144,
"grad_norm": 0.16710162162780762,
"kl": 0.0011298656463623047,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0,
"reward": 0.24374442547559738,
"reward_std": 0.5884060095995665,
"rewards/cosine_scaled_reward": -0.07604445889592171,
"rewards/format_reward": 0.3958333395421505,
"step": 126
},
{
"completion_length": 2909.5625,
"epoch": 0.14514285714285713,
"grad_norm": 0.1726786196231842,
"kl": 0.0017578601837158203,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0001,
"reward": -0.006642797961831093,
"reward_std": 0.5724836103618145,
"rewards/cosine_scaled_reward": -0.1908214169088751,
"rewards/format_reward": 0.37500000558793545,
"step": 127
},
{
"completion_length": 2919.9166946411133,
"epoch": 0.1462857142857143,
"grad_norm": 0.18813887238502502,
"kl": 0.0023109018802642822,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0001,
"reward": 0.5732522960752249,
"reward_std": 0.8493749275803566,
"rewards/cosine_scaled_reward": 0.08870946802198887,
"rewards/format_reward": 0.3958333358168602,
"step": 128
},
{
"completion_length": 3364.8334045410156,
"epoch": 0.14742857142857144,
"grad_norm": 0.17843031883239746,
"kl": 0.0023398399353027344,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0001,
"reward": 0.12374773435294628,
"reward_std": 0.6275974959135056,
"rewards/cosine_scaled_reward": -0.07354282308369875,
"rewards/format_reward": 0.27083334140479565,
"step": 129
},
{
"completion_length": 2752.5416946411133,
"epoch": 0.14857142857142858,
"grad_norm": 0.1666869968175888,
"kl": 0.0013623237609863281,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0001,
"reward": 0.07455501146614552,
"reward_std": 0.8575821630656719,
"rewards/cosine_scaled_reward": -0.15022250125184655,
"rewards/format_reward": 0.37500000931322575,
"step": 130
},
{
"completion_length": 2932.8958740234375,
"epoch": 0.14971428571428572,
"grad_norm": 0.2144927680492401,
"kl": 0.0027085542678833008,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0001,
"reward": 0.5810204441659153,
"reward_std": 0.7529679946601391,
"rewards/cosine_scaled_reward": 0.09259352087974548,
"rewards/format_reward": 0.39583334140479565,
"step": 131
},
{
"completion_length": 2482.7916870117188,
"epoch": 0.15085714285714286,
"grad_norm": 0.2159230262041092,
"kl": 0.0015616416931152344,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0001,
"reward": 0.3923495952039957,
"reward_std": 0.6323644928634167,
"rewards/cosine_scaled_reward": -0.02257518842816353,
"rewards/format_reward": 0.4375,
"step": 132
},
{
"completion_length": 3183.0416870117188,
"epoch": 0.152,
"grad_norm": 0.21430212259292603,
"kl": 0.00218963623046875,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0001,
"reward": -0.07401247788220644,
"reward_std": 0.4380334075540304,
"rewards/cosine_scaled_reward": -0.17242290638387203,
"rewards/format_reward": 0.27083333767950535,
"step": 133
},
{
"completion_length": 2434.312515258789,
"epoch": 0.15314285714285714,
"grad_norm": 0.21282276511192322,
"kl": 0.0028487443923950195,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0001,
"reward": 0.6068310905247927,
"reward_std": 0.8159074038267136,
"rewards/cosine_scaled_reward": 0.01174885593354702,
"rewards/format_reward": 0.583333333954215,
"step": 134
},
{
"completion_length": 2001.7917137145996,
"epoch": 0.15428571428571428,
"grad_norm": 0.3072402775287628,
"kl": 0.0029096603393554688,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0001,
"reward": 0.9695501551032066,
"reward_std": 0.6822597435675561,
"rewards/cosine_scaled_reward": 0.17227506916970015,
"rewards/format_reward": 0.6250000111758709,
"step": 135
},
{
"completion_length": 2730.0625610351562,
"epoch": 0.15542857142857142,
"grad_norm": 0.2851649224758148,
"kl": 0.002216339111328125,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0001,
"reward": 0.4177038297057152,
"reward_std": 1.0492460913956165,
"rewards/cosine_scaled_reward": -0.020314730005338788,
"rewards/format_reward": 0.45833334140479565,
"step": 136
},
{
"completion_length": 2912.0000381469727,
"epoch": 0.15657142857142858,
"grad_norm": 0.2026607245206833,
"kl": 0.0021758079528808594,
"learning_rate": 9.195171441101668e-07,
"loss": 0.0001,
"reward": -0.09136633202433586,
"reward_std": 0.5394793637096882,
"rewards/cosine_scaled_reward": -0.19151651859283447,
"rewards/format_reward": 0.29166667349636555,
"step": 137
},
{
"completion_length": 2472.25004196167,
"epoch": 0.15771428571428572,
"grad_norm": 0.2178824096918106,
"kl": 0.0017549991607666016,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0001,
"reward": 0.3587841596454382,
"reward_std": 0.9164101183414459,
"rewards/cosine_scaled_reward": -0.10185793554410338,
"rewards/format_reward": 0.5625000111758709,
"step": 138
},
{
"completion_length": 3295.791717529297,
"epoch": 0.15885714285714286,
"grad_norm": 0.17741361260414124,
"kl": 0.00331878662109375,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0001,
"reward": 0.006225615739822388,
"reward_std": 0.7490637376904488,
"rewards/cosine_scaled_reward": -0.15313720237463713,
"rewards/format_reward": 0.3125000111758709,
"step": 139
},
{
"completion_length": 2845.375030517578,
"epoch": 0.16,
"grad_norm": 0.27644607424736023,
"kl": 0.005204200744628906,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0002,
"reward": 0.2643515709787607,
"reward_std": 0.7205736935138702,
"rewards/cosine_scaled_reward": -0.04490754520520568,
"rewards/format_reward": 0.3541666753590107,
"step": 140
},
{
"completion_length": 2522.1458740234375,
"epoch": 0.16114285714285714,
"grad_norm": 0.21788166463375092,
"kl": 0.003261566162109375,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0001,
"reward": 0.3097852375358343,
"reward_std": 0.7691731601953506,
"rewards/cosine_scaled_reward": -0.14719070680439472,
"rewards/format_reward": 0.604166679084301,
"step": 141
},
{
"completion_length": 2841.3958740234375,
"epoch": 0.16228571428571428,
"grad_norm": 0.18084634840488434,
"kl": 0.0034284591674804688,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0001,
"reward": 0.39247775822877884,
"reward_std": 0.9289789237082005,
"rewards/cosine_scaled_reward": -0.0745944594964385,
"rewards/format_reward": 0.5416666828095913,
"step": 142
},
{
"completion_length": 2528.7708740234375,
"epoch": 0.16342857142857142,
"grad_norm": 0.31566375494003296,
"kl": 0.004278659820556641,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0002,
"reward": 0.16241500061005354,
"reward_std": 0.6862597297877073,
"rewards/cosine_scaled_reward": -0.15837583474058192,
"rewards/format_reward": 0.4791666753590107,
"step": 143
},
{
"completion_length": 2870.937545776367,
"epoch": 0.16457142857142856,
"grad_norm": 0.20246466994285583,
"kl": 0.002933502197265625,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0001,
"reward": 0.4816696117632091,
"reward_std": 0.7971302233636379,
"rewards/cosine_scaled_reward": 0.05333479621913284,
"rewards/format_reward": 0.3750000111758709,
"step": 144
},
{
"completion_length": 2188.9166946411133,
"epoch": 0.1657142857142857,
"grad_norm": 0.21244628727436066,
"kl": 0.003267526626586914,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0001,
"reward": 0.6690180394798517,
"reward_std": 0.6978322137147188,
"rewards/cosine_scaled_reward": 0.042842356488108635,
"rewards/format_reward": 0.5833333414047956,
"step": 145
},
{
"completion_length": 2133.7917404174805,
"epoch": 0.16685714285714287,
"grad_norm": 0.22922900319099426,
"kl": 0.0025424957275390625,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0001,
"reward": 0.276001513004303,
"reward_std": 0.6235547289252281,
"rewards/cosine_scaled_reward": -0.1849159342236817,
"rewards/format_reward": 0.645833345130086,
"step": 146
},
{
"completion_length": 2469.604202270508,
"epoch": 0.168,
"grad_norm": 0.25116774439811707,
"kl": 0.004103660583496094,
"learning_rate": 9.007020842191634e-07,
"loss": 0.0002,
"reward": 0.588484600186348,
"reward_std": 0.85157061368227,
"rewards/cosine_scaled_reward": 0.03382563544437289,
"rewards/format_reward": 0.5208333414047956,
"step": 147
},
{
"completion_length": 2214.6042251586914,
"epoch": 0.16914285714285715,
"grad_norm": 0.18739475309848785,
"kl": 0.0034880638122558594,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0001,
"reward": 0.5079192472621799,
"reward_std": 0.7314141802489758,
"rewards/cosine_scaled_reward": -0.06895705359056592,
"rewards/format_reward": 0.6458333376795053,
"step": 148
},
{
"completion_length": 2570.416763305664,
"epoch": 0.1702857142857143,
"grad_norm": 0.22657646238803864,
"kl": 0.004002571105957031,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0002,
"reward": 0.33003329299390316,
"reward_std": 1.0237452127039433,
"rewards/cosine_scaled_reward": -0.0849833432585001,
"rewards/format_reward": 0.5000000167638063,
"step": 149
},
{
"completion_length": 2632.166732788086,
"epoch": 0.17142857142857143,
"grad_norm": 0.24574832618236542,
"kl": 0.0053920745849609375,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0002,
"reward": 0.27089521041489206,
"reward_std": 0.8753956761211157,
"rewards/cosine_scaled_reward": -0.08330239914357662,
"rewards/format_reward": 0.43750001303851604,
"step": 150
},
{
"completion_length": 2311.416702270508,
"epoch": 0.17257142857142857,
"grad_norm": 0.21789635717868805,
"kl": 0.0057392120361328125,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0002,
"reward": 0.6955740721896291,
"reward_std": 0.7741780783981085,
"rewards/cosine_scaled_reward": 0.024870369350537658,
"rewards/format_reward": 0.6458333358168602,
"step": 151
},
{
"completion_length": 2757.125030517578,
"epoch": 0.1737142857142857,
"grad_norm": 0.2523602545261383,
"kl": 0.003414630889892578,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0001,
"reward": 0.1108561996370554,
"reward_std": 0.6132981330156326,
"rewards/cosine_scaled_reward": -0.13207191228866577,
"rewards/format_reward": 0.37500000558793545,
"step": 152
},
{
"completion_length": 2993.416702270508,
"epoch": 0.17485714285714285,
"grad_norm": 0.20782147347927094,
"kl": 0.006793975830078125,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0003,
"reward": -0.006021241657435894,
"reward_std": 0.6891133151948452,
"rewards/cosine_scaled_reward": -0.19051063433289528,
"rewards/format_reward": 0.37500000931322575,
"step": 153
},
{
"completion_length": 3374.416717529297,
"epoch": 0.176,
"grad_norm": 0.1451537162065506,
"kl": 0.0027971267700195312,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0001,
"reward": 0.28345590829849243,
"reward_std": 0.9266153089702129,
"rewards/cosine_scaled_reward": -0.014522044686600566,
"rewards/format_reward": 0.31250000558793545,
"step": 154
},
{
"completion_length": 2701.187545776367,
"epoch": 0.17714285714285713,
"grad_norm": 0.21468013525009155,
"kl": 0.003292083740234375,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0001,
"reward": 0.32251251488924026,
"reward_std": 0.7347648032009602,
"rewards/cosine_scaled_reward": -0.04707707092165947,
"rewards/format_reward": 0.4166666716337204,
"step": 155
},
{
"completion_length": 2688.3333892822266,
"epoch": 0.1782857142857143,
"grad_norm": 0.19141636788845062,
"kl": 0.003067493438720703,
"learning_rate": 8.823049032816478e-07,
"loss": 0.0001,
"reward": 0.23405547067523003,
"reward_std": 0.8397131189703941,
"rewards/cosine_scaled_reward": -0.08088893629610538,
"rewards/format_reward": 0.3958333358168602,
"step": 156
},
{
"completion_length": 2820.937515258789,
"epoch": 0.17942857142857144,
"grad_norm": 0.21651938557624817,
"kl": 0.004750490188598633,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0002,
"reward": 0.18686847016215324,
"reward_std": 0.7123733684420586,
"rewards/cosine_scaled_reward": -0.11489910446107388,
"rewards/format_reward": 0.41666668467223644,
"step": 157
},
{
"completion_length": 2965.2709045410156,
"epoch": 0.18057142857142858,
"grad_norm": 0.1956688016653061,
"kl": 0.0041599273681640625,
"learning_rate": 8.780358823396352e-07,
"loss": 0.0002,
"reward": 0.6479230020195246,
"reward_std": 0.7748183347284794,
"rewards/cosine_scaled_reward": 0.11562815494835377,
"rewards/format_reward": 0.416666679084301,
"step": 158
},
{
"completion_length": 2539.500015258789,
"epoch": 0.18171428571428572,
"grad_norm": 0.21064119040966034,
"kl": 0.003520965576171875,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0001,
"reward": 0.017007697373628616,
"reward_std": 0.5202154843136668,
"rewards/cosine_scaled_reward": -0.23107950016856194,
"rewards/format_reward": 0.4791666679084301,
"step": 159
},
{
"completion_length": 2526.562545776367,
"epoch": 0.18285714285714286,
"grad_norm": 0.2099539339542389,
"kl": 0.007557868957519531,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0003,
"reward": 0.5310798175632954,
"reward_std": 0.8000563234090805,
"rewards/cosine_scaled_reward": 0.015539903659373522,
"rewards/format_reward": 0.5000000093132257,
"step": 160
},
{
"completion_length": 2373.6458740234375,
"epoch": 0.184,
"grad_norm": 0.23696200549602509,
"kl": 0.0062770843505859375,
"learning_rate": 8.715127058347614e-07,
"loss": 0.0003,
"reward": 0.3601957531645894,
"reward_std": 0.735405445098877,
"rewards/cosine_scaled_reward": -0.08031879365444183,
"rewards/format_reward": 0.5208333376795053,
"step": 161
},
{
"completion_length": 2870.604217529297,
"epoch": 0.18514285714285714,
"grad_norm": 0.20365777611732483,
"kl": 0.00856781005859375,
"learning_rate": 8.693068314414344e-07,
"loss": 0.0003,
"reward": 0.2685950770974159,
"reward_std": 0.7880549058318138,
"rewards/cosine_scaled_reward": -0.06361913960427046,
"rewards/format_reward": 0.39583334140479565,
"step": 162
},
{
"completion_length": 2309.8958740234375,
"epoch": 0.18628571428571428,
"grad_norm": 0.2089148312807083,
"kl": 0.004642963409423828,
"learning_rate": 8.670853944836176e-07,
"loss": 0.0002,
"reward": 0.6906282119452953,
"reward_std": 0.49390939995646477,
"rewards/cosine_scaled_reward": 0.04323074035346508,
"rewards/format_reward": 0.6041666716337204,
"step": 163
},
{
"completion_length": 2178.3750381469727,
"epoch": 0.18742857142857142,
"grad_norm": 0.26612454652786255,
"kl": 0.007852554321289062,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0003,
"reward": 0.5011991895735264,
"reward_std": 0.5780392102897167,
"rewards/cosine_scaled_reward": -0.020233748480677605,
"rewards/format_reward": 0.5416666734963655,
"step": 164
},
{
"completion_length": 2490.666702270508,
"epoch": 0.18857142857142858,
"grad_norm": 0.25059667229652405,
"kl": 0.005207061767578125,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0002,
"reward": 0.056412231642752886,
"reward_std": 0.6775014959275723,
"rewards/cosine_scaled_reward": -0.2009605555795133,
"rewards/format_reward": 0.45833334513008595,
"step": 165
},
{
"completion_length": 2813.187515258789,
"epoch": 0.18971428571428572,
"grad_norm": 0.20505455136299133,
"kl": 0.004026889801025391,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0002,
"reward": -0.00409979373216629,
"reward_std": 0.6606614142656326,
"rewards/cosine_scaled_reward": -0.18954989779740572,
"rewards/format_reward": 0.3750000111758709,
"step": 166
},
{
"completion_length": 2199.562568664551,
"epoch": 0.19085714285714286,
"grad_norm": 0.2305985391139984,
"kl": 0.003949165344238281,
"learning_rate": 8.580461976679099e-07,
"loss": 0.0002,
"reward": 0.34335924312472343,
"reward_std": 0.8879089429974556,
"rewards/cosine_scaled_reward": -0.151237060315907,
"rewards/format_reward": 0.6458333414047956,
"step": 167
},
{
"completion_length": 2649.583396911621,
"epoch": 0.192,
"grad_norm": 0.2810652256011963,
"kl": 0.00469207763671875,
"learning_rate": 8.557485869176825e-07,
"loss": 0.0002,
"reward": 0.3732623625546694,
"reward_std": 0.9529096595942974,
"rewards/cosine_scaled_reward": -0.05295216618105769,
"rewards/format_reward": 0.4791666716337204,
"step": 168
},
{
"completion_length": 1590.3542022705078,
"epoch": 0.19314285714285714,
"grad_norm": 0.19241072237491608,
"kl": 0.004169940948486328,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0002,
"reward": 1.4370996933430433,
"reward_std": 0.7862771563231945,
"rewards/cosine_scaled_reward": 0.260216549038887,
"rewards/format_reward": 0.9166666716337204,
"step": 169
},
{
"completion_length": 2502.375045776367,
"epoch": 0.19428571428571428,
"grad_norm": 0.1910967230796814,
"kl": 0.004596710205078125,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0002,
"reward": 0.3506653420627117,
"reward_std": 0.43432530108839273,
"rewards/cosine_scaled_reward": -0.04341734014451504,
"rewards/format_reward": 0.43750000186264515,
"step": 170
},
{
"completion_length": 2321.041702270508,
"epoch": 0.19542857142857142,
"grad_norm": 0.18431736528873444,
"kl": 0.0033516883850097656,
"learning_rate": 8.487667956935087e-07,
"loss": 0.0001,
"reward": 0.40528116561472416,
"reward_std": 0.5608017090708017,
"rewards/cosine_scaled_reward": -0.057776106521487236,
"rewards/format_reward": 0.5208333395421505,
"step": 171
},
{
"completion_length": 2714.541732788086,
"epoch": 0.19657142857142856,
"grad_norm": 0.22830112278461456,
"kl": 0.007214546203613281,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0003,
"reward": 0.7660160614177585,
"reward_std": 1.0001494381576777,
"rewards/cosine_scaled_reward": 0.14342465763911605,
"rewards/format_reward": 0.47916667722165585,
"step": 172
},
{
"completion_length": 1561.3333587646484,
"epoch": 0.1977142857142857,
"grad_norm": 0.2993278205394745,
"kl": 0.0068264007568359375,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0003,
"reward": 0.46238506003282964,
"reward_std": 0.7197185447439551,
"rewards/cosine_scaled_reward": -0.1333908117376268,
"rewards/format_reward": 0.7291666679084301,
"step": 173
},
{
"completion_length": 1839.541732788086,
"epoch": 0.19885714285714284,
"grad_norm": 0.274972140789032,
"kl": 0.008336067199707031,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0003,
"reward": 0.8871902981773019,
"reward_std": 0.7998310215771198,
"rewards/cosine_scaled_reward": 0.04776177019812167,
"rewards/format_reward": 0.7916666772216558,
"step": 174
},
{
"completion_length": 2464.145866394043,
"epoch": 0.2,
"grad_norm": 0.20217549800872803,
"kl": 0.004665374755859375,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0002,
"reward": 0.4925261875614524,
"reward_std": 0.5973605997860432,
"rewards/cosine_scaled_reward": 0.006679709069430828,
"rewards/format_reward": 0.4791666753590107,
"step": 175
},
{
"completion_length": 1838.6667175292969,
"epoch": 0.20114285714285715,
"grad_norm": 0.24335332214832306,
"kl": 0.0049304962158203125,
"learning_rate": 8.368407953869103e-07,
"loss": 0.0002,
"reward": 0.9096714407205582,
"reward_std": 1.0707368738949299,
"rewards/cosine_scaled_reward": 0.059002356603741646,
"rewards/format_reward": 0.7916666716337204,
"step": 176
},
{
"completion_length": 2511.875030517578,
"epoch": 0.2022857142857143,
"grad_norm": 0.23394758999347687,
"kl": 0.0048389434814453125,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0002,
"reward": 0.6948387259617448,
"reward_std": 0.757513590157032,
"rewards/cosine_scaled_reward": 0.05575268715620041,
"rewards/format_reward": 0.583333345130086,
"step": 177
},
{
"completion_length": 2130.1250534057617,
"epoch": 0.20342857142857143,
"grad_norm": 0.28063294291496277,
"kl": 0.00698089599609375,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0003,
"reward": 0.5327047700993717,
"reward_std": 0.8441273644566536,
"rewards/cosine_scaled_reward": -0.035730951465666294,
"rewards/format_reward": 0.6041666697710752,
"step": 178
},
{
"completion_length": 2611.0416717529297,
"epoch": 0.20457142857142857,
"grad_norm": 0.25523021817207336,
"kl": 0.004611968994140625,
"learning_rate": 8.295165011252396e-07,
"loss": 0.0002,
"reward": -0.09883294254541397,
"reward_std": 0.4169027265161276,
"rewards/cosine_scaled_reward": -0.25774980895221233,
"rewards/format_reward": 0.41666667349636555,
"step": 179
},
{
"completion_length": 1505.0833892822266,
"epoch": 0.2057142857142857,
"grad_norm": 0.2882588505744934,
"kl": 0.0065937042236328125,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0003,
"reward": 1.1103055961430073,
"reward_std": 0.8654880682006478,
"rewards/cosine_scaled_reward": 0.15931943291798234,
"rewards/format_reward": 0.7916666697710752,
"step": 180
},
{
"completion_length": 2787.2708892822266,
"epoch": 0.20685714285714285,
"grad_norm": 0.24275778234004974,
"kl": 0.00591278076171875,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0002,
"reward": 0.1754364101216197,
"reward_std": 0.575681222602725,
"rewards/cosine_scaled_reward": -0.09978180378675461,
"rewards/format_reward": 0.37500000931322575,
"step": 181
},
{
"completion_length": 2012.4375076293945,
"epoch": 0.208,
"grad_norm": 0.17373254895210266,
"kl": 0.00243377685546875,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0001,
"reward": 0.48792053386569023,
"reward_std": 0.6339404806494713,
"rewards/cosine_scaled_reward": -0.07895641587674618,
"rewards/format_reward": 0.6458333358168602,
"step": 182
},
{
"completion_length": 1676.2916946411133,
"epoch": 0.20914285714285713,
"grad_norm": 0.26158607006073,
"kl": 0.007886886596679688,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0003,
"reward": 0.9213535767048597,
"reward_std": 0.7604264169931412,
"rewards/cosine_scaled_reward": 0.054426767863333225,
"rewards/format_reward": 0.812500013038516,
"step": 183
},
{
"completion_length": 2223.9167137145996,
"epoch": 0.2102857142857143,
"grad_norm": 0.25392112135887146,
"kl": 0.00632476806640625,
"learning_rate": 8.170384989716657e-07,
"loss": 0.0003,
"reward": 0.5227387230843306,
"reward_std": 0.5629259529523551,
"rewards/cosine_scaled_reward": -0.07196396728977561,
"rewards/format_reward": 0.666666679084301,
"step": 184
},
{
"completion_length": 1681.7500305175781,
"epoch": 0.21142857142857144,
"grad_norm": 0.2521141469478607,
"kl": 0.005031585693359375,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0002,
"reward": 0.2217017700895667,
"reward_std": 0.572849384509027,
"rewards/cosine_scaled_reward": -0.24331580009311438,
"rewards/format_reward": 0.7083333395421505,
"step": 185
},
{
"completion_length": 2321.8541870117188,
"epoch": 0.21257142857142858,
"grad_norm": 0.17236538231372833,
"kl": 0.006072998046875,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0002,
"reward": 0.5415416369214654,
"reward_std": 0.6540890671312809,
"rewards/cosine_scaled_reward": -6.251875311136246e-05,
"rewards/format_reward": 0.5416666697710752,
"step": 186
},
{
"completion_length": 1878.7083587646484,
"epoch": 0.21371428571428572,
"grad_norm": 0.24200907349586487,
"kl": 0.0061244964599609375,
"learning_rate": 8.093945422764069e-07,
"loss": 0.0002,
"reward": 0.5931817078962922,
"reward_std": 0.44314784556627274,
"rewards/cosine_scaled_reward": -0.08882584050297737,
"rewards/format_reward": 0.7708333395421505,
"step": 187
},
{
"completion_length": 2685.875015258789,
"epoch": 0.21485714285714286,
"grad_norm": 0.2027624398469925,
"kl": 0.0059528350830078125,
"learning_rate": 8.068211054579943e-07,
"loss": 0.0002,
"reward": 0.0031664990819990635,
"reward_std": 0.44171422626823187,
"rewards/cosine_scaled_reward": -0.19633342884480953,
"rewards/format_reward": 0.39583333395421505,
"step": 188
},
{
"completion_length": 2072.4166870117188,
"epoch": 0.216,
"grad_norm": 0.27338477969169617,
"kl": 0.0058345794677734375,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0002,
"reward": 0.4580115145072341,
"reward_std": 0.6696468777954578,
"rewards/cosine_scaled_reward": -0.11474426090717316,
"rewards/format_reward": 0.6875000111758709,
"step": 189
},
{
"completion_length": 1406.2708740234375,
"epoch": 0.21714285714285714,
"grad_norm": 0.2341710776090622,
"kl": 0.00482940673828125,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0002,
"reward": 0.892716939561069,
"reward_std": 0.6160999666899443,
"rewards/cosine_scaled_reward": 0.019275141414254904,
"rewards/format_reward": 0.8541666753590107,
"step": 190
},
{
"completion_length": 1371.333381652832,
"epoch": 0.21828571428571428,
"grad_norm": 0.2618865966796875,
"kl": 0.0059223175048828125,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0002,
"reward": 1.064841603860259,
"reward_std": 0.8721816278994083,
"rewards/cosine_scaled_reward": 0.12617080115524004,
"rewards/format_reward": 0.8125000018626451,
"step": 191
},
{
"completion_length": 2194.729217529297,
"epoch": 0.21942857142857142,
"grad_norm": 0.22514396905899048,
"kl": 0.00543975830078125,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0002,
"reward": 0.42392623238265514,
"reward_std": 0.6634904891252518,
"rewards/cosine_scaled_reward": -0.11095356999430805,
"rewards/format_reward": 0.6458333432674408,
"step": 192
},
{
"completion_length": 2677.687545776367,
"epoch": 0.22057142857142858,
"grad_norm": 0.24128860235214233,
"kl": 0.0064220428466796875,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0003,
"reward": 0.0647691236808896,
"reward_std": 0.7267616987228394,
"rewards/cosine_scaled_reward": -0.1655321167781949,
"rewards/format_reward": 0.39583334513008595,
"step": 193
},
{
"completion_length": 2442.937545776367,
"epoch": 0.22171428571428572,
"grad_norm": 0.19651657342910767,
"kl": 0.005771636962890625,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0002,
"reward": 0.9742466537281871,
"reward_std": 1.0655523668974638,
"rewards/cosine_scaled_reward": 0.14337329752743244,
"rewards/format_reward": 0.6875000093132257,
"step": 194
},
{
"completion_length": 1858.8542175292969,
"epoch": 0.22285714285714286,
"grad_norm": 0.25950801372528076,
"kl": 0.005645751953125,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0002,
"reward": 0.6131953801959753,
"reward_std": 0.8148518763482571,
"rewards/cosine_scaled_reward": -0.08923565968871117,
"rewards/format_reward": 0.791666679084301,
"step": 195
},
{
"completion_length": 2861.3958587646484,
"epoch": 0.224,
"grad_norm": 0.26293885707855225,
"kl": 0.006893157958984375,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0003,
"reward": 0.2952362271025777,
"reward_std": 0.7602911423891783,
"rewards/cosine_scaled_reward": -0.09196521900594234,
"rewards/format_reward": 0.47916667349636555,
"step": 196
},
{
"completion_length": 1365.2500381469727,
"epoch": 0.22514285714285714,
"grad_norm": 0.2826971709728241,
"kl": 0.006282806396484375,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0003,
"reward": 0.9446334131062031,
"reward_std": 0.7611507624387741,
"rewards/cosine_scaled_reward": 0.04523337911814451,
"rewards/format_reward": 0.8541666753590107,
"step": 197
},
{
"completion_length": 1636.1667098999023,
"epoch": 0.22628571428571428,
"grad_norm": 0.25861766934394836,
"kl": 0.007457733154296875,
"learning_rate": 7.804192891917571e-07,
"loss": 0.0003,
"reward": 0.9376457966864109,
"reward_std": 0.8799612354487181,
"rewards/cosine_scaled_reward": 0.052156222984194756,
"rewards/format_reward": 0.8333333395421505,
"step": 198
},
{
"completion_length": 1516.4583892822266,
"epoch": 0.22742857142857142,
"grad_norm": 0.21497201919555664,
"kl": 0.006725311279296875,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0003,
"reward": 0.6696783006191254,
"reward_std": 0.5459328815340996,
"rewards/cosine_scaled_reward": -0.14432752039283514,
"rewards/format_reward": 0.9583333432674408,
"step": 199
},
{
"completion_length": 1373.7291946411133,
"epoch": 0.22857142857142856,
"grad_norm": 0.22486717998981476,
"kl": 0.006084442138671875,
"learning_rate": 7.75e-07,
"loss": 0.0002,
"reward": 1.1582428617402911,
"reward_std": 0.6559928604401648,
"rewards/cosine_scaled_reward": 0.11037139501422644,
"rewards/format_reward": 0.9375000074505806,
"step": 200
},
{
"completion_length": 1785.0833587646484,
"epoch": 0.2297142857142857,
"grad_norm": 0.2287452071905136,
"kl": 0.0050449371337890625,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0002,
"reward": 1.4045181944966316,
"reward_std": 0.7379098236560822,
"rewards/cosine_scaled_reward": 0.2855923995375633,
"rewards/format_reward": 0.8333333358168602,
"step": 201
},
{
"completion_length": 1509.4167098999023,
"epoch": 0.23085714285714284,
"grad_norm": 0.21152691543102264,
"kl": 0.0052967071533203125,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0002,
"reward": 1.1190969496965408,
"reward_std": 0.41684375517070293,
"rewards/cosine_scaled_reward": 0.1637151322211139,
"rewards/format_reward": 0.7916666679084301,
"step": 202
},
{
"completion_length": 1581.0208892822266,
"epoch": 0.232,
"grad_norm": 0.25626081228256226,
"kl": 0.0076751708984375,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0003,
"reward": 0.6776624768972397,
"reward_std": 0.659516304731369,
"rewards/cosine_scaled_reward": -0.07783541223034263,
"rewards/format_reward": 0.8333333432674408,
"step": 203
},
{
"completion_length": 1655.520881652832,
"epoch": 0.23314285714285715,
"grad_norm": 0.36918991804122925,
"kl": 0.012666702270507812,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0005,
"reward": 0.8364788740873337,
"reward_std": 0.8245303072035313,
"rewards/cosine_scaled_reward": -0.008843917399644852,
"rewards/format_reward": 0.8541666865348816,
"step": 204
},
{
"completion_length": 1911.3542098999023,
"epoch": 0.2342857142857143,
"grad_norm": 0.31795355677604675,
"kl": 0.0064849853515625,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0003,
"reward": 1.0839662365615368,
"reward_std": 0.9466215074062347,
"rewards/cosine_scaled_reward": 0.15656641125679016,
"rewards/format_reward": 0.7708333488553762,
"step": 205
},
{
"completion_length": 2088.812526702881,
"epoch": 0.23542857142857143,
"grad_norm": 0.21683187782764435,
"kl": 0.0050067901611328125,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0002,
"reward": 0.2100734617561102,
"reward_std": 0.4785574749112129,
"rewards/cosine_scaled_reward": -0.24912995658814907,
"rewards/format_reward": 0.7083333358168602,
"step": 206
},
{
"completion_length": 1592.0208587646484,
"epoch": 0.23657142857142857,
"grad_norm": 0.3095583915710449,
"kl": 0.0069713592529296875,
"learning_rate": 7.556940671764124e-07,
"loss": 0.0003,
"reward": 0.7113522440195084,
"reward_std": 0.7786879017949104,
"rewards/cosine_scaled_reward": -0.08182388916611671,
"rewards/format_reward": 0.8750000074505806,
"step": 207
},
{
"completion_length": 1388.1250305175781,
"epoch": 0.2377142857142857,
"grad_norm": 0.22331999242305756,
"kl": 0.006237030029296875,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0002,
"reward": 0.8039972663391382,
"reward_std": 0.817696575075388,
"rewards/cosine_scaled_reward": -0.035501367412507534,
"rewards/format_reward": 0.8750000149011612,
"step": 208
},
{
"completion_length": 1191.458366394043,
"epoch": 0.23885714285714285,
"grad_norm": 0.30164894461631775,
"kl": 0.01032257080078125,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0004,
"reward": 1.0684526707045734,
"reward_std": 0.7270883917808533,
"rewards/cosine_scaled_reward": 0.0758929792791605,
"rewards/format_reward": 0.916666679084301,
"step": 209
},
{
"completion_length": 1736.8750228881836,
"epoch": 0.24,
"grad_norm": 0.2290075421333313,
"kl": 0.005527496337890625,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0002,
"reward": 0.7679805280640721,
"reward_std": 0.45000065956264734,
"rewards/cosine_scaled_reward": -0.011843102052807808,
"rewards/format_reward": 0.7916666716337204,
"step": 210
},
{
"completion_length": 1724.3125534057617,
"epoch": 0.24114285714285713,
"grad_norm": 0.2615811824798584,
"kl": 0.00815582275390625,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0003,
"reward": 0.8917091085459106,
"reward_std": 0.6099780108779669,
"rewards/cosine_scaled_reward": 0.07085455022752285,
"rewards/format_reward": 0.7500000037252903,
"step": 211
},
{
"completion_length": 1340.4583892822266,
"epoch": 0.2422857142857143,
"grad_norm": 0.2789488732814789,
"kl": 0.008213043212890625,
"learning_rate": 7.416006812042827e-07,
"loss": 0.0003,
"reward": 1.1324417181313038,
"reward_std": 0.835416778922081,
"rewards/cosine_scaled_reward": 0.15997080132365227,
"rewards/format_reward": 0.812500013038516,
"step": 212
},
{
"completion_length": 1215.229206085205,
"epoch": 0.24342857142857144,
"grad_norm": 0.29332393407821655,
"kl": 0.008787155151367188,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0004,
"reward": 1.1256815567612648,
"reward_std": 0.7665913105010986,
"rewards/cosine_scaled_reward": 0.10450743697583675,
"rewards/format_reward": 0.916666679084301,
"step": 213
},
{
"completion_length": 1867.020866394043,
"epoch": 0.24457142857142858,
"grad_norm": 0.320909321308136,
"kl": 0.0081024169921875,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0003,
"reward": 0.7646492760013643,
"reward_std": 0.8905698489397764,
"rewards/cosine_scaled_reward": 0.017741285264492035,
"rewards/format_reward": 0.7291666772216558,
"step": 214
},
{
"completion_length": 1342.937515258789,
"epoch": 0.24571428571428572,
"grad_norm": 0.25344061851501465,
"kl": 0.0052165985107421875,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0002,
"reward": 0.5551245659589767,
"reward_std": 0.3942173570394516,
"rewards/cosine_scaled_reward": -0.15993775241076946,
"rewards/format_reward": 0.875,
"step": 215
},
{
"completion_length": 1305.9791717529297,
"epoch": 0.24685714285714286,
"grad_norm": 0.25426533818244934,
"kl": 0.007297515869140625,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0003,
"reward": 1.0224211241584271,
"reward_std": 0.8492502048611641,
"rewards/cosine_scaled_reward": 0.09454387426376343,
"rewards/format_reward": 0.8333333414047956,
"step": 216
},
{
"completion_length": 1446.4166946411133,
"epoch": 0.248,
"grad_norm": 0.23595848679542542,
"kl": 0.0071563720703125,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0003,
"reward": 1.0635684814769775,
"reward_std": 0.8497135229408741,
"rewards/cosine_scaled_reward": 0.10470088990405202,
"rewards/format_reward": 0.8541666716337204,
"step": 217
},
{
"completion_length": 1458.9167022705078,
"epoch": 0.24914285714285714,
"grad_norm": 0.23519130051136017,
"kl": 0.007747650146484375,
"learning_rate": 7.243820139034464e-07,
"loss": 0.0003,
"reward": 0.5358998300507665,
"reward_std": 0.8287533260881901,
"rewards/cosine_scaled_reward": -0.1799667701125145,
"rewards/format_reward": 0.8958333507180214,
"step": 218
},
{
"completion_length": 1337.0000381469727,
"epoch": 0.2502857142857143,
"grad_norm": 0.23715050518512726,
"kl": 0.0068759918212890625,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0003,
"reward": 1.036239080131054,
"reward_std": 0.7431819960474968,
"rewards/cosine_scaled_reward": 0.0910361991263926,
"rewards/format_reward": 0.8541666716337204,
"step": 219
},
{
"completion_length": 1415.9166831970215,
"epoch": 0.25142857142857145,
"grad_norm": 0.28580737113952637,
"kl": 0.008144378662109375,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0003,
"reward": 0.3407529406249523,
"reward_std": 0.472426300868392,
"rewards/cosine_scaled_reward": -0.2462902208790183,
"rewards/format_reward": 0.8333333432674408,
"step": 220
},
{
"completion_length": 1271.3958587646484,
"epoch": 0.25257142857142856,
"grad_norm": 0.2041638046503067,
"kl": 0.0054531097412109375,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0002,
"reward": 0.9021317735314369,
"reward_std": 0.49666406959295273,
"rewards/cosine_scaled_reward": 0.013565851375460625,
"rewards/format_reward": 0.875,
"step": 221
},
{
"completion_length": 1608.208366394043,
"epoch": 0.2537142857142857,
"grad_norm": 0.24548400938510895,
"kl": 0.006893157958984375,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0003,
"reward": 0.9311691895127296,
"reward_std": 0.7908135317265987,
"rewards/cosine_scaled_reward": 0.0489178872667253,
"rewards/format_reward": 0.8333333432674408,
"step": 222
},
{
"completion_length": 1477.6041870117188,
"epoch": 0.25485714285714284,
"grad_norm": 0.2579537034034729,
"kl": 0.007343292236328125,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0003,
"reward": 0.6923428289592266,
"reward_std": 0.5176126770675182,
"rewards/cosine_scaled_reward": -0.03924527019262314,
"rewards/format_reward": 0.7708333507180214,
"step": 223
},
{
"completion_length": 2049.041702270508,
"epoch": 0.256,
"grad_norm": 0.18312445282936096,
"kl": 0.0058765411376953125,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0002,
"reward": 0.6417203694581985,
"reward_std": 0.7141322754323483,
"rewards/cosine_scaled_reward": -0.07497315760701895,
"rewards/format_reward": 0.791666679084301,
"step": 224
},
{
"completion_length": 2165.0000915527344,
"epoch": 0.2571428571428571,
"grad_norm": 0.3362411558628082,
"kl": 0.018795013427734375,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0008,
"reward": 0.4098323443904519,
"reward_std": 0.903562568128109,
"rewards/cosine_scaled_reward": -0.12841718492563814,
"rewards/format_reward": 0.6666666883975267,
"step": 225
},
{
"completion_length": 1517.020866394043,
"epoch": 0.2582857142857143,
"grad_norm": 0.20612592995166779,
"kl": 0.0065460205078125,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0003,
"reward": 1.2147440239787102,
"reward_std": 0.8476376309990883,
"rewards/cosine_scaled_reward": 0.15945532266050577,
"rewards/format_reward": 0.8958333358168602,
"step": 226
},
{
"completion_length": 1337.7917098999023,
"epoch": 0.25942857142857145,
"grad_norm": 0.29225802421569824,
"kl": 0.010477066040039062,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0004,
"reward": 0.8250191137194633,
"reward_std": 0.7360146790742874,
"rewards/cosine_scaled_reward": -0.06665713712573051,
"rewards/format_reward": 0.9583333358168602,
"step": 227
},
{
"completion_length": 1411.6666984558105,
"epoch": 0.26057142857142856,
"grad_norm": 0.2814786732196808,
"kl": 0.00687408447265625,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0003,
"reward": 1.0481851011281833,
"reward_std": 0.7632241751998663,
"rewards/cosine_scaled_reward": 0.11784252151846886,
"rewards/format_reward": 0.8125000055879354,
"step": 228
},
{
"completion_length": 1429.1666870117188,
"epoch": 0.26171428571428573,
"grad_norm": 0.26221880316734314,
"kl": 0.008052825927734375,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0003,
"reward": 0.7309114846866578,
"reward_std": 0.46567995101213455,
"rewards/cosine_scaled_reward": -0.06162761617451906,
"rewards/format_reward": 0.8541666679084301,
"step": 229
},
{
"completion_length": 2006.1667175292969,
"epoch": 0.26285714285714284,
"grad_norm": 0.24703645706176758,
"kl": 0.008714675903320312,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0003,
"reward": 0.3682295884937048,
"reward_std": 0.6135462559759617,
"rewards/cosine_scaled_reward": -0.17005188344046474,
"rewards/format_reward": 0.7083333507180214,
"step": 230
},
{
"completion_length": 1523.9375305175781,
"epoch": 0.264,
"grad_norm": 0.21581248939037323,
"kl": 0.00759124755859375,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0003,
"reward": 1.0537027399986982,
"reward_std": 0.791351318359375,
"rewards/cosine_scaled_reward": 0.058101359754800797,
"rewards/format_reward": 0.9375,
"step": 231
},
{
"completion_length": 1894.0833587646484,
"epoch": 0.2651428571428571,
"grad_norm": 0.23024173080921173,
"kl": 0.0105438232421875,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0004,
"reward": 0.3341958597302437,
"reward_std": 0.666959872469306,
"rewards/cosine_scaled_reward": -0.2079020773526281,
"rewards/format_reward": 0.7500000093132257,
"step": 232
},
{
"completion_length": 1035.2500228881836,
"epoch": 0.2662857142857143,
"grad_norm": 0.2399219274520874,
"kl": 0.006580352783203125,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0003,
"reward": 0.7500397872645408,
"reward_std": 0.6901755221188068,
"rewards/cosine_scaled_reward": -0.11456345673650503,
"rewards/format_reward": 0.9791666716337204,
"step": 233
},
{
"completion_length": 1623.7083892822266,
"epoch": 0.2674285714285714,
"grad_norm": 0.25343385338783264,
"kl": 0.0087127685546875,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0003,
"reward": 0.5469427425414324,
"reward_std": 0.6777470409870148,
"rewards/cosine_scaled_reward": -0.12236198072787374,
"rewards/format_reward": 0.7916666716337204,
"step": 234
},
{
"completion_length": 1307.1458587646484,
"epoch": 0.26857142857142857,
"grad_norm": 0.2843048870563507,
"kl": 0.010669708251953125,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0004,
"reward": 1.1278857234865427,
"reward_std": 0.7025517448782921,
"rewards/cosine_scaled_reward": 0.13685949333012104,
"rewards/format_reward": 0.8541666828095913,
"step": 235
},
{
"completion_length": 1815.020866394043,
"epoch": 0.26971428571428574,
"grad_norm": 0.21867913007736206,
"kl": 0.0071773529052734375,
"learning_rate": 6.710139192768694e-07,
"loss": 0.0003,
"reward": 0.7069348245859146,
"reward_std": 0.9415349438786507,
"rewards/cosine_scaled_reward": -0.031949267257004976,
"rewards/format_reward": 0.7708333414047956,
"step": 236
},
{
"completion_length": 1392.4167022705078,
"epoch": 0.27085714285714285,
"grad_norm": 0.23793146014213562,
"kl": 0.0063190460205078125,
"learning_rate": 6.679851303883891e-07,
"loss": 0.0003,
"reward": 0.8629608564078808,
"reward_std": 0.4519681539386511,
"rewards/cosine_scaled_reward": -0.006019574124366045,
"rewards/format_reward": 0.875,
"step": 237
},
{
"completion_length": 1157.6667098999023,
"epoch": 0.272,
"grad_norm": 0.23921653628349304,
"kl": 0.00856781005859375,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0003,
"reward": 1.1124465055763721,
"reward_std": 0.6771237980574369,
"rewards/cosine_scaled_reward": 0.07705656159669161,
"rewards/format_reward": 0.9583333432674408,
"step": 238
},
{
"completion_length": 1381.0208778381348,
"epoch": 0.27314285714285713,
"grad_norm": 0.21902450919151306,
"kl": 0.005985260009765625,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0002,
"reward": 1.3559186151251197,
"reward_std": 0.47896091267466545,
"rewards/cosine_scaled_reward": 0.2821259554475546,
"rewards/format_reward": 0.7916666679084301,
"step": 239
},
{
"completion_length": 1726.4583740234375,
"epoch": 0.2742857142857143,
"grad_norm": 0.32664069533348083,
"kl": 0.012561798095703125,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0005,
"reward": 0.33731127402279526,
"reward_std": 0.5951757170259953,
"rewards/cosine_scaled_reward": -0.19592771586030722,
"rewards/format_reward": 0.729166679084301,
"step": 240
},
{
"completion_length": 1596.1667098999023,
"epoch": 0.2754285714285714,
"grad_norm": 0.2804366648197174,
"kl": 0.009317398071289062,
"learning_rate": 6.558139508961654e-07,
"loss": 0.0004,
"reward": 0.3176069939509034,
"reward_std": 0.5079055763781071,
"rewards/cosine_scaled_reward": -0.24744650442153215,
"rewards/format_reward": 0.8125000055879354,
"step": 241
},
{
"completion_length": 1209.8750381469727,
"epoch": 0.2765714285714286,
"grad_norm": 0.4020167887210846,
"kl": 0.01398468017578125,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0006,
"reward": 0.7724186182022095,
"reward_std": 0.48072919889818877,
"rewards/cosine_scaled_reward": -0.08254071744158864,
"rewards/format_reward": 0.9375000074505806,
"step": 242
},
{
"completion_length": 1556.895866394043,
"epoch": 0.2777142857142857,
"grad_norm": 0.2093649059534073,
"kl": 0.0073986053466796875,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0003,
"reward": 0.7962484285235405,
"reward_std": 0.7191448770463467,
"rewards/cosine_scaled_reward": -0.018542497418820858,
"rewards/format_reward": 0.8333333432674408,
"step": 243
},
{
"completion_length": 1644.9792022705078,
"epoch": 0.27885714285714286,
"grad_norm": 0.25173234939575195,
"kl": 0.008056640625,
"learning_rate": 6.466308972251785e-07,
"loss": 0.0003,
"reward": 1.000451274216175,
"reward_std": 0.8420288115739822,
"rewards/cosine_scaled_reward": 0.08355897013098001,
"rewards/format_reward": 0.8333333414047956,
"step": 244
},
{
"completion_length": 1743.6250457763672,
"epoch": 0.28,
"grad_norm": 0.24979065358638763,
"kl": 0.0075702667236328125,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0003,
"reward": 0.6352044828236103,
"reward_std": 0.9859512895345688,
"rewards/cosine_scaled_reward": -0.03656444209627807,
"rewards/format_reward": 0.7083333414047956,
"step": 245
},
{
"completion_length": 1504.3542251586914,
"epoch": 0.28114285714285714,
"grad_norm": 0.2286195158958435,
"kl": 0.00997161865234375,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0004,
"reward": 0.6435268502682447,
"reward_std": 0.7222435772418976,
"rewards/cosine_scaled_reward": -0.11573660443536937,
"rewards/format_reward": 0.875,
"step": 246
},
{
"completion_length": 1978.8959045410156,
"epoch": 0.2822857142857143,
"grad_norm": 0.2635015547275543,
"kl": 0.009313583374023438,
"learning_rate": 6.374054580489873e-07,
"loss": 0.0004,
"reward": 0.2780038760975003,
"reward_std": 0.4847924932837486,
"rewards/cosine_scaled_reward": -0.19433141965419054,
"rewards/format_reward": 0.6666666734963655,
"step": 247
},
{
"completion_length": 1440.7083358764648,
"epoch": 0.2834285714285714,
"grad_norm": 0.29032102227211,
"kl": 0.010227203369140625,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0004,
"reward": 1.1146562099456787,
"reward_std": 0.5486158076673746,
"rewards/cosine_scaled_reward": 0.18232804723083973,
"rewards/format_reward": 0.7500000055879354,
"step": 248
},
{
"completion_length": 1401.3541946411133,
"epoch": 0.2845714285714286,
"grad_norm": 0.25068965554237366,
"kl": 0.012468338012695312,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0005,
"reward": 1.2556766234338284,
"reward_std": 0.6144618764519691,
"rewards/cosine_scaled_reward": 0.21117158699780703,
"rewards/format_reward": 0.8333333358168602,
"step": 249
},
{
"completion_length": 1540.958381652832,
"epoch": 0.2857142857142857,
"grad_norm": 0.27915748953819275,
"kl": 0.013950347900390625,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0006,
"reward": 0.8930262625217438,
"reward_std": 0.6684892605990171,
"rewards/cosine_scaled_reward": 0.00901312252972275,
"rewards/format_reward": 0.8750000149011612,
"step": 250
},
{
"completion_length": 1162.5416984558105,
"epoch": 0.28685714285714287,
"grad_norm": 0.3440181314945221,
"kl": 0.013868331909179688,
"learning_rate": 6.25045936022246e-07,
"loss": 0.0006,
"reward": 0.8275420293211937,
"reward_std": 0.6679180320352316,
"rewards/cosine_scaled_reward": -0.0445623523555696,
"rewards/format_reward": 0.9166666716337204,
"step": 251
},
{
"completion_length": 1791.7083740234375,
"epoch": 0.288,
"grad_norm": 0.25717219710350037,
"kl": 0.011442184448242188,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0005,
"reward": 0.5102682635188103,
"reward_std": 0.5698316707275808,
"rewards/cosine_scaled_reward": -0.1406992208212614,
"rewards/format_reward": 0.7916666828095913,
"step": 252
},
{
"completion_length": 1678.583381652832,
"epoch": 0.28914285714285715,
"grad_norm": 0.2983105778694153,
"kl": 0.013790130615234375,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0006,
"reward": 0.6175373089499772,
"reward_std": 0.8155984878540039,
"rewards/cosine_scaled_reward": -0.07664802484214306,
"rewards/format_reward": 0.7708333432674408,
"step": 253
},
{
"completion_length": 1746.1667289733887,
"epoch": 0.29028571428571426,
"grad_norm": 0.36004897952079773,
"kl": 0.013019561767578125,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0005,
"reward": 0.724624989379663,
"reward_std": 0.8076631315052509,
"rewards/cosine_scaled_reward": 0.008145819883793592,
"rewards/format_reward": 0.7083333507180214,
"step": 254
},
{
"completion_length": 1871.0833892822266,
"epoch": 0.2914285714285714,
"grad_norm": 0.21529468894004822,
"kl": 0.01010894775390625,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0004,
"reward": 0.23611869476735592,
"reward_std": 0.5141230337321758,
"rewards/cosine_scaled_reward": -0.2673573372885585,
"rewards/format_reward": 0.7708333432674408,
"step": 255
},
{
"completion_length": 1561.4375686645508,
"epoch": 0.2925714285714286,
"grad_norm": 0.2703370749950409,
"kl": 0.011600494384765625,
"learning_rate": 6.095153756157051e-07,
"loss": 0.0005,
"reward": 0.8081665523350239,
"reward_std": 0.6887451633810997,
"rewards/cosine_scaled_reward": -0.033416735008358955,
"rewards/format_reward": 0.8750000149011612,
"step": 256
},
{
"completion_length": 1959.0000686645508,
"epoch": 0.2937142857142857,
"grad_norm": 0.23733022809028625,
"kl": 0.0087432861328125,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0003,
"reward": 1.0470282435417175,
"reward_std": 0.8330266922712326,
"rewards/cosine_scaled_reward": 0.12768075708299875,
"rewards/format_reward": 0.7916666697710752,
"step": 257
},
{
"completion_length": 1762.2083892822266,
"epoch": 0.2948571428571429,
"grad_norm": 0.26813915371894836,
"kl": 0.008697509765625,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0003,
"reward": 0.9685253538191319,
"reward_std": 0.8117741793394089,
"rewards/cosine_scaled_reward": 0.03634601645171642,
"rewards/format_reward": 0.8958333432674408,
"step": 258
},
{
"completion_length": 1260.0416946411133,
"epoch": 0.296,
"grad_norm": 0.3185918927192688,
"kl": 0.010593414306640625,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0004,
"reward": 1.0911660604178905,
"reward_std": 0.5376893002539873,
"rewards/cosine_scaled_reward": 0.09766635159030557,
"rewards/format_reward": 0.8958333432674408,
"step": 259
},
{
"completion_length": 1093.0000228881836,
"epoch": 0.29714285714285715,
"grad_norm": 0.27099287509918213,
"kl": 0.0073299407958984375,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0003,
"reward": 1.3254448138177395,
"reward_std": 0.5788962724618614,
"rewards/cosine_scaled_reward": 0.1731390468776226,
"rewards/format_reward": 0.9791666716337204,
"step": 260
},
{
"completion_length": 2024.1041717529297,
"epoch": 0.29828571428571427,
"grad_norm": 0.20667137205600739,
"kl": 0.01210784912109375,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0005,
"reward": 0.3676602290943265,
"reward_std": 0.42088818456977606,
"rewards/cosine_scaled_reward": -0.12866988312453032,
"rewards/format_reward": 0.625,
"step": 261
},
{
"completion_length": 1456.5417098999023,
"epoch": 0.29942857142857143,
"grad_norm": 0.3141776919364929,
"kl": 0.01071929931640625,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0004,
"reward": 0.4814309529028833,
"reward_std": 0.6453931964933872,
"rewards/cosine_scaled_reward": -0.16553453914821148,
"rewards/format_reward": 0.8125000111758709,
"step": 262
},
{
"completion_length": 1237.5417098999023,
"epoch": 0.30057142857142854,
"grad_norm": 0.2402588427066803,
"kl": 0.007175445556640625,
"learning_rate": 5.87655029499542e-07,
"loss": 0.0003,
"reward": 0.7148367003537714,
"reward_std": 0.6169497780501842,
"rewards/cosine_scaled_reward": -0.10091499425470829,
"rewards/format_reward": 0.916666679084301,
"step": 263
},
{
"completion_length": 1231.3750228881836,
"epoch": 0.3017142857142857,
"grad_norm": 0.24805474281311035,
"kl": 0.007282257080078125,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0003,
"reward": 0.6490657306276262,
"reward_std": 0.6103183180093765,
"rewards/cosine_scaled_reward": -0.1338004870340228,
"rewards/format_reward": 0.9166666679084301,
"step": 264
},
{
"completion_length": 1342.270881652832,
"epoch": 0.3028571428571429,
"grad_norm": 0.2401592880487442,
"kl": 0.00934600830078125,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0004,
"reward": 1.1422894559800625,
"reward_std": 0.7356252074241638,
"rewards/cosine_scaled_reward": 0.09197801724076271,
"rewards/format_reward": 0.9583333432674408,
"step": 265
},
{
"completion_length": 1498.458366394043,
"epoch": 0.304,
"grad_norm": 0.2557384967803955,
"kl": 0.00916290283203125,
"learning_rate": 5.78255733788191e-07,
"loss": 0.0004,
"reward": 0.6136485729366541,
"reward_std": 0.7062950804829597,
"rewards/cosine_scaled_reward": -0.13067573634907603,
"rewards/format_reward": 0.8750000111758709,
"step": 266
},
{
"completion_length": 2083.458351135254,
"epoch": 0.30514285714285716,
"grad_norm": 0.24998563528060913,
"kl": 0.016796112060546875,
"learning_rate": 5.751196772469237e-07,
"loss": 0.0007,
"reward": 0.18751574913039804,
"reward_std": 0.5016277078539133,
"rewards/cosine_scaled_reward": -0.20832546008750796,
"rewards/format_reward": 0.6041666697710752,
"step": 267
},
{
"completion_length": 1216.0000534057617,
"epoch": 0.3062857142857143,
"grad_norm": 0.3383597433567047,
"kl": 0.012638092041015625,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0005,
"reward": 0.7704824209213257,
"reward_std": 0.6409645788371563,
"rewards/cosine_scaled_reward": -0.07309213420376182,
"rewards/format_reward": 0.9166666716337204,
"step": 268
},
{
"completion_length": 1684.208381652832,
"epoch": 0.30742857142857144,
"grad_norm": 0.2874971032142639,
"kl": 0.009983062744140625,
"learning_rate": 5.688440441781398e-07,
"loss": 0.0004,
"reward": 0.6531256909947842,
"reward_std": 0.661817979067564,
"rewards/cosine_scaled_reward": -0.058853823225945234,
"rewards/format_reward": 0.7708333414047956,
"step": 269
},
{
"completion_length": 1621.5000762939453,
"epoch": 0.30857142857142855,
"grad_norm": 0.20542392134666443,
"kl": 0.010227203369140625,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0004,
"reward": 0.9151211036369205,
"reward_std": 0.7712918408215046,
"rewards/cosine_scaled_reward": 0.00964385224506259,
"rewards/format_reward": 0.8958333358168602,
"step": 270
},
{
"completion_length": 1346.1042175292969,
"epoch": 0.3097142857142857,
"grad_norm": 0.258817583322525,
"kl": 0.01062774658203125,
"learning_rate": 5.625647374256061e-07,
"loss": 0.0004,
"reward": 1.3745552115142345,
"reward_std": 0.6398672200739384,
"rewards/cosine_scaled_reward": 0.23936093723023077,
"rewards/format_reward": 0.895833333954215,
"step": 271
},
{
"completion_length": 1769.3958740234375,
"epoch": 0.31085714285714283,
"grad_norm": 0.2116776555776596,
"kl": 0.011043548583984375,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0004,
"reward": 0.827501755207777,
"reward_std": 0.5647195130586624,
"rewards/cosine_scaled_reward": -0.0029158147517591715,
"rewards/format_reward": 0.8333333432674408,
"step": 272
},
{
"completion_length": 1371.8958740234375,
"epoch": 0.312,
"grad_norm": 0.2546485960483551,
"kl": 0.010175704956054688,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0004,
"reward": 0.9816151913255453,
"reward_std": 0.7047781832516193,
"rewards/cosine_scaled_reward": 0.06372424028813839,
"rewards/format_reward": 0.8541666772216558,
"step": 273
},
{
"completion_length": 1070.8333702087402,
"epoch": 0.31314285714285717,
"grad_norm": 0.29803934693336487,
"kl": 0.012859344482421875,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0005,
"reward": 1.2880802303552628,
"reward_std": 0.7172955796122551,
"rewards/cosine_scaled_reward": 0.1544567703604116,
"rewards/format_reward": 0.9791666716337204,
"step": 274
},
{
"completion_length": 1502.5417098999023,
"epoch": 0.3142857142857143,
"grad_norm": 0.19667911529541016,
"kl": 0.0100860595703125,
"learning_rate": 5.5e-07,
"loss": 0.0004,
"reward": 1.1533876582980156,
"reward_std": 0.7076623123139143,
"rewards/cosine_scaled_reward": 0.14961049146950245,
"rewards/format_reward": 0.8541666716337204,
"step": 275
},
{
"completion_length": 1537.7916946411133,
"epoch": 0.31542857142857145,
"grad_norm": 0.27277708053588867,
"kl": 0.016254425048828125,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0006,
"reward": 0.8316274359822273,
"reward_std": 0.9768596217036247,
"rewards/cosine_scaled_reward": 0.019980370067059994,
"rewards/format_reward": 0.791666679084301,
"step": 276
},
{
"completion_length": 1527.833381652832,
"epoch": 0.31657142857142856,
"grad_norm": 0.451528936624527,
"kl": 0.01665496826171875,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0007,
"reward": 0.9556568302214146,
"reward_std": 0.8683530241250992,
"rewards/cosine_scaled_reward": 0.09241172997280955,
"rewards/format_reward": 0.7708333432674408,
"step": 277
},
{
"completion_length": 1412.8542175292969,
"epoch": 0.3177142857142857,
"grad_norm": 0.4396210312843323,
"kl": 0.011110305786132812,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0004,
"reward": 1.1146881133317947,
"reward_std": 0.6263987999409437,
"rewards/cosine_scaled_reward": 0.11984403152018785,
"rewards/format_reward": 0.8750000074505806,
"step": 278
},
{
"completion_length": 1693.0000610351562,
"epoch": 0.31885714285714284,
"grad_norm": 0.4070056974887848,
"kl": 0.016384124755859375,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0007,
"reward": 0.6888991240411997,
"reward_std": 0.8856545202434063,
"rewards/cosine_scaled_reward": -0.05138378031551838,
"rewards/format_reward": 0.7916666939854622,
"step": 279
},
{
"completion_length": 1793.3333892822266,
"epoch": 0.32,
"grad_norm": 0.3356776535511017,
"kl": 0.01377105712890625,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0006,
"reward": 1.1871663890779018,
"reward_std": 1.0228201150894165,
"rewards/cosine_scaled_reward": 0.21858319267630577,
"rewards/format_reward": 0.7500000111758709,
"step": 280
},
{
"completion_length": 2391.250015258789,
"epoch": 0.3211428571428571,
"grad_norm": 0.20177552103996277,
"kl": 0.01556396484375,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0006,
"reward": 0.35207285173237324,
"reward_std": 0.8479718416929245,
"rewards/cosine_scaled_reward": -0.10521360114216805,
"rewards/format_reward": 0.5625000018626451,
"step": 281
},
{
"completion_length": 1301.4167098999023,
"epoch": 0.3222857142857143,
"grad_norm": 0.3195320665836334,
"kl": 0.010005950927734375,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0004,
"reward": 0.8210245547816157,
"reward_std": 0.5980393732897937,
"rewards/cosine_scaled_reward": -0.016571074724197388,
"rewards/format_reward": 0.854166679084301,
"step": 282
},
{
"completion_length": 2050.916732788086,
"epoch": 0.32342857142857145,
"grad_norm": 0.2939506769180298,
"kl": 0.0125579833984375,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0005,
"reward": 0.9690306037664413,
"reward_std": 1.016658142209053,
"rewards/cosine_scaled_reward": 0.10951527790166438,
"rewards/format_reward": 0.7500000055879354,
"step": 283
},
{
"completion_length": 1105.9583587646484,
"epoch": 0.32457142857142857,
"grad_norm": 0.2861945629119873,
"kl": 0.008968353271484375,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0004,
"reward": 0.7402211502194405,
"reward_std": 0.7317556664347649,
"rewards/cosine_scaled_reward": -0.11947278678417206,
"rewards/format_reward": 0.9791666716337204,
"step": 284
},
{
"completion_length": 1011.2917060852051,
"epoch": 0.32571428571428573,
"grad_norm": 0.2673451602458954,
"kl": 0.010982513427734375,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0004,
"reward": 1.0782419480383396,
"reward_std": 0.5894357562065125,
"rewards/cosine_scaled_reward": 0.05995429493486881,
"rewards/format_reward": 0.9583333432674408,
"step": 285
},
{
"completion_length": 1348.2916870117188,
"epoch": 0.32685714285714285,
"grad_norm": 0.28173941373825073,
"kl": 0.0141448974609375,
"learning_rate": 5.154764373429315e-07,
"loss": 0.0006,
"reward": 0.7097440287470818,
"reward_std": 0.824833694845438,
"rewards/cosine_scaled_reward": -0.08262801356613636,
"rewards/format_reward": 0.8750000055879354,
"step": 286
},
{
"completion_length": 1311.8750457763672,
"epoch": 0.328,
"grad_norm": 0.3286299407482147,
"kl": 0.015102386474609375,
"learning_rate": 5.123449705004581e-07,
"loss": 0.0006,
"reward": 0.7719477713108063,
"reward_std": 0.6392118521034718,
"rewards/cosine_scaled_reward": 0.021390528418123722,
"rewards/format_reward": 0.7291666679084301,
"step": 287
},
{
"completion_length": 1523.4167251586914,
"epoch": 0.3291428571428571,
"grad_norm": 0.3453865349292755,
"kl": 0.01168060302734375,
"learning_rate": 5.09215338910999e-07,
"loss": 0.0005,
"reward": 0.7673552545456914,
"reward_std": 0.8511558324098587,
"rewards/cosine_scaled_reward": -0.06423905096016824,
"rewards/format_reward": 0.8958333432674408,
"step": 288
},
{
"completion_length": 1484.6875343322754,
"epoch": 0.3302857142857143,
"grad_norm": 0.6707473993301392,
"kl": 0.01996612548828125,
"learning_rate": 5.060876951083828e-07,
"loss": 0.0008,
"reward": 0.7081136375200003,
"reward_std": 0.5418885201215744,
"rewards/cosine_scaled_reward": -0.03135988023132086,
"rewards/format_reward": 0.7708333395421505,
"step": 289
},
{
"completion_length": 1023.958366394043,
"epoch": 0.3314285714285714,
"grad_norm": 0.28844523429870605,
"kl": 0.009992599487304688,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0004,
"reward": 1.0698336996138096,
"reward_std": 0.7318692095577717,
"rewards/cosine_scaled_reward": 0.05575018119998276,
"rewards/format_reward": 0.9583333358168602,
"step": 290
},
{
"completion_length": 1331.9583740234375,
"epoch": 0.3325714285714286,
"grad_norm": 0.2491942197084427,
"kl": 0.011157989501953125,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0004,
"reward": 0.9108222480863333,
"reward_std": 0.8276476263999939,
"rewards/cosine_scaled_reward": -0.013338901073439047,
"rewards/format_reward": 0.9375,
"step": 291
},
{
"completion_length": 1538.854232788086,
"epoch": 0.33371428571428574,
"grad_norm": 0.2686867117881775,
"kl": 0.011173248291015625,
"learning_rate": 4.967182142620745e-07,
"loss": 0.0004,
"reward": 0.6097328625619411,
"reward_std": 0.5690296031534672,
"rewards/cosine_scaled_reward": -0.15346692875027657,
"rewards/format_reward": 0.9166666865348816,
"step": 292
},
{
"completion_length": 1027.208351135254,
"epoch": 0.33485714285714285,
"grad_norm": 0.26298969984054565,
"kl": 0.00969696044921875,
"learning_rate": 4.93600044896063e-07,
"loss": 0.0004,
"reward": 0.8816844671964645,
"reward_std": 0.501637440174818,
"rewards/cosine_scaled_reward": -0.05915777012705803,
"rewards/format_reward": 1.0,
"step": 293
},
{
"completion_length": 1876.2291793823242,
"epoch": 0.336,
"grad_norm": 0.2713903784751892,
"kl": 0.015285491943359375,
"learning_rate": 4.904846243842949e-07,
"loss": 0.0006,
"reward": 0.8239198550581932,
"reward_std": 0.8811575844883919,
"rewards/cosine_scaled_reward": -0.004706733860075474,
"rewards/format_reward": 0.833333333954215,
"step": 294
},
{
"completion_length": 1408.4792022705078,
"epoch": 0.33714285714285713,
"grad_norm": 0.357388973236084,
"kl": 0.0163726806640625,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0007,
"reward": 1.1488236412405968,
"reward_std": 0.6806459426879883,
"rewards/cosine_scaled_reward": 0.1160784661769867,
"rewards/format_reward": 0.9166666679084301,
"step": 295
},
{
"completion_length": 1524.8333702087402,
"epoch": 0.3382857142857143,
"grad_norm": 0.3012818694114685,
"kl": 0.01403045654296875,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0006,
"reward": 0.6681146812625229,
"reward_std": 0.5912859179079533,
"rewards/cosine_scaled_reward": -0.11385933961719275,
"rewards/format_reward": 0.8958333507180214,
"step": 296
},
{
"completion_length": 2066.229202270508,
"epoch": 0.3394285714285714,
"grad_norm": 0.3773079514503479,
"kl": 0.02051544189453125,
"learning_rate": 4.811563736721829e-07,
"loss": 0.0008,
"reward": 0.6007692717248574,
"reward_std": 0.8569255843758583,
"rewards/cosine_scaled_reward": -0.022532058879733086,
"rewards/format_reward": 0.6458333544433117,
"step": 297
},
{
"completion_length": 1489.8541946411133,
"epoch": 0.3405714285714286,
"grad_norm": 0.28374582529067993,
"kl": 0.011333465576171875,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0005,
"reward": 0.8617406606208533,
"reward_std": 0.6877223812043667,
"rewards/cosine_scaled_reward": 0.05587031855247915,
"rewards/format_reward": 0.7500000149011612,
"step": 298
},
{
"completion_length": 1407.5417022705078,
"epoch": 0.3417142857142857,
"grad_norm": 0.35291406512260437,
"kl": 0.013866424560546875,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0006,
"reward": 0.8531467970460653,
"reward_std": 0.6483141556382179,
"rewards/cosine_scaled_reward": -0.021343314554542303,
"rewards/format_reward": 0.8958333507180214,
"step": 299
},
{
"completion_length": 1619.1042137145996,
"epoch": 0.34285714285714286,
"grad_norm": 0.3786706030368805,
"kl": 0.01715850830078125,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0007,
"reward": 0.6408427469432354,
"reward_std": 0.5885756351053715,
"rewards/cosine_scaled_reward": -0.08582865633070469,
"rewards/format_reward": 0.8125000149011612,
"step": 300
},
{
"completion_length": 1336.9375228881836,
"epoch": 0.344,
"grad_norm": 0.34971511363983154,
"kl": 0.01776885986328125,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0007,
"reward": 0.6967362184077501,
"reward_std": 0.5865316716954112,
"rewards/cosine_scaled_reward": -0.12038189405575395,
"rewards/format_reward": 0.9375000074505806,
"step": 301
},
{
"completion_length": 1755.6458854675293,
"epoch": 0.34514285714285714,
"grad_norm": 0.4290507137775421,
"kl": 0.027614593505859375,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0011,
"reward": 0.9262686646543443,
"reward_std": 0.850765410810709,
"rewards/cosine_scaled_reward": 0.08813432417809963,
"rewards/format_reward": 0.7500000074505806,
"step": 302
},
{
"completion_length": 1268.6250457763672,
"epoch": 0.3462857142857143,
"grad_norm": 0.34461113810539246,
"kl": 0.0146026611328125,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.0006,
"reward": 0.7479919455945492,
"reward_std": 0.7392286397516727,
"rewards/cosine_scaled_reward": -0.06350404699333012,
"rewards/format_reward": 0.8750000074505806,
"step": 303
},
{
"completion_length": 1241.6250305175781,
"epoch": 0.3474285714285714,
"grad_norm": 0.34643009305000305,
"kl": 0.012691497802734375,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0005,
"reward": 0.6787547403946519,
"reward_std": 0.644066970795393,
"rewards/cosine_scaled_reward": -0.10853931680321693,
"rewards/format_reward": 0.8958333395421505,
"step": 304
},
{
"completion_length": 1368.2708587646484,
"epoch": 0.3485714285714286,
"grad_norm": 0.27775081992149353,
"kl": 0.01265716552734375,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0005,
"reward": 0.6714181881397963,
"reward_std": 0.6255205329507589,
"rewards/cosine_scaled_reward": -0.13304092781618237,
"rewards/format_reward": 0.9375000149011612,
"step": 305
},
{
"completion_length": 1242.1667175292969,
"epoch": 0.3497142857142857,
"grad_norm": 0.3250581622123718,
"kl": 0.02001190185546875,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0008,
"reward": 1.175108065828681,
"reward_std": 0.6523913107812405,
"rewards/cosine_scaled_reward": 0.16047068312764168,
"rewards/format_reward": 0.8541666716337204,
"step": 306
},
{
"completion_length": 1049.9791831970215,
"epoch": 0.35085714285714287,
"grad_norm": 0.27137529850006104,
"kl": 0.00885009765625,
"learning_rate": 4.503031760712397e-07,
"loss": 0.0004,
"reward": 0.7276175357401371,
"reward_std": 0.7274761945009232,
"rewards/cosine_scaled_reward": -0.08410793542861938,
"rewards/format_reward": 0.8958333432674408,
"step": 307
},
{
"completion_length": 2084.3333892822266,
"epoch": 0.352,
"grad_norm": 0.3471006155014038,
"kl": 0.02095794677734375,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.0008,
"reward": 0.6365162413567305,
"reward_std": 0.7663756646215916,
"rewards/cosine_scaled_reward": -0.025491908192634583,
"rewards/format_reward": 0.6875000149011612,
"step": 308
},
{
"completion_length": 1840.1667022705078,
"epoch": 0.35314285714285715,
"grad_norm": 0.2802172899246216,
"kl": 0.019718170166015625,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0008,
"reward": 0.56682463362813,
"reward_std": 0.7632914148271084,
"rewards/cosine_scaled_reward": -0.11242103201220743,
"rewards/format_reward": 0.7916666772216558,
"step": 309
},
{
"completion_length": 1546.208396911621,
"epoch": 0.35428571428571426,
"grad_norm": 0.6738027930259705,
"kl": 0.025562286376953125,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.001,
"reward": 0.5175626166164875,
"reward_std": 0.6167891919612885,
"rewards/cosine_scaled_reward": -0.16830204287543893,
"rewards/format_reward": 0.8541666828095913,
"step": 310
},
{
"completion_length": 1196.2708892822266,
"epoch": 0.3554285714285714,
"grad_norm": 0.3456050455570221,
"kl": 0.013538360595703125,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.0005,
"reward": 1.0239670518785715,
"reward_std": 0.618289714679122,
"rewards/cosine_scaled_reward": 0.04323352035135031,
"rewards/format_reward": 0.9375000074505806,
"step": 311
},
{
"completion_length": 1250.6042022705078,
"epoch": 0.3565714285714286,
"grad_norm": 0.3434150218963623,
"kl": 0.02196502685546875,
"learning_rate": 4.350494089288943e-07,
"loss": 0.0009,
"reward": 1.3225273452699184,
"reward_std": 0.4847991270944476,
"rewards/cosine_scaled_reward": 0.2133469949476421,
"rewards/format_reward": 0.895833333954215,
"step": 312
},
{
"completion_length": 1696.4791831970215,
"epoch": 0.3577142857142857,
"grad_norm": 0.2879071533679962,
"kl": 0.02275848388671875,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0009,
"reward": 0.9180621989071369,
"reward_std": 0.7070650856476277,
"rewards/cosine_scaled_reward": 0.094447772949934,
"rewards/format_reward": 0.7291666716337204,
"step": 313
},
{
"completion_length": 1686.1250305175781,
"epoch": 0.3588571428571429,
"grad_norm": 0.26152166724205017,
"kl": 0.030666351318359375,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0012,
"reward": 0.8729692408815026,
"reward_std": 0.4160381481051445,
"rewards/cosine_scaled_reward": 0.0823179455474019,
"rewards/format_reward": 0.7083333432674408,
"step": 314
},
{
"completion_length": 1898.3125305175781,
"epoch": 0.36,
"grad_norm": 0.48658284544944763,
"kl": 0.04524993896484375,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0018,
"reward": 0.8393008848652244,
"reward_std": 0.5817160941660404,
"rewards/cosine_scaled_reward": 0.03423376381397247,
"rewards/format_reward": 0.7708333488553762,
"step": 315
},
{
"completion_length": 2036.4583892822266,
"epoch": 0.36114285714285715,
"grad_norm": 0.43442559242248535,
"kl": 0.045146942138671875,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0018,
"reward": 0.39593026289367117,
"reward_std": 0.6670566536486149,
"rewards/cosine_scaled_reward": -0.1874515525996685,
"rewards/format_reward": 0.7708333469927311,
"step": 316
},
{
"completion_length": 1617.8542022705078,
"epoch": 0.36228571428571427,
"grad_norm": 0.4115068018436432,
"kl": 0.02794647216796875,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0011,
"reward": 0.5094092178624123,
"reward_std": 0.5757773853838444,
"rewards/cosine_scaled_reward": -0.09946207702159882,
"rewards/format_reward": 0.7083333414047956,
"step": 317
},
{
"completion_length": 1241.1875228881836,
"epoch": 0.36342857142857143,
"grad_norm": 0.5369545221328735,
"kl": 0.02693939208984375,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.0011,
"reward": 0.561584232840687,
"reward_std": 0.5914283934980631,
"rewards/cosine_scaled_reward": -0.1775412478018552,
"rewards/format_reward": 0.9166666716337204,
"step": 318
},
{
"completion_length": 1330.9375457763672,
"epoch": 0.36457142857142855,
"grad_norm": 0.2861523926258087,
"kl": 0.016010284423828125,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0006,
"reward": 0.45444004982709885,
"reward_std": 0.46621063351631165,
"rewards/cosine_scaled_reward": -0.22069666720926762,
"rewards/format_reward": 0.8958333395421505,
"step": 319
},
{
"completion_length": 1153.9792022705078,
"epoch": 0.3657142857142857,
"grad_norm": 0.7957829236984253,
"kl": 0.04431343078613281,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0018,
"reward": 0.8477955907583237,
"reward_std": 0.5735172219574451,
"rewards/cosine_scaled_reward": -0.055268908850848675,
"rewards/format_reward": 0.9583333358168602,
"step": 320
},
{
"completion_length": 939.2291946411133,
"epoch": 0.3668571428571429,
"grad_norm": 0.34620678424835205,
"kl": 0.014583587646484375,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0006,
"reward": 1.0206873081624508,
"reward_std": 0.5673208367079496,
"rewards/cosine_scaled_reward": 0.031176931224763393,
"rewards/format_reward": 0.9583333358168602,
"step": 321
},
{
"completion_length": 1747.8750228881836,
"epoch": 0.368,
"grad_norm": 0.6450039744377136,
"kl": 0.06949234008789062,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0028,
"reward": 0.44154978170990944,
"reward_std": 0.7411049008369446,
"rewards/cosine_scaled_reward": -0.12297512916848063,
"rewards/format_reward": 0.6875000037252903,
"step": 322
},
{
"completion_length": 1514.3125457763672,
"epoch": 0.36914285714285716,
"grad_norm": 0.43106693029403687,
"kl": 0.040225982666015625,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0016,
"reward": 0.5237643220461905,
"reward_std": 0.7173379920423031,
"rewards/cosine_scaled_reward": -0.11311787366867065,
"rewards/format_reward": 0.7500000111758709,
"step": 323
},
{
"completion_length": 1323.6875228881836,
"epoch": 0.3702857142857143,
"grad_norm": 0.3708251416683197,
"kl": 0.024990081787109375,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.001,
"reward": 0.5681585122365505,
"reward_std": 0.7323919646441936,
"rewards/cosine_scaled_reward": -0.14300409331917763,
"rewards/format_reward": 0.854166679084301,
"step": 324
},
{
"completion_length": 1668.4583587646484,
"epoch": 0.37142857142857144,
"grad_norm": 0.4666387140750885,
"kl": 0.03742218017578125,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0015,
"reward": 0.8824842711910605,
"reward_std": 0.6758766230195761,
"rewards/cosine_scaled_reward": 0.03499212674796581,
"rewards/format_reward": 0.8125000037252903,
"step": 325
},
{
"completion_length": 1059.020851135254,
"epoch": 0.37257142857142855,
"grad_norm": 0.34716206789016724,
"kl": 0.019435882568359375,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0008,
"reward": 0.9567670961841941,
"reward_std": 0.6891666799783707,
"rewards/cosine_scaled_reward": 0.020050194929353893,
"rewards/format_reward": 0.9166666716337204,
"step": 326
},
{
"completion_length": 1584.8125381469727,
"epoch": 0.3737142857142857,
"grad_norm": 0.4085078537464142,
"kl": 0.02321624755859375,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0009,
"reward": 0.8718113675713539,
"reward_std": 0.9188407957553864,
"rewards/cosine_scaled_reward": 0.02965566364582628,
"rewards/format_reward": 0.812500013038516,
"step": 327
},
{
"completion_length": 1545.1041870117188,
"epoch": 0.37485714285714283,
"grad_norm": 0.49370771646499634,
"kl": 0.027523040771484375,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0011,
"reward": 0.4031808990985155,
"reward_std": 0.7104494869709015,
"rewards/cosine_scaled_reward": -0.20465956535190344,
"rewards/format_reward": 0.8125000204890966,
"step": 328
},
{
"completion_length": 898.4375228881836,
"epoch": 0.376,
"grad_norm": 0.4591941833496094,
"kl": 0.01458740234375,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0006,
"reward": 0.96208087913692,
"reward_std": 0.4870151877403259,
"rewards/cosine_scaled_reward": 0.012290460988879204,
"rewards/format_reward": 0.9375000074505806,
"step": 329
},
{
"completion_length": 1459.0000457763672,
"epoch": 0.37714285714285717,
"grad_norm": 0.645241379737854,
"kl": 0.0567169189453125,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0023,
"reward": 0.4072363208979368,
"reward_std": 0.7110217055305839,
"rewards/cosine_scaled_reward": -0.17138185133808292,
"rewards/format_reward": 0.750000013038516,
"step": 330
},
{
"completion_length": 1496.1250381469727,
"epoch": 0.3782857142857143,
"grad_norm": 0.6915489435195923,
"kl": 0.037700653076171875,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0015,
"reward": 0.7328273691236973,
"reward_std": 0.8054426815360785,
"rewards/cosine_scaled_reward": -0.050252995104528964,
"rewards/format_reward": 0.8333333469927311,
"step": 331
},
{
"completion_length": 1504.270881652832,
"epoch": 0.37942857142857145,
"grad_norm": 0.4096035361289978,
"kl": 0.03113555908203125,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0012,
"reward": 0.6928669670596719,
"reward_std": 0.5561900734901428,
"rewards/cosine_scaled_reward": -0.1014832123182714,
"rewards/format_reward": 0.8958333358168602,
"step": 332
},
{
"completion_length": 1285.3333587646484,
"epoch": 0.38057142857142856,
"grad_norm": 0.34817183017730713,
"kl": 0.03202056884765625,
"learning_rate": 3.72726140684072e-07,
"loss": 0.0013,
"reward": 0.9648001305758953,
"reward_std": 0.7506307512521744,
"rewards/cosine_scaled_reward": 0.0032333843410015106,
"rewards/format_reward": 0.9583333358168602,
"step": 333
},
{
"completion_length": 1845.9375610351562,
"epoch": 0.38171428571428573,
"grad_norm": 0.6175907850265503,
"kl": 0.06725311279296875,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0027,
"reward": 0.17733338894322515,
"reward_std": 0.5020285062491894,
"rewards/cosine_scaled_reward": -0.2654999643564224,
"rewards/format_reward": 0.7083333469927311,
"step": 334
},
{
"completion_length": 1298.895866394043,
"epoch": 0.38285714285714284,
"grad_norm": 0.28870290517807007,
"kl": 0.02230072021484375,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0009,
"reward": 0.8596498997649178,
"reward_std": 0.7761356085538864,
"rewards/cosine_scaled_reward": -0.018091744743287563,
"rewards/format_reward": 0.8958333507180214,
"step": 335
},
{
"completion_length": 1496.7500381469727,
"epoch": 0.384,
"grad_norm": 0.43910446763038635,
"kl": 0.038936614990234375,
"learning_rate": 3.641030065789562e-07,
"loss": 0.0016,
"reward": 0.9042559079825878,
"reward_std": 0.7216421309858561,
"rewards/cosine_scaled_reward": 0.04587792372331023,
"rewards/format_reward": 0.8125000111758709,
"step": 336
},
{
"completion_length": 1552.6250610351562,
"epoch": 0.3851428571428571,
"grad_norm": 0.5697144865989685,
"kl": 0.0431671142578125,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0017,
"reward": 0.5564747964963317,
"reward_std": 0.7052949294447899,
"rewards/cosine_scaled_reward": -0.15926262829452753,
"rewards/format_reward": 0.8750000149011612,
"step": 337
},
{
"completion_length": 1273.7500457763672,
"epoch": 0.3862857142857143,
"grad_norm": 0.6811116337776184,
"kl": 0.03277587890625,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0013,
"reward": 0.6950095873326063,
"reward_std": 0.5781379528343678,
"rewards/cosine_scaled_reward": -0.08999521844089031,
"rewards/format_reward": 0.8750000111758709,
"step": 338
},
{
"completion_length": 1495.3750305175781,
"epoch": 0.38742857142857146,
"grad_norm": 0.4718823730945587,
"kl": 0.0449981689453125,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0018,
"reward": 0.4238283894956112,
"reward_std": 0.58335055783391,
"rewards/cosine_scaled_reward": -0.17350250110030174,
"rewards/format_reward": 0.770833345130086,
"step": 339
},
{
"completion_length": 1411.208366394043,
"epoch": 0.38857142857142857,
"grad_norm": 0.38769999146461487,
"kl": 0.033802032470703125,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0014,
"reward": 0.5338742323219776,
"reward_std": 0.5776225738227367,
"rewards/cosine_scaled_reward": -0.17056290060281754,
"rewards/format_reward": 0.8750000074505806,
"step": 340
},
{
"completion_length": 1547.5417251586914,
"epoch": 0.38971428571428574,
"grad_norm": 0.7106971144676208,
"kl": 0.05750274658203125,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0023,
"reward": 1.1556610856205225,
"reward_std": 0.6674655824899673,
"rewards/cosine_scaled_reward": 0.16116386279463768,
"rewards/format_reward": 0.8333333432674408,
"step": 341
},
{
"completion_length": 1866.6250228881836,
"epoch": 0.39085714285714285,
"grad_norm": 0.8300604820251465,
"kl": 0.08011245727539062,
"learning_rate": 3.471051066897562e-07,
"loss": 0.0032,
"reward": 0.7030050987377763,
"reward_std": 1.0421876087784767,
"rewards/cosine_scaled_reward": -0.013080822303891182,
"rewards/format_reward": 0.7291666809469461,
"step": 342
},
{
"completion_length": 1520.3958587646484,
"epoch": 0.392,
"grad_norm": 0.41595587134361267,
"kl": 0.0402679443359375,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.0016,
"reward": 1.10817926004529,
"reward_std": 0.7470837235450745,
"rewards/cosine_scaled_reward": 0.12700629979372025,
"rewards/format_reward": 0.854166679084301,
"step": 343
},
{
"completion_length": 1331.5417022705078,
"epoch": 0.3931428571428571,
"grad_norm": 0.5218416452407837,
"kl": 0.045536041259765625,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.0018,
"reward": 1.484528623521328,
"reward_std": 0.6807431867346168,
"rewards/cosine_scaled_reward": 0.27351428056135774,
"rewards/format_reward": 0.9375,
"step": 344
},
{
"completion_length": 1333.645866394043,
"epoch": 0.3942857142857143,
"grad_norm": 0.5664793848991394,
"kl": 0.0484161376953125,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0019,
"reward": 1.0393912829458714,
"reward_std": 0.7020993046462536,
"rewards/cosine_scaled_reward": 0.06136229634284973,
"rewards/format_reward": 0.916666679084301,
"step": 345
},
{
"completion_length": 1467.8750381469727,
"epoch": 0.3954285714285714,
"grad_norm": 0.4097067713737488,
"kl": 0.028484344482421875,
"learning_rate": 3.359691059183761e-07,
"loss": 0.0011,
"reward": 0.8963276408612728,
"reward_std": 0.6313250102102757,
"rewards/cosine_scaled_reward": -0.031002862378954887,
"rewards/format_reward": 0.9583333432674408,
"step": 346
},
{
"completion_length": 1563.0417022705078,
"epoch": 0.3965714285714286,
"grad_norm": 0.39142462611198425,
"kl": 0.02931976318359375,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0012,
"reward": 0.4781823381781578,
"reward_std": 0.5552436150610447,
"rewards/cosine_scaled_reward": -0.2088255239650607,
"rewards/format_reward": 0.8958333507180214,
"step": 347
},
{
"completion_length": 1574.6042251586914,
"epoch": 0.3977142857142857,
"grad_norm": 0.4851846694946289,
"kl": 0.08718490600585938,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0035,
"reward": 0.7802252694964409,
"reward_std": 0.684166319668293,
"rewards/cosine_scaled_reward": -0.0369707178324461,
"rewards/format_reward": 0.8541666697710752,
"step": 348
},
{
"completion_length": 1409.9583892822266,
"epoch": 0.39885714285714285,
"grad_norm": 0.5291475057601929,
"kl": 0.079132080078125,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0032,
"reward": 0.9304023738950491,
"reward_std": 0.6878850013017654,
"rewards/cosine_scaled_reward": 0.03811782307457179,
"rewards/format_reward": 0.8541666753590107,
"step": 349
},
{
"completion_length": 1006.1458587646484,
"epoch": 0.4,
"grad_norm": 0.5923649072647095,
"kl": 0.02787017822265625,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0011,
"reward": 0.6561646163463593,
"reward_std": 0.7408818565309048,
"rewards/cosine_scaled_reward": -0.14066770486533642,
"rewards/format_reward": 0.9375000149011612,
"step": 350
},
{
"completion_length": 1188.145866394043,
"epoch": 0.40114285714285713,
"grad_norm": 0.7286760210990906,
"kl": 0.0440216064453125,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0018,
"reward": 0.8188043851405382,
"reward_std": 0.5987816452980042,
"rewards/cosine_scaled_reward": -0.048931147903203964,
"rewards/format_reward": 0.9166666716337204,
"step": 351
},
{
"completion_length": 1407.7292022705078,
"epoch": 0.4022857142857143,
"grad_norm": 0.4982418119907379,
"kl": 0.07025146484375,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0028,
"reward": 0.7963543608784676,
"reward_std": 0.7775063142180443,
"rewards/cosine_scaled_reward": -0.02890616189688444,
"rewards/format_reward": 0.8541666716337204,
"step": 352
},
{
"completion_length": 1118.7083549499512,
"epoch": 0.4034285714285714,
"grad_norm": 0.35043564438819885,
"kl": 0.023746490478515625,
"learning_rate": 3.168878457820915e-07,
"loss": 0.001,
"reward": 1.168786108493805,
"reward_std": 0.5713744387030602,
"rewards/cosine_scaled_reward": 0.08439303282648325,
"rewards/format_reward": 1.0,
"step": 353
},
{
"completion_length": 1018.5000228881836,
"epoch": 0.4045714285714286,
"grad_norm": 0.33321675658226013,
"kl": 0.014415740966796875,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0006,
"reward": 1.1353367045521736,
"reward_std": 0.5776836480945349,
"rewards/cosine_scaled_reward": 0.06766833364963531,
"rewards/format_reward": 1.0,
"step": 354
},
{
"completion_length": 943.4375228881836,
"epoch": 0.4057142857142857,
"grad_norm": 0.4005463123321533,
"kl": 0.013919830322265625,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0006,
"reward": 1.04657375626266,
"reward_std": 0.7807271406054497,
"rewards/cosine_scaled_reward": 0.044120170176029205,
"rewards/format_reward": 0.9583333432674408,
"step": 355
},
{
"completion_length": 1453.2500534057617,
"epoch": 0.40685714285714286,
"grad_norm": 5.388972759246826,
"kl": 0.15525436401367188,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.0062,
"reward": 0.8825728315860033,
"reward_std": 0.692312303930521,
"rewards/cosine_scaled_reward": 0.003786402754485607,
"rewards/format_reward": 0.8750000055879354,
"step": 356
},
{
"completion_length": 1924.916732788086,
"epoch": 0.408,
"grad_norm": 2.25702166557312,
"kl": 0.13177490234375,
"learning_rate": 3.062313053727671e-07,
"loss": 0.0053,
"reward": 0.580949871102348,
"reward_std": 0.7843025289475918,
"rewards/cosine_scaled_reward": -0.12619174644351006,
"rewards/format_reward": 0.8333333469927311,
"step": 357
},
{
"completion_length": 1465.8958740234375,
"epoch": 0.40914285714285714,
"grad_norm": 0.3147200644016266,
"kl": 0.048160552978515625,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.0019,
"reward": 1.2739417422562838,
"reward_std": 0.7031947523355484,
"rewards/cosine_scaled_reward": 0.16822083480656147,
"rewards/format_reward": 0.9375000074505806,
"step": 358
},
{
"completion_length": 967.7291946411133,
"epoch": 0.4102857142857143,
"grad_norm": 0.4690834879875183,
"kl": 0.01666259765625,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0007,
"reward": 0.8219893537461758,
"reward_std": 0.4733723718672991,
"rewards/cosine_scaled_reward": -0.07858868315815926,
"rewards/format_reward": 0.9791666716337204,
"step": 359
},
{
"completion_length": 1542.4792098999023,
"epoch": 0.4114285714285714,
"grad_norm": 0.809581995010376,
"kl": 0.09832000732421875,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0039,
"reward": 0.7635386185720563,
"reward_std": 0.8262498266994953,
"rewards/cosine_scaled_reward": -0.03489737829659134,
"rewards/format_reward": 0.8333333488553762,
"step": 360
},
{
"completion_length": 1216.2500228881836,
"epoch": 0.4125714285714286,
"grad_norm": 0.6465409398078918,
"kl": 0.032878875732421875,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0013,
"reward": 0.7554627768695354,
"reward_std": 0.66816546022892,
"rewards/cosine_scaled_reward": -0.09101861796807498,
"rewards/format_reward": 0.9375000149011612,
"step": 361
},
{
"completion_length": 1013.7083587646484,
"epoch": 0.4137142857142857,
"grad_norm": 0.6028101444244385,
"kl": 0.0688018798828125,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0028,
"reward": 0.8830434214323759,
"reward_std": 0.4408119870349765,
"rewards/cosine_scaled_reward": -0.016811609268188477,
"rewards/format_reward": 0.9166666679084301,
"step": 362
},
{
"completion_length": 1001.5208587646484,
"epoch": 0.41485714285714287,
"grad_norm": 0.4196496605873108,
"kl": 0.02587890625,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.001,
"reward": 1.3822015225887299,
"reward_std": 0.5926420330069959,
"rewards/cosine_scaled_reward": 0.22235074604395777,
"rewards/format_reward": 0.9375,
"step": 363
},
{
"completion_length": 1178.6042022705078,
"epoch": 0.416,
"grad_norm": 0.4955011308193207,
"kl": 0.03900909423828125,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0016,
"reward": 0.4972789268940687,
"reward_std": 0.4248249903321266,
"rewards/cosine_scaled_reward": -0.2096938779577613,
"rewards/format_reward": 0.916666679084301,
"step": 364
},
{
"completion_length": 1883.5833740234375,
"epoch": 0.41714285714285715,
"grad_norm": 0.9444948434829712,
"kl": 0.13416671752929688,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0054,
"reward": 0.682358652818948,
"reward_std": 0.7634237520396709,
"rewards/cosine_scaled_reward": -0.033820681273937225,
"rewards/format_reward": 0.750000013038516,
"step": 365
},
{
"completion_length": 1313.6875457763672,
"epoch": 0.41828571428571426,
"grad_norm": 0.5134904980659485,
"kl": 0.040302276611328125,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0016,
"reward": 0.9970972370356321,
"reward_std": 0.6578602809458971,
"rewards/cosine_scaled_reward": 0.05063193337991834,
"rewards/format_reward": 0.8958333358168602,
"step": 366
},
{
"completion_length": 1459.270896911621,
"epoch": 0.41942857142857143,
"grad_norm": 0.6990901827812195,
"kl": 0.06932830810546875,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0028,
"reward": 0.752178119495511,
"reward_std": 0.6665377467870712,
"rewards/cosine_scaled_reward": -0.061410948634147644,
"rewards/format_reward": 0.8750000149011612,
"step": 367
},
{
"completion_length": 1939.5625915527344,
"epoch": 0.4205714285714286,
"grad_norm": 0.9561004638671875,
"kl": 0.117095947265625,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0047,
"reward": 0.6907948858570307,
"reward_std": 0.7415912002325058,
"rewards/cosine_scaled_reward": -0.05043588951230049,
"rewards/format_reward": 0.7916666753590107,
"step": 368
},
{
"completion_length": 1556.6042289733887,
"epoch": 0.4217142857142857,
"grad_norm": 1.0411858558654785,
"kl": 0.10003662109375,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.004,
"reward": 0.6202979227527976,
"reward_std": 0.921792209148407,
"rewards/cosine_scaled_reward": -0.09610107401385903,
"rewards/format_reward": 0.8125000223517418,
"step": 369
},
{
"completion_length": 1518.3542022705078,
"epoch": 0.4228571428571429,
"grad_norm": 0.8758165240287781,
"kl": 0.12267684936523438,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0049,
"reward": 0.565900880843401,
"reward_std": 0.40963491424918175,
"rewards/cosine_scaled_reward": -0.09204956982284784,
"rewards/format_reward": 0.7500000167638063,
"step": 370
},
{
"completion_length": 785.8125152587891,
"epoch": 0.424,
"grad_norm": 0.8005909323692322,
"kl": 0.043704986572265625,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0017,
"reward": 1.155183486174792,
"reward_std": 0.4847665010020137,
"rewards/cosine_scaled_reward": 0.10884173773229122,
"rewards/format_reward": 0.9375000074505806,
"step": 371
},
{
"completion_length": 1529.9375610351562,
"epoch": 0.42514285714285716,
"grad_norm": 0.9557836055755615,
"kl": 0.06856536865234375,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.0027,
"reward": 1.173981536179781,
"reward_std": 0.5648104697465897,
"rewards/cosine_scaled_reward": 0.1599073875695467,
"rewards/format_reward": 0.8541666753590107,
"step": 372
},
{
"completion_length": 908.5625228881836,
"epoch": 0.42628571428571427,
"grad_norm": 0.6306917667388916,
"kl": 0.0537872314453125,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0022,
"reward": 0.7425720803439617,
"reward_std": 0.5991219412535429,
"rewards/cosine_scaled_reward": -0.10788064636290073,
"rewards/format_reward": 0.9583333432674408,
"step": 373
},
{
"completion_length": 1134.7708702087402,
"epoch": 0.42742857142857144,
"grad_norm": 0.3005812168121338,
"kl": 0.034290313720703125,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0014,
"reward": 0.9842969626188278,
"reward_std": 0.48715431056916714,
"rewards/cosine_scaled_reward": 0.0025651296600699425,
"rewards/format_reward": 0.9791666716337204,
"step": 374
},
{
"completion_length": 1738.6875457763672,
"epoch": 0.42857142857142855,
"grad_norm": 1.1354238986968994,
"kl": 0.20126724243164062,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0081,
"reward": 1.1611301894299686,
"reward_std": 0.66292554885149,
"rewards/cosine_scaled_reward": 0.1951484135352075,
"rewards/format_reward": 0.770833345130086,
"step": 375
},
{
"completion_length": 1445.0416946411133,
"epoch": 0.4297142857142857,
"grad_norm": 0.8109616041183472,
"kl": 0.1262969970703125,
"learning_rate": 2.583460445215911e-07,
"loss": 0.0051,
"reward": 0.7195739368908107,
"reward_std": 0.644221305847168,
"rewards/cosine_scaled_reward": -0.07771303225308657,
"rewards/format_reward": 0.8750000037252903,
"step": 376
},
{
"completion_length": 1666.6042175292969,
"epoch": 0.4308571428571429,
"grad_norm": 0.8677497506141663,
"kl": 0.14620208740234375,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0059,
"reward": 0.5706351515837014,
"reward_std": 0.5533648394048214,
"rewards/cosine_scaled_reward": -0.14176576025784016,
"rewards/format_reward": 0.854166679084301,
"step": 377
},
{
"completion_length": 1371.50004196167,
"epoch": 0.432,
"grad_norm": 0.544824481010437,
"kl": 0.06582260131835938,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.0026,
"reward": 1.024425177834928,
"reward_std": 0.6592462100088596,
"rewards/cosine_scaled_reward": 0.04346257133875042,
"rewards/format_reward": 0.9375000074505806,
"step": 378
},
{
"completion_length": 1651.2708740234375,
"epoch": 0.43314285714285716,
"grad_norm": 0.9048022031784058,
"kl": 0.14910507202148438,
"learning_rate": 2.512332043064913e-07,
"loss": 0.006,
"reward": 0.6398802241310477,
"reward_std": 0.6580366250127554,
"rewards/cosine_scaled_reward": -0.11755990888923407,
"rewards/format_reward": 0.8750000149011612,
"step": 379
},
{
"completion_length": 1450.5209121704102,
"epoch": 0.4342857142857143,
"grad_norm": 0.9721732139587402,
"kl": 0.1341705322265625,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0054,
"reward": 0.9563853230793029,
"reward_std": 0.5542362295091152,
"rewards/cosine_scaled_reward": 0.05110928136855364,
"rewards/format_reward": 0.8541666772216558,
"step": 380
},
{
"completion_length": 1318.2500305175781,
"epoch": 0.43542857142857144,
"grad_norm": 0.7490306496620178,
"kl": 0.09369659423828125,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0037,
"reward": 0.5720363007858396,
"reward_std": 0.4669734900817275,
"rewards/cosine_scaled_reward": -0.1618985361419618,
"rewards/format_reward": 0.8958333507180214,
"step": 381
},
{
"completion_length": 1104.6458587646484,
"epoch": 0.43657142857142855,
"grad_norm": 0.47110554575920105,
"kl": 0.06568145751953125,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.0026,
"reward": 0.7494204398244619,
"reward_std": 0.6730066798627377,
"rewards/cosine_scaled_reward": -0.0732064712792635,
"rewards/format_reward": 0.8958333395421505,
"step": 382
},
{
"completion_length": 1225.208366394043,
"epoch": 0.4377142857142857,
"grad_norm": 0.8109216690063477,
"kl": 0.13178634643554688,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0053,
"reward": 1.1812428515404463,
"reward_std": 0.8560024872422218,
"rewards/cosine_scaled_reward": 0.12187140854075551,
"rewards/format_reward": 0.9375000149011612,
"step": 383
},
{
"completion_length": 1087.770866394043,
"epoch": 0.43885714285714283,
"grad_norm": 0.5764302015304565,
"kl": 0.05832672119140625,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0023,
"reward": 1.539357453584671,
"reward_std": 0.942340662702918,
"rewards/cosine_scaled_reward": 0.28009538841433823,
"rewards/format_reward": 0.9791666716337204,
"step": 384
},
{
"completion_length": 1719.9375457763672,
"epoch": 0.44,
"grad_norm": 1.024683952331543,
"kl": 0.16792678833007812,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0067,
"reward": 0.597355630248785,
"reward_std": 0.8098498787730932,
"rewards/cosine_scaled_reward": -0.12840551760746166,
"rewards/format_reward": 0.854166679084301,
"step": 385
},
{
"completion_length": 1323.0625228881836,
"epoch": 0.44114285714285717,
"grad_norm": 0.8986222147941589,
"kl": 0.13629150390625,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0054,
"reward": 0.970568162156269,
"reward_std": 0.6673476994037628,
"rewards/cosine_scaled_reward": 0.026950686238706112,
"rewards/format_reward": 0.9166666716337204,
"step": 386
},
{
"completion_length": 1617.6458740234375,
"epoch": 0.4422857142857143,
"grad_norm": 1.9691290855407715,
"kl": 0.1884002685546875,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.0075,
"reward": 0.8362433174625039,
"reward_std": 0.7687919363379478,
"rewards/cosine_scaled_reward": 0.022288329899311066,
"rewards/format_reward": 0.7916666865348816,
"step": 387
},
{
"completion_length": 1484.0417098999023,
"epoch": 0.44342857142857145,
"grad_norm": 0.9472360014915466,
"kl": 0.1433868408203125,
"learning_rate": 2.306931685585657e-07,
"loss": 0.0057,
"reward": 0.9805102795362473,
"reward_std": 0.7974189594388008,
"rewards/cosine_scaled_reward": 0.03192179277539253,
"rewards/format_reward": 0.9166666716337204,
"step": 388
},
{
"completion_length": 1409.9583740234375,
"epoch": 0.44457142857142856,
"grad_norm": 0.6685540080070496,
"kl": 0.14338302612304688,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0057,
"reward": 0.8139473758637905,
"reward_std": 0.602177394554019,
"rewards/cosine_scaled_reward": -0.07219298463314772,
"rewards/format_reward": 0.9583333432674408,
"step": 389
},
{
"completion_length": 1431.5416831970215,
"epoch": 0.44571428571428573,
"grad_norm": 0.8498513698577881,
"kl": 0.12875747680664062,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0051,
"reward": 0.8707941449247301,
"reward_std": 0.7655657678842545,
"rewards/cosine_scaled_reward": 0.018730382435023785,
"rewards/format_reward": 0.8333333395421505,
"step": 390
},
{
"completion_length": 1266.5417098999023,
"epoch": 0.44685714285714284,
"grad_norm": 1.7778056859970093,
"kl": 0.24418258666992188,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.0098,
"reward": 1.0020010322332382,
"reward_std": 0.7098470069468021,
"rewards/cosine_scaled_reward": 0.04266716237179935,
"rewards/format_reward": 0.916666679084301,
"step": 391
},
{
"completion_length": 1210.6250381469727,
"epoch": 0.448,
"grad_norm": 0.9239005446434021,
"kl": 0.12252044677734375,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0049,
"reward": 0.6644403500249609,
"reward_std": 0.6213442989974283,
"rewards/cosine_scaled_reward": -0.12611316796392202,
"rewards/format_reward": 0.9166666716337204,
"step": 392
},
{
"completion_length": 1505.7500457763672,
"epoch": 0.4491428571428571,
"grad_norm": 1.1656075716018677,
"kl": 0.14298248291015625,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.0057,
"reward": 0.6966553591191769,
"reward_std": 0.8095338940620422,
"rewards/cosine_scaled_reward": -0.06833898182958364,
"rewards/format_reward": 0.8333333507180214,
"step": 393
},
{
"completion_length": 1672.270881652832,
"epoch": 0.4502857142857143,
"grad_norm": 1.8255363702774048,
"kl": 0.25844573974609375,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0104,
"reward": 0.39489989215508103,
"reward_std": 0.6209751814603806,
"rewards/cosine_scaled_reward": -0.17755005788058043,
"rewards/format_reward": 0.7500000149011612,
"step": 394
},
{
"completion_length": 1502.5625381469727,
"epoch": 0.4514285714285714,
"grad_norm": 1.6484956741333008,
"kl": 0.34644317626953125,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0138,
"reward": 0.8687735805287957,
"reward_std": 0.6579391695559025,
"rewards/cosine_scaled_reward": 0.038553440012037754,
"rewards/format_reward": 0.7916666772216558,
"step": 395
},
{
"completion_length": 1279.6250457763672,
"epoch": 0.45257142857142857,
"grad_norm": 0.7100761532783508,
"kl": 0.11377334594726562,
"learning_rate": 2.134908592756607e-07,
"loss": 0.0046,
"reward": 0.9277213364839554,
"reward_std": 0.6887175664305687,
"rewards/cosine_scaled_reward": 0.005527290515601635,
"rewards/format_reward": 0.9166666865348816,
"step": 396
},
{
"completion_length": 1162.8958587646484,
"epoch": 0.45371428571428574,
"grad_norm": 1.176890254020691,
"kl": 0.1617584228515625,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.0065,
"reward": 0.6928375033894554,
"reward_std": 0.7507635410875082,
"rewards/cosine_scaled_reward": -0.049414592678658664,
"rewards/format_reward": 0.791666679084301,
"step": 397
},
{
"completion_length": 1423.0000534057617,
"epoch": 0.45485714285714285,
"grad_norm": 1.1243730783462524,
"kl": 0.36114501953125,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0144,
"reward": 0.6900735814124346,
"reward_std": 0.6959933899343014,
"rewards/cosine_scaled_reward": -0.08204657444730401,
"rewards/format_reward": 0.8541666753590107,
"step": 398
},
{
"completion_length": 1099.6875305175781,
"epoch": 0.456,
"grad_norm": 0.6219501495361328,
"kl": 0.048126220703125,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0019,
"reward": 0.9560099802911282,
"reward_std": 0.6817612200975418,
"rewards/cosine_scaled_reward": -0.0011616908013820648,
"rewards/format_reward": 0.9583333432674408,
"step": 399
},
{
"completion_length": 1025.3541793823242,
"epoch": 0.45714285714285713,
"grad_norm": 1.3574082851409912,
"kl": 0.11576461791992188,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0046,
"reward": 1.4993134140968323,
"reward_std": 0.6285623479634523,
"rewards/cosine_scaled_reward": 0.2913233733997913,
"rewards/format_reward": 0.9166666679084301,
"step": 400
},
{
"completion_length": 1738.6459045410156,
"epoch": 0.4582857142857143,
"grad_norm": 2.3035199642181396,
"kl": 0.446929931640625,
"learning_rate": 2.032690407508949e-07,
"loss": 0.0179,
"reward": 0.588189116679132,
"reward_std": 0.7383468300104141,
"rewards/cosine_scaled_reward": -0.10173879377543926,
"rewards/format_reward": 0.7916666828095913,
"step": 401
},
{
"completion_length": 1228.0000305175781,
"epoch": 0.4594285714285714,
"grad_norm": 2.5062694549560547,
"kl": 0.3095245361328125,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.0124,
"reward": 0.8380769728682935,
"reward_std": 0.5672764517366886,
"rewards/cosine_scaled_reward": -0.03929485194385052,
"rewards/format_reward": 0.916666679084301,
"step": 402
},
{
"completion_length": 1121.166690826416,
"epoch": 0.4605714285714286,
"grad_norm": 1.9036198854446411,
"kl": 0.18898773193359375,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.0076,
"reward": 0.9163239896297455,
"reward_std": 0.4906519539654255,
"rewards/cosine_scaled_reward": -0.00017135590314865112,
"rewards/format_reward": 0.916666679084301,
"step": 403
},
{
"completion_length": 1184.1666831970215,
"epoch": 0.4617142857142857,
"grad_norm": 1.1318503618240356,
"kl": 0.2597007751464844,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.0104,
"reward": 0.7345974240452051,
"reward_std": 0.3557329196482897,
"rewards/cosine_scaled_reward": -0.09103463962674141,
"rewards/format_reward": 0.9166666716337204,
"step": 404
},
{
"completion_length": 1083.6041984558105,
"epoch": 0.46285714285714286,
"grad_norm": 1.148695707321167,
"kl": 0.1588592529296875,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0064,
"reward": 1.164491715535405,
"reward_std": 0.7675561746582389,
"rewards/cosine_scaled_reward": 0.11349584814161062,
"rewards/format_reward": 0.9375000074505806,
"step": 405
},
{
"completion_length": 1347.4792022705078,
"epoch": 0.464,
"grad_norm": 1.5972338914871216,
"kl": 0.30425262451171875,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0122,
"reward": 1.0750425793230534,
"reward_std": 0.8302016435191035,
"rewards/cosine_scaled_reward": 0.07918795384466648,
"rewards/format_reward": 0.9166666716337204,
"step": 406
},
{
"completion_length": 1360.416732788086,
"epoch": 0.46514285714285714,
"grad_norm": 2.243136405944824,
"kl": 0.40606689453125,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0162,
"reward": 0.8054503360763192,
"reward_std": 0.43949691019952297,
"rewards/cosine_scaled_reward": -0.03477485757321119,
"rewards/format_reward": 0.8750000223517418,
"step": 407
},
{
"completion_length": 1446.395851135254,
"epoch": 0.4662857142857143,
"grad_norm": 2.1331191062927246,
"kl": 0.3451957702636719,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0138,
"reward": 0.933131798170507,
"reward_std": 0.7964614983648062,
"rewards/cosine_scaled_reward": 0.03948255442082882,
"rewards/format_reward": 0.854166679084301,
"step": 408
},
{
"completion_length": 1716.7083740234375,
"epoch": 0.4674285714285714,
"grad_norm": 1.9532040357589722,
"kl": 0.445892333984375,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.0178,
"reward": 0.6957469061017036,
"reward_std": 0.6721529066562653,
"rewards/cosine_scaled_reward": -0.08962656743824482,
"rewards/format_reward": 0.8750000037252903,
"step": 409
},
{
"completion_length": 1500.3750343322754,
"epoch": 0.4685714285714286,
"grad_norm": 1.127901315689087,
"kl": 0.49494171142578125,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0198,
"reward": 0.7732762107625604,
"reward_std": 0.8983776066452265,
"rewards/cosine_scaled_reward": -0.0300285741686821,
"rewards/format_reward": 0.8333333414047956,
"step": 410
},
{
"completion_length": 2009.000057220459,
"epoch": 0.4697142857142857,
"grad_norm": 2.1997931003570557,
"kl": 0.7542343139648438,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0302,
"reward": 0.2233330551534891,
"reward_std": 0.5823363810777664,
"rewards/cosine_scaled_reward": -0.24250016640871763,
"rewards/format_reward": 0.7083333414047956,
"step": 411
},
{
"completion_length": 1173.583366394043,
"epoch": 0.47085714285714286,
"grad_norm": 8.122756004333496,
"kl": 0.3453559875488281,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0138,
"reward": 1.0573125034570694,
"reward_std": 0.6750743202865124,
"rewards/cosine_scaled_reward": 0.0390729159116745,
"rewards/format_reward": 0.9791666716337204,
"step": 412
},
{
"completion_length": 1252.1667022705078,
"epoch": 0.472,
"grad_norm": 107.0315933227539,
"kl": 3.7126235961914062,
"learning_rate": 1.804828558898332e-07,
"loss": 0.1485,
"reward": 1.0909956084797159,
"reward_std": 0.6099100448191166,
"rewards/cosine_scaled_reward": 0.07674776995554566,
"rewards/format_reward": 0.9375,
"step": 413
},
{
"completion_length": 1720.8333740234375,
"epoch": 0.47314285714285714,
"grad_norm": 1.0315011739730835,
"kl": 0.302764892578125,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0121,
"reward": 0.6044954676181078,
"reward_std": 0.5638450682163239,
"rewards/cosine_scaled_reward": -0.13525228761136532,
"rewards/format_reward": 0.8750000074505806,
"step": 414
},
{
"completion_length": 1495.4375457763672,
"epoch": 0.4742857142857143,
"grad_norm": 1.6685491800308228,
"kl": 0.31848907470703125,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0128,
"reward": 0.8473802506923676,
"reward_std": 0.8426450192928314,
"rewards/cosine_scaled_reward": -0.03464323375374079,
"rewards/format_reward": 0.916666679084301,
"step": 415
},
{
"completion_length": 1131.7291870117188,
"epoch": 0.4754285714285714,
"grad_norm": 0.9927107095718384,
"kl": 0.10882568359375,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0043,
"reward": 1.3657895848155022,
"reward_std": 0.7458459995687008,
"rewards/cosine_scaled_reward": 0.18289476446807384,
"rewards/format_reward": 1.0,
"step": 416
},
{
"completion_length": 1397.6041946411133,
"epoch": 0.4765714285714286,
"grad_norm": 1.2048169374465942,
"kl": 0.18719482421875,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0075,
"reward": 0.7268392583355308,
"reward_std": 0.5356767289340496,
"rewards/cosine_scaled_reward": -0.09491373039782047,
"rewards/format_reward": 0.916666679084301,
"step": 417
},
{
"completion_length": 1385.5417022705078,
"epoch": 0.4777142857142857,
"grad_norm": 2.015775203704834,
"kl": 0.3655548095703125,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.0146,
"reward": 1.1464654430747032,
"reward_std": 0.8867814308032393,
"rewards/cosine_scaled_reward": 0.13573270197957754,
"rewards/format_reward": 0.8750000074505806,
"step": 418
},
{
"completion_length": 1347.2708892822266,
"epoch": 0.47885714285714287,
"grad_norm": 1.3547542095184326,
"kl": 0.3565025329589844,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0143,
"reward": 1.226757968775928,
"reward_std": 0.6351639665663242,
"rewards/cosine_scaled_reward": 0.18629562947899103,
"rewards/format_reward": 0.8541666753590107,
"step": 419
},
{
"completion_length": 994.354190826416,
"epoch": 0.48,
"grad_norm": 3.8445253372192383,
"kl": 0.1680145263671875,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0067,
"reward": 0.7386327926069498,
"reward_std": 0.5644268691539764,
"rewards/cosine_scaled_reward": -0.08901696337852627,
"rewards/format_reward": 0.9166666865348816,
"step": 420
},
{
"completion_length": 1412.833381652832,
"epoch": 0.48114285714285715,
"grad_norm": 1.4594179391860962,
"kl": 0.4204254150390625,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0168,
"reward": 0.3792721191421151,
"reward_std": 0.7107202988117933,
"rewards/cosine_scaled_reward": -0.22703061811625957,
"rewards/format_reward": 0.8333333432674408,
"step": 421
},
{
"completion_length": 1531.9583892822266,
"epoch": 0.48228571428571426,
"grad_norm": 1.9154317378997803,
"kl": 0.529266357421875,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0212,
"reward": 0.7085785139352083,
"reward_std": 0.9169136509299278,
"rewards/cosine_scaled_reward": -0.051960770739242435,
"rewards/format_reward": 0.8125000260770321,
"step": 422
},
{
"completion_length": 1723.458366394043,
"epoch": 0.48342857142857143,
"grad_norm": 2.2272391319274902,
"kl": 0.667022705078125,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0267,
"reward": 0.39935479685664177,
"reward_std": 0.7014467380940914,
"rewards/cosine_scaled_reward": -0.1440726025030017,
"rewards/format_reward": 0.6875000186264515,
"step": 423
},
{
"completion_length": 1572.1250457763672,
"epoch": 0.4845714285714286,
"grad_norm": 7.855838298797607,
"kl": 0.6064910888671875,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0242,
"reward": 0.4374563042074442,
"reward_std": 0.614509429782629,
"rewards/cosine_scaled_reward": -0.21877187490463257,
"rewards/format_reward": 0.8750000149011612,
"step": 424
},
{
"completion_length": 1239.1667022705078,
"epoch": 0.4857142857142857,
"grad_norm": 0.9158412218093872,
"kl": 0.13330078125,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0053,
"reward": 1.5343235712498426,
"reward_std": 0.6198269496671855,
"rewards/cosine_scaled_reward": 0.298411812633276,
"rewards/format_reward": 0.9375000149011612,
"step": 425
},
{
"completion_length": 1250.6666946411133,
"epoch": 0.4868571428571429,
"grad_norm": 1.3155421018600464,
"kl": 0.32469940185546875,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.013,
"reward": 0.8722767792642117,
"reward_std": 0.6620613150298595,
"rewards/cosine_scaled_reward": -0.0013616248033940792,
"rewards/format_reward": 0.8750000149011612,
"step": 426
},
{
"completion_length": 1530.8333892822266,
"epoch": 0.488,
"grad_norm": 1.546027421951294,
"kl": 0.19110107421875,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0077,
"reward": 0.8462803540751338,
"reward_std": 0.8315089084208012,
"rewards/cosine_scaled_reward": -0.02477649785578251,
"rewards/format_reward": 0.8958333432674408,
"step": 427
},
{
"completion_length": 1438.7917098999023,
"epoch": 0.48914285714285716,
"grad_norm": 1.3113563060760498,
"kl": 0.44814300537109375,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.018,
"reward": 0.8119768351316452,
"reward_std": 0.8527404256165028,
"rewards/cosine_scaled_reward": -0.041928261518478394,
"rewards/format_reward": 0.8958333358168602,
"step": 428
},
{
"completion_length": 1113.0625381469727,
"epoch": 0.49028571428571427,
"grad_norm": 8.569297790527344,
"kl": 0.5591049194335938,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0224,
"reward": 0.8809511847794056,
"reward_std": 0.7602541670203209,
"rewards/cosine_scaled_reward": -0.0074410997331142426,
"rewards/format_reward": 0.8958333432674408,
"step": 429
},
{
"completion_length": 1167.4792213439941,
"epoch": 0.49142857142857144,
"grad_norm": 1.5055890083312988,
"kl": 0.28092193603515625,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0112,
"reward": 1.0165742300450802,
"reward_std": 0.7622268162667751,
"rewards/cosine_scaled_reward": 0.039537094067782164,
"rewards/format_reward": 0.9375000074505806,
"step": 430
},
{
"completion_length": 1308.2917022705078,
"epoch": 0.49257142857142855,
"grad_norm": 1.3274399042129517,
"kl": 0.4292926788330078,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.0172,
"reward": 0.5780141645809636,
"reward_std": 0.6294812802225351,
"rewards/cosine_scaled_reward": -0.13807626301422715,
"rewards/format_reward": 0.8541666865348816,
"step": 431
},
{
"completion_length": 1741.9375381469727,
"epoch": 0.4937142857142857,
"grad_norm": 2.412412166595459,
"kl": 0.7193603515625,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0288,
"reward": 0.5436302255839109,
"reward_std": 0.5460153482854366,
"rewards/cosine_scaled_reward": -0.092768220230937,
"rewards/format_reward": 0.729166679084301,
"step": 432
},
{
"completion_length": 1384.5625305175781,
"epoch": 0.4948571428571429,
"grad_norm": 0.9878861904144287,
"kl": 0.18514251708984375,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0074,
"reward": 0.8780094515532255,
"reward_std": 0.44247524719685316,
"rewards/cosine_scaled_reward": -0.00891195610165596,
"rewards/format_reward": 0.8958333432674408,
"step": 433
},
{
"completion_length": 1242.6875305175781,
"epoch": 0.496,
"grad_norm": 1.256845474243164,
"kl": 0.20133209228515625,
"learning_rate": 1.469297078922642e-07,
"loss": 0.008,
"reward": 0.516814824193716,
"reward_std": 0.48154355585575104,
"rewards/cosine_scaled_reward": -0.21034259721636772,
"rewards/format_reward": 0.9375000149011612,
"step": 434
},
{
"completion_length": 925.0000228881836,
"epoch": 0.49714285714285716,
"grad_norm": 2.2059106826782227,
"kl": 0.3611564636230469,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0145,
"reward": 0.4291147319599986,
"reward_std": 0.3988812826573849,
"rewards/cosine_scaled_reward": -0.2541926633566618,
"rewards/format_reward": 0.9375000074505806,
"step": 435
},
{
"completion_length": 1295.1875381469727,
"epoch": 0.4982857142857143,
"grad_norm": 1.634010672569275,
"kl": 0.45708465576171875,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.0183,
"reward": 1.1145056758541614,
"reward_std": 0.6509739924222231,
"rewards/cosine_scaled_reward": 0.1510028038173914,
"rewards/format_reward": 0.8125000037252903,
"step": 436
},
{
"completion_length": 1247.437515258789,
"epoch": 0.49942857142857144,
"grad_norm": 0.8101176619529724,
"kl": 0.2787322998046875,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0112,
"reward": 0.8161080442368984,
"reward_std": 0.5009766146540642,
"rewards/cosine_scaled_reward": -0.060695987194776535,
"rewards/format_reward": 0.9375000074505806,
"step": 437
},
{
"completion_length": 1704.9583740234375,
"epoch": 0.5005714285714286,
"grad_norm": 2.317962169647217,
"kl": 0.5898284912109375,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.0236,
"reward": 0.8052712418138981,
"reward_std": 0.7321378365159035,
"rewards/cosine_scaled_reward": -0.04528105817735195,
"rewards/format_reward": 0.8958333432674408,
"step": 438
},
{
"completion_length": 1185.708366394043,
"epoch": 0.5017142857142857,
"grad_norm": 2.9166922569274902,
"kl": 0.2282562255859375,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.0091,
"reward": 0.7440173244103789,
"reward_std": 0.822649909183383,
"rewards/cosine_scaled_reward": -0.05507467477582395,
"rewards/format_reward": 0.8541666865348816,
"step": 439
},
{
"completion_length": 1375.2917022705078,
"epoch": 0.5028571428571429,
"grad_norm": 1.8194392919540405,
"kl": 0.3829345703125,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0153,
"reward": 0.44826740212738514,
"reward_std": 0.573487613350153,
"rewards/cosine_scaled_reward": -0.21336631546728313,
"rewards/format_reward": 0.8750000149011612,
"step": 440
},
{
"completion_length": 1298.4166793823242,
"epoch": 0.504,
"grad_norm": 1.6758705377578735,
"kl": 0.220458984375,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0088,
"reward": 0.9705498463008553,
"reward_std": 0.4990251734852791,
"rewards/cosine_scaled_reward": 0.02694154903292656,
"rewards/format_reward": 0.916666679084301,
"step": 441
},
{
"completion_length": 1117.8750228881836,
"epoch": 0.5051428571428571,
"grad_norm": 1.8469866514205933,
"kl": 0.151519775390625,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0061,
"reward": 0.9607795821502805,
"reward_std": 0.8334435187280178,
"rewards/cosine_scaled_reward": 0.022056451067328453,
"rewards/format_reward": 0.9166666865348816,
"step": 442
},
{
"completion_length": 1698.020866394043,
"epoch": 0.5062857142857143,
"grad_norm": 1.6352139711380005,
"kl": 0.6801528930664062,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0272,
"reward": 0.5505319386720657,
"reward_std": 0.514994228258729,
"rewards/cosine_scaled_reward": -0.11015073349699378,
"rewards/format_reward": 0.7708333469927311,
"step": 443
},
{
"completion_length": 1436.145881652832,
"epoch": 0.5074285714285715,
"grad_norm": 1.2693414688110352,
"kl": 0.5347976684570312,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0214,
"reward": 0.5698400810360909,
"reward_std": 0.8037937507033348,
"rewards/cosine_scaled_reward": -0.14216329460032284,
"rewards/format_reward": 0.8541666828095913,
"step": 444
},
{
"completion_length": 1327.7916946411133,
"epoch": 0.5085714285714286,
"grad_norm": 1.9523547887802124,
"kl": 0.2998390197753906,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.012,
"reward": 0.6349876510794275,
"reward_std": 0.6037604101002216,
"rewards/cosine_scaled_reward": -0.13042284222319722,
"rewards/format_reward": 0.8958333507180214,
"step": 445
},
{
"completion_length": 1364.520881652832,
"epoch": 0.5097142857142857,
"grad_norm": 1.9374631643295288,
"kl": 0.27101898193359375,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0109,
"reward": 0.7476022280752659,
"reward_std": 0.4615443851798773,
"rewards/cosine_scaled_reward": -0.06369888596236706,
"rewards/format_reward": 0.8750000074505806,
"step": 446
},
{
"completion_length": 1311.3542098999023,
"epoch": 0.5108571428571429,
"grad_norm": 1.7980448007583618,
"kl": 0.39284515380859375,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0157,
"reward": 0.7899397425353527,
"reward_std": 0.5542605184018612,
"rewards/cosine_scaled_reward": -0.06336347293108702,
"rewards/format_reward": 0.916666679084301,
"step": 447
},
{
"completion_length": 1308.7708587646484,
"epoch": 0.512,
"grad_norm": 3.7581255435943604,
"kl": 0.5269927978515625,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.0211,
"reward": 0.7952553946524858,
"reward_std": 0.5684080645442009,
"rewards/cosine_scaled_reward": -0.039872318506240845,
"rewards/format_reward": 0.8750000149011612,
"step": 448
},
{
"completion_length": 1423.8750076293945,
"epoch": 0.5131428571428571,
"grad_norm": 2.130291700363159,
"kl": 0.5636138916015625,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0226,
"reward": 0.5615764576941729,
"reward_std": 0.6229820605367422,
"rewards/cosine_scaled_reward": -0.13587846513837576,
"rewards/format_reward": 0.8333333414047956,
"step": 449
},
{
"completion_length": 1162.8750610351562,
"epoch": 0.5142857142857142,
"grad_norm": 1.1003129482269287,
"kl": 0.37982177734375,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0152,
"reward": 0.6501909224316478,
"reward_std": 0.46628283709287643,
"rewards/cosine_scaled_reward": -0.12282122112810612,
"rewards/format_reward": 0.8958333358168602,
"step": 450
},
{
"completion_length": 1113.7500228881836,
"epoch": 0.5154285714285715,
"grad_norm": 1.0367095470428467,
"kl": 0.35689544677734375,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0142,
"reward": 0.7563045807182789,
"reward_std": 0.48215247690677643,
"rewards/cosine_scaled_reward": -0.10101438034325838,
"rewards/format_reward": 0.9583333432674408,
"step": 451
},
{
"completion_length": 1709.3333740234375,
"epoch": 0.5165714285714286,
"grad_norm": 1.715613842010498,
"kl": 0.7028999328613281,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0281,
"reward": 0.6673658769577742,
"reward_std": 0.7115018367767334,
"rewards/cosine_scaled_reward": -0.06215040449751541,
"rewards/format_reward": 0.7916666716337204,
"step": 452
},
{
"completion_length": 1630.354263305664,
"epoch": 0.5177142857142857,
"grad_norm": 1.8834996223449707,
"kl": 0.749481201171875,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.03,
"reward": 0.5653771113138646,
"reward_std": 0.7798537351191044,
"rewards/cosine_scaled_reward": -0.08189477771520615,
"rewards/format_reward": 0.7291666716337204,
"step": 453
},
{
"completion_length": 1188.0625457763672,
"epoch": 0.5188571428571429,
"grad_norm": 1.7752057313919067,
"kl": 0.3389015197753906,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0136,
"reward": 0.5443481418769807,
"reward_std": 0.537891261279583,
"rewards/cosine_scaled_reward": -0.16532594605814666,
"rewards/format_reward": 0.8750000074505806,
"step": 454
},
{
"completion_length": 1413.8541870117188,
"epoch": 0.52,
"grad_norm": 2.040367603302002,
"kl": 0.5235443115234375,
"learning_rate": 1.220245676671809e-07,
"loss": 0.021,
"reward": 0.45831110049039125,
"reward_std": 0.5242529977113008,
"rewards/cosine_scaled_reward": -0.2083444595336914,
"rewards/format_reward": 0.8750000074505806,
"step": 455
},
{
"completion_length": 1710.875057220459,
"epoch": 0.5211428571428571,
"grad_norm": 2.0157127380371094,
"kl": 0.6534805297851562,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0261,
"reward": 0.7503673816099763,
"reward_std": 0.8190008737146854,
"rewards/cosine_scaled_reward": -0.07273299805819988,
"rewards/format_reward": 0.895833358168602,
"step": 456
},
{
"completion_length": 1534.6250228881836,
"epoch": 0.5222857142857142,
"grad_norm": 2.5126774311065674,
"kl": 0.7040786743164062,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0281,
"reward": 0.7014026306569576,
"reward_std": 0.6737043410539627,
"rewards/cosine_scaled_reward": -0.045132044702768326,
"rewards/format_reward": 0.791666692122817,
"step": 457
},
{
"completion_length": 1155.5000381469727,
"epoch": 0.5234285714285715,
"grad_norm": 1.7595032453536987,
"kl": 0.328765869140625,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.0132,
"reward": 0.5051953579531983,
"reward_std": 0.5484701581299305,
"rewards/cosine_scaled_reward": -0.19531898852437735,
"rewards/format_reward": 0.8958333358168602,
"step": 458
},
{
"completion_length": 1033.8750457763672,
"epoch": 0.5245714285714286,
"grad_norm": 1.7773371934890747,
"kl": 0.11635589599609375,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0047,
"reward": 0.8539259545505047,
"reward_std": 0.7942902967333794,
"rewards/cosine_scaled_reward": -0.03137038787826896,
"rewards/format_reward": 0.9166666716337204,
"step": 459
},
{
"completion_length": 1757.2083549499512,
"epoch": 0.5257142857142857,
"grad_norm": 2.649414539337158,
"kl": 0.716064453125,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0286,
"reward": 0.7232505343854427,
"reward_std": 0.7622859515249729,
"rewards/cosine_scaled_reward": -0.03420809283852577,
"rewards/format_reward": 0.791666679084301,
"step": 460
},
{
"completion_length": 1713.5625305175781,
"epoch": 0.5268571428571428,
"grad_norm": 2.396477699279785,
"kl": 0.8402938842773438,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0336,
"reward": 0.6444814316928387,
"reward_std": 0.48170122131705284,
"rewards/cosine_scaled_reward": -0.05275928042829037,
"rewards/format_reward": 0.7500000074505806,
"step": 461
},
{
"completion_length": 1440.1458740234375,
"epoch": 0.528,
"grad_norm": 1.3917316198349,
"kl": 0.4964752197265625,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0199,
"reward": 0.4077282305806875,
"reward_std": 0.6107278726994991,
"rewards/cosine_scaled_reward": -0.21280256658792496,
"rewards/format_reward": 0.8333333507180214,
"step": 462
},
{
"completion_length": 1800.666748046875,
"epoch": 0.5291428571428571,
"grad_norm": 2.1574342250823975,
"kl": 0.4113349914550781,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.0164,
"reward": 0.7166567414533347,
"reward_std": 0.9189217295497656,
"rewards/cosine_scaled_reward": -0.02708831927157007,
"rewards/format_reward": 0.7708333432674408,
"step": 463
},
{
"completion_length": 986.208366394043,
"epoch": 0.5302857142857142,
"grad_norm": 1.0322054624557495,
"kl": 0.27919769287109375,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.0112,
"reward": 1.186152020469308,
"reward_std": 0.33294933661818504,
"rewards/cosine_scaled_reward": 0.103492621332407,
"rewards/format_reward": 0.9791666716337204,
"step": 464
},
{
"completion_length": 1653.8750762939453,
"epoch": 0.5314285714285715,
"grad_norm": 1.5671355724334717,
"kl": 0.7370033264160156,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0295,
"reward": 0.7522133439779282,
"reward_std": 0.8285946622490883,
"rewards/cosine_scaled_reward": -0.06139334570616484,
"rewards/format_reward": 0.8750000149011612,
"step": 465
},
{
"completion_length": 1548.2708892822266,
"epoch": 0.5325714285714286,
"grad_norm": 2.2751619815826416,
"kl": 0.5921897888183594,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0237,
"reward": 0.9642188400030136,
"reward_std": 0.776534590870142,
"rewards/cosine_scaled_reward": 0.023776067420840263,
"rewards/format_reward": 0.9166666865348816,
"step": 466
},
{
"completion_length": 1739.5625610351562,
"epoch": 0.5337142857142857,
"grad_norm": 1.564422845840454,
"kl": 0.6026535034179688,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0241,
"reward": 0.34736107289791107,
"reward_std": 0.6108816284686327,
"rewards/cosine_scaled_reward": -0.22215282171964645,
"rewards/format_reward": 0.7916666846722364,
"step": 467
},
{
"completion_length": 1602.4166984558105,
"epoch": 0.5348571428571428,
"grad_norm": 2.7220699787139893,
"kl": 0.8326644897460938,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0333,
"reward": 0.6653020847588778,
"reward_std": 0.9241620562970638,
"rewards/cosine_scaled_reward": -0.03193228365853429,
"rewards/format_reward": 0.729166679084301,
"step": 468
},
{
"completion_length": 1568.229206085205,
"epoch": 0.536,
"grad_norm": 3.782336950302124,
"kl": 0.8657150268554688,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.0346,
"reward": 0.6384231373667717,
"reward_std": 0.5810450203716755,
"rewards/cosine_scaled_reward": -0.055788458324968815,
"rewards/format_reward": 0.7500000111758709,
"step": 469
},
{
"completion_length": 2107.854248046875,
"epoch": 0.5371428571428571,
"grad_norm": 2.1429548263549805,
"kl": 1.3319091796875,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0533,
"reward": 0.3757573022157885,
"reward_std": 0.6251861937344074,
"rewards/cosine_scaled_reward": -0.1662880228832364,
"rewards/format_reward": 0.708333345130086,
"step": 470
},
{
"completion_length": 1868.4375381469727,
"epoch": 0.5382857142857143,
"grad_norm": 2.449512243270874,
"kl": 1.098663330078125,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0439,
"reward": 0.7246365919709206,
"reward_std": 0.8856858089566231,
"rewards/cosine_scaled_reward": -0.0022650789469480515,
"rewards/format_reward": 0.729166679084301,
"step": 471
},
{
"completion_length": 1710.812515258789,
"epoch": 0.5394285714285715,
"grad_norm": 1.708894968032837,
"kl": 0.6365203857421875,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0254,
"reward": 0.5696725435554981,
"reward_std": 0.7982751838862896,
"rewards/cosine_scaled_reward": -0.11099707769608358,
"rewards/format_reward": 0.7916666828095913,
"step": 472
},
{
"completion_length": 1787.8125381469727,
"epoch": 0.5405714285714286,
"grad_norm": 2.3143343925476074,
"kl": 0.6291122436523438,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0252,
"reward": 0.5162299545481801,
"reward_std": 0.7424188554286957,
"rewards/cosine_scaled_reward": -0.1377183818258345,
"rewards/format_reward": 0.7916666679084301,
"step": 473
},
{
"completion_length": 1814.4167213439941,
"epoch": 0.5417142857142857,
"grad_norm": 2.590423822402954,
"kl": 0.9863052368164062,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0394,
"reward": 1.190386475995183,
"reward_std": 0.7150390669703484,
"rewards/cosine_scaled_reward": 0.20977654308080673,
"rewards/format_reward": 0.7708333432674408,
"step": 474
},
{
"completion_length": 1373.0417098999023,
"epoch": 0.5428571428571428,
"grad_norm": 0.9969754815101624,
"kl": 0.34333038330078125,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0137,
"reward": 0.6799025642685592,
"reward_std": 0.7420720048248768,
"rewards/cosine_scaled_reward": -0.1183820916339755,
"rewards/format_reward": 0.9166666679084301,
"step": 475
},
{
"completion_length": 1700.4792175292969,
"epoch": 0.544,
"grad_norm": 1.2897610664367676,
"kl": 0.5266647338867188,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0211,
"reward": 0.872494999319315,
"reward_std": 0.8203131221234798,
"rewards/cosine_scaled_reward": -0.001252486981684342,
"rewards/format_reward": 0.8750000074505806,
"step": 476
},
{
"completion_length": 1266.5000534057617,
"epoch": 0.5451428571428572,
"grad_norm": 1.5318264961242676,
"kl": 0.49987030029296875,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.02,
"reward": 1.0228368118405342,
"reward_std": 1.1473434120416641,
"rewards/cosine_scaled_reward": 0.09475172049133107,
"rewards/format_reward": 0.8333333432674408,
"step": 477
},
{
"completion_length": 1734.5417098999023,
"epoch": 0.5462857142857143,
"grad_norm": 1.782239317893982,
"kl": 0.7471237182617188,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0299,
"reward": 0.6132550844922662,
"reward_std": 0.5443017482757568,
"rewards/cosine_scaled_reward": -0.0892058244207874,
"rewards/format_reward": 0.7916666716337204,
"step": 478
},
{
"completion_length": 1831.3959045410156,
"epoch": 0.5474285714285714,
"grad_norm": 2.2087714672088623,
"kl": 0.98297119140625,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0393,
"reward": 0.7758033736608922,
"reward_std": 0.8804528266191483,
"rewards/cosine_scaled_reward": -0.028764987364411354,
"rewards/format_reward": 0.8333333507180214,
"step": 479
},
{
"completion_length": 1431.06254196167,
"epoch": 0.5485714285714286,
"grad_norm": 1.5771631002426147,
"kl": 0.810302734375,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0325,
"reward": 0.7137194634415209,
"reward_std": 0.9008150100708008,
"rewards/cosine_scaled_reward": -0.03897361445706338,
"rewards/format_reward": 0.7916666865348816,
"step": 480
},
{
"completion_length": 1595.729232788086,
"epoch": 0.5497142857142857,
"grad_norm": 1.9218149185180664,
"kl": 0.5342254638671875,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0214,
"reward": 0.4544407380744815,
"reward_std": 0.6266775503754616,
"rewards/cosine_scaled_reward": -0.19986298400908709,
"rewards/format_reward": 0.8541666716337204,
"step": 481
},
{
"completion_length": 1623.625015258789,
"epoch": 0.5508571428571428,
"grad_norm": 1.3348276615142822,
"kl": 0.8123359680175781,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0325,
"reward": 0.7579959752038121,
"reward_std": 0.8174431677907705,
"rewards/cosine_scaled_reward": -0.016835355083458126,
"rewards/format_reward": 0.7916666753590107,
"step": 482
},
{
"completion_length": 1678.958396911621,
"epoch": 0.552,
"grad_norm": 1.4332455396652222,
"kl": 0.5729446411132812,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0229,
"reward": 0.38456033915281296,
"reward_std": 0.7770704664289951,
"rewards/cosine_scaled_reward": -0.19313650764524937,
"rewards/format_reward": 0.7708333414047956,
"step": 483
},
{
"completion_length": 1232.3750457763672,
"epoch": 0.5531428571428572,
"grad_norm": 1.994523286819458,
"kl": 0.33402252197265625,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.0134,
"reward": 0.7572063701227307,
"reward_std": 0.6004737913608551,
"rewards/cosine_scaled_reward": -0.027646828442811966,
"rewards/format_reward": 0.8125000149011612,
"step": 484
},
{
"completion_length": 1286.6875457763672,
"epoch": 0.5542857142857143,
"grad_norm": 3.306351900100708,
"kl": 0.5061264038085938,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0203,
"reward": 0.6613044207915664,
"reward_std": 0.6431709341704845,
"rewards/cosine_scaled_reward": -0.08601447567343712,
"rewards/format_reward": 0.833333358168602,
"step": 485
},
{
"completion_length": 727.2708473205566,
"epoch": 0.5554285714285714,
"grad_norm": 1.239498496055603,
"kl": 0.13980865478515625,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0056,
"reward": 0.857852790504694,
"reward_std": 0.4787697456777096,
"rewards/cosine_scaled_reward": -0.03982360428199172,
"rewards/format_reward": 0.9375000149011612,
"step": 486
},
{
"completion_length": 1126.6042137145996,
"epoch": 0.5565714285714286,
"grad_norm": 3.5425333976745605,
"kl": 0.3272552490234375,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.0131,
"reward": 1.3010212499648333,
"reward_std": 0.5992186656221747,
"rewards/cosine_scaled_reward": 0.19217727705836296,
"rewards/format_reward": 0.916666679084301,
"step": 487
},
{
"completion_length": 1328.5000228881836,
"epoch": 0.5577142857142857,
"grad_norm": 4.340129852294922,
"kl": 0.7839202880859375,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.0314,
"reward": 0.4630023818463087,
"reward_std": 0.5009241513907909,
"rewards/cosine_scaled_reward": -0.1539154672063887,
"rewards/format_reward": 0.7708333395421505,
"step": 488
},
{
"completion_length": 1523.2916946411133,
"epoch": 0.5588571428571428,
"grad_norm": 2.2074553966522217,
"kl": 0.5087432861328125,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0204,
"reward": 0.4663016349077225,
"reward_std": 0.5114475898444653,
"rewards/cosine_scaled_reward": -0.20434920396655798,
"rewards/format_reward": 0.8750000111758709,
"step": 489
},
{
"completion_length": 1465.1458930969238,
"epoch": 0.56,
"grad_norm": 1.6939737796783447,
"kl": 0.6732406616210938,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0269,
"reward": 0.7020680662244558,
"reward_std": 0.6934347413480282,
"rewards/cosine_scaled_reward": -0.06563264457508922,
"rewards/format_reward": 0.8333333432674408,
"step": 490
},
{
"completion_length": 1713.2500381469727,
"epoch": 0.5611428571428572,
"grad_norm": 1.7025130987167358,
"kl": 0.6433792114257812,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0257,
"reward": 0.8764896970242262,
"reward_std": 0.9352022185921669,
"rewards/cosine_scaled_reward": 0.02157815732061863,
"rewards/format_reward": 0.8333333432674408,
"step": 491
},
{
"completion_length": 1347.4791946411133,
"epoch": 0.5622857142857143,
"grad_norm": 2.898775815963745,
"kl": 0.4826812744140625,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.0193,
"reward": 0.711312476079911,
"reward_std": 0.5596998631954193,
"rewards/cosine_scaled_reward": -0.040177132934331894,
"rewards/format_reward": 0.7916666846722364,
"step": 492
},
{
"completion_length": 1267.5416946411133,
"epoch": 0.5634285714285714,
"grad_norm": 1.8216196298599243,
"kl": 0.35247039794921875,
"learning_rate": 1.005372381963547e-07,
"loss": 0.0141,
"reward": 0.9863412100821733,
"reward_std": 0.7628876008093357,
"rewards/cosine_scaled_reward": 0.03483725246042013,
"rewards/format_reward": 0.9166666716337204,
"step": 493
},
{
"completion_length": 1147.5416946411133,
"epoch": 0.5645714285714286,
"grad_norm": 1.519400715827942,
"kl": 0.529205322265625,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0211,
"reward": 1.0907946676015854,
"reward_std": 0.8422017935663462,
"rewards/cosine_scaled_reward": 0.08706398599315435,
"rewards/format_reward": 0.916666679084301,
"step": 494
},
{
"completion_length": 1579.0417098999023,
"epoch": 0.5657142857142857,
"grad_norm": 2.1514978408813477,
"kl": 0.5824813842773438,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0233,
"reward": 0.7147043733857572,
"reward_std": 0.9280455261468887,
"rewards/cosine_scaled_reward": -0.028064499609172344,
"rewards/format_reward": 0.770833333954215,
"step": 495
},
{
"completion_length": 1475.2916717529297,
"epoch": 0.5668571428571428,
"grad_norm": 2.9298958778381348,
"kl": 0.7912979125976562,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0317,
"reward": 0.7844320052536204,
"reward_std": 0.7428931668400764,
"rewards/cosine_scaled_reward": 0.006799314171075821,
"rewards/format_reward": 0.7708333358168602,
"step": 496
},
{
"completion_length": 1518.9792022705078,
"epoch": 0.568,
"grad_norm": 1.90547513961792,
"kl": 0.940093994140625,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0376,
"reward": 1.0220248233526945,
"reward_std": 0.6859058924019337,
"rewards/cosine_scaled_reward": 0.1360124358907342,
"rewards/format_reward": 0.7500000111758709,
"step": 497
},
{
"completion_length": 1380.9375228881836,
"epoch": 0.5691428571428572,
"grad_norm": 1.8308056592941284,
"kl": 0.4208221435546875,
"learning_rate": 1.000438641958131e-07,
"loss": 0.0168,
"reward": 0.4634520011022687,
"reward_std": 0.6670360751450062,
"rewards/cosine_scaled_reward": -0.12244068086147308,
"rewards/format_reward": 0.7083333395421505,
"step": 498
},
{
"completion_length": 1510.9792022705078,
"epoch": 0.5702857142857143,
"grad_norm": 2.027233123779297,
"kl": 0.42650604248046875,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.017,
"reward": 0.6949386205524206,
"reward_std": 0.7280914960429072,
"rewards/cosine_scaled_reward": -0.09003069484606385,
"rewards/format_reward": 0.8750000111758709,
"step": 499
},
{
"completion_length": 1275.8125534057617,
"epoch": 0.5714285714285714,
"grad_norm": 1.3618121147155762,
"kl": 0.5210762023925781,
"learning_rate": 1e-07,
"loss": 0.0208,
"reward": 0.670590927824378,
"reward_std": 0.7502868715673685,
"rewards/cosine_scaled_reward": -0.08137122076004744,
"rewards/format_reward": 0.8333333507180214,
"step": 500
},
{
"epoch": 0.5714285714285714,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.004884798622434991,
"train_runtime": 57171.6753,
"train_samples_per_second": 0.42,
"train_steps_per_second": 0.009
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}