Open-RS / trainer_state.json
kangdawei's picture
Model save
9688d3c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5714285714285714,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 2571.2083587646484,
"epoch": 0.001142857142857143,
"grad_norm": 0.19510559737682343,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.0,
"reward": 0.4897647276520729,
"reward_std": 0.8290339335799217,
"rewards/cosine_scaled_reward": -0.015534311532974243,
"rewards/format_reward": 0.5208333488553762,
"step": 1
},
{
"completion_length": 2804.395881652832,
"epoch": 0.002285714285714286,
"grad_norm": 0.18415163457393646,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": 0.0,
"reward": 0.27539755403995514,
"reward_std": 0.42092563211917877,
"rewards/cosine_scaled_reward": -0.04980122856795788,
"rewards/format_reward": 0.37500000558793545,
"step": 2
},
{
"completion_length": 3361.1458435058594,
"epoch": 0.0034285714285714284,
"grad_norm": 0.16567473113536835,
"kl": 4.006922245025635e-05,
"learning_rate": 6e-08,
"loss": 0.0,
"reward": -0.23245980869978666,
"reward_std": 0.5928730629384518,
"rewards/cosine_scaled_reward": -0.17872990405885503,
"rewards/format_reward": 0.1250000037252903,
"step": 3
},
{
"completion_length": 2153.729202270508,
"epoch": 0.004571428571428572,
"grad_norm": 0.25097447633743286,
"kl": 4.071742296218872e-05,
"learning_rate": 8e-08,
"loss": 0.0,
"reward": 0.3372869056183845,
"reward_std": 0.6931154392659664,
"rewards/cosine_scaled_reward": -0.14385656313970685,
"rewards/format_reward": 0.6250000037252903,
"step": 4
},
{
"completion_length": 3400.375030517578,
"epoch": 0.005714285714285714,
"grad_norm": 0.224583700299263,
"kl": 4.339590668678284e-05,
"learning_rate": 1e-07,
"loss": 0.0,
"reward": -0.30452448688447475,
"reward_std": 0.5627153031527996,
"rewards/cosine_scaled_reward": -0.24601224437355995,
"rewards/format_reward": 0.18750000558793545,
"step": 5
},
{
"completion_length": 3246.2291717529297,
"epoch": 0.006857142857142857,
"grad_norm": 0.17905186116695404,
"kl": 4.048645496368408e-05,
"learning_rate": 1.2e-07,
"loss": 0.0,
"reward": -0.04279324598610401,
"reward_std": 0.5993511825799942,
"rewards/cosine_scaled_reward": -0.18806329306971747,
"rewards/format_reward": 0.3333333469927311,
"step": 6
},
{
"completion_length": 2938.0209197998047,
"epoch": 0.008,
"grad_norm": 0.19757460057735443,
"kl": 2.4452805519104004e-05,
"learning_rate": 1.4e-07,
"loss": 0.0,
"reward": 0.2690338185057044,
"reward_std": 0.7015659939497709,
"rewards/cosine_scaled_reward": -0.11548309866338968,
"rewards/format_reward": 0.5000000111758709,
"step": 7
},
{
"completion_length": 2751.770866394043,
"epoch": 0.009142857142857144,
"grad_norm": 0.1658889204263687,
"kl": 1.728162169456482e-05,
"learning_rate": 1.6e-07,
"loss": 0.0,
"reward": 0.6224091164767742,
"reward_std": 0.7972168251872063,
"rewards/cosine_scaled_reward": 0.09245455078780651,
"rewards/format_reward": 0.43750001303851604,
"step": 8
},
{
"completion_length": 3031.7084045410156,
"epoch": 0.010285714285714285,
"grad_norm": 0.2120562642812729,
"kl": 3.610551357269287e-05,
"learning_rate": 1.8e-07,
"loss": 0.0,
"reward": 0.18008227972313762,
"reward_std": 0.8001800198107958,
"rewards/cosine_scaled_reward": -0.10787552827969193,
"rewards/format_reward": 0.39583334140479565,
"step": 9
},
{
"completion_length": 2753.208366394043,
"epoch": 0.011428571428571429,
"grad_norm": 0.21144917607307434,
"kl": 3.1989067792892456e-05,
"learning_rate": 2e-07,
"loss": 0.0,
"reward": 0.08000842481851578,
"reward_std": 0.7203316055238247,
"rewards/cosine_scaled_reward": -0.14749579317867756,
"rewards/format_reward": 0.3750000111758709,
"step": 10
},
{
"completion_length": 3293.2708435058594,
"epoch": 0.012571428571428572,
"grad_norm": 0.1650972217321396,
"kl": 3.8489699363708496e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0,
"reward": -0.41089826077222824,
"reward_std": 0.4038824327290058,
"rewards/cosine_scaled_reward": -0.2783657982945442,
"rewards/format_reward": 0.1458333395421505,
"step": 11
},
{
"completion_length": 2683.5000915527344,
"epoch": 0.013714285714285714,
"grad_norm": 0.19341549277305603,
"kl": 4.114210605621338e-05,
"learning_rate": 2.4e-07,
"loss": 0.0,
"reward": 0.43104756623506546,
"reward_std": 0.5792003609240055,
"rewards/cosine_scaled_reward": -0.11780955828726292,
"rewards/format_reward": 0.6666666828095913,
"step": 12
},
{
"completion_length": 2886.541702270508,
"epoch": 0.014857142857142857,
"grad_norm": 0.22984430193901062,
"kl": 3.719329833984375e-05,
"learning_rate": 2.6e-07,
"loss": 0.0,
"reward": 0.23748547211289406,
"reward_std": 0.7873078212141991,
"rewards/cosine_scaled_reward": -0.06875726429279894,
"rewards/format_reward": 0.3750000149011612,
"step": 13
},
{
"completion_length": 2877.312530517578,
"epoch": 0.016,
"grad_norm": 0.2078430950641632,
"kl": 2.997368574142456e-05,
"learning_rate": 2.8e-07,
"loss": 0.0,
"reward": 0.14176954282447696,
"reward_std": 0.7100466191768646,
"rewards/cosine_scaled_reward": -0.11661523208022118,
"rewards/format_reward": 0.3750000074505806,
"step": 14
},
{
"completion_length": 2681.437511444092,
"epoch": 0.017142857142857144,
"grad_norm": 0.20677024126052856,
"kl": 2.692500129342079e-05,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.5315095772966743,
"reward_std": 0.6707141287624836,
"rewards/cosine_scaled_reward": 0.04700478911399841,
"rewards/format_reward": 0.43750000558793545,
"step": 15
},
{
"completion_length": 3505.2708435058594,
"epoch": 0.018285714285714287,
"grad_norm": 0.19309639930725098,
"kl": 3.673136234283447e-05,
"learning_rate": 3.2e-07,
"loss": 0.0,
"reward": -0.2943936800584197,
"reward_std": 0.6762014180421829,
"rewards/cosine_scaled_reward": -0.19928016886115074,
"rewards/format_reward": 0.10416666977107525,
"step": 16
},
{
"completion_length": 2520.7916984558105,
"epoch": 0.019428571428571427,
"grad_norm": 0.2713626027107239,
"kl": 3.7260353565216064e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0,
"reward": 0.29364213347435,
"reward_std": 0.7413498945534229,
"rewards/cosine_scaled_reward": -0.09276227621012367,
"rewards/format_reward": 0.4791666828095913,
"step": 17
},
{
"completion_length": 2989.500030517578,
"epoch": 0.02057142857142857,
"grad_norm": 0.14791515469551086,
"kl": 2.292729914188385e-05,
"learning_rate": 3.6e-07,
"loss": 0.0,
"reward": 0.22529255971312523,
"reward_std": 0.5643694922327995,
"rewards/cosine_scaled_reward": -0.08527039736509323,
"rewards/format_reward": 0.39583333395421505,
"step": 18
},
{
"completion_length": 2874.250030517578,
"epoch": 0.021714285714285714,
"grad_norm": 0.18149082362651825,
"kl": 2.5779008865356445e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0,
"reward": 0.6877359300851822,
"reward_std": 0.7566238529980183,
"rewards/cosine_scaled_reward": 0.1355345994234085,
"rewards/format_reward": 0.416666679084301,
"step": 19
},
{
"completion_length": 2473.520854949951,
"epoch": 0.022857142857142857,
"grad_norm": 0.20354878902435303,
"kl": 1.685088500380516e-05,
"learning_rate": 4e-07,
"loss": 0.0,
"reward": 0.5272735962644219,
"reward_std": 0.7334154956042767,
"rewards/cosine_scaled_reward": -0.059279868844896555,
"rewards/format_reward": 0.6458333414047956,
"step": 20
},
{
"completion_length": 2523.520835876465,
"epoch": 0.024,
"grad_norm": 0.26596662402153015,
"kl": 4.191696643829346e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0,
"reward": 0.5646272003650665,
"reward_std": 0.5900244954973459,
"rewards/cosine_scaled_reward": 0.04273026343435049,
"rewards/format_reward": 0.47916666977107525,
"step": 21
},
{
"completion_length": 2035.3542251586914,
"epoch": 0.025142857142857144,
"grad_norm": 0.2960587441921234,
"kl": 3.464333713054657e-05,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0,
"reward": 0.637257520109415,
"reward_std": 0.8020393662154675,
"rewards/cosine_scaled_reward": -0.03553791396552697,
"rewards/format_reward": 0.7083333488553762,
"step": 22
},
{
"completion_length": 2664.3125610351562,
"epoch": 0.026285714285714287,
"grad_norm": 0.21004951000213623,
"kl": 3.3404678106307983e-05,
"learning_rate": 4.6e-07,
"loss": 0.0,
"reward": 0.22290698066353798,
"reward_std": 0.7860734751448035,
"rewards/cosine_scaled_reward": -0.0968798566609621,
"rewards/format_reward": 0.4166666753590107,
"step": 23
},
{
"completion_length": 2712.6875610351562,
"epoch": 0.027428571428571427,
"grad_norm": 0.2664627432823181,
"kl": 2.100318670272827e-05,
"learning_rate": 4.8e-07,
"loss": 0.0,
"reward": 0.7516965055838227,
"reward_std": 1.1073086112737656,
"rewards/cosine_scaled_reward": 0.07376489660236984,
"rewards/format_reward": 0.6041666865348816,
"step": 24
},
{
"completion_length": 2708.1458740234375,
"epoch": 0.02857142857142857,
"grad_norm": 0.22358901798725128,
"kl": 3.844499588012695e-05,
"learning_rate": 5e-07,
"loss": 0.0,
"reward": 0.1855073875049129,
"reward_std": 0.6800910420715809,
"rewards/cosine_scaled_reward": -0.08432964608073235,
"rewards/format_reward": 0.3541666753590107,
"step": 25
},
{
"completion_length": 3107.541717529297,
"epoch": 0.029714285714285714,
"grad_norm": 0.1617579460144043,
"kl": 2.820044755935669e-05,
"learning_rate": 5.2e-07,
"loss": 0.0,
"reward": 0.587481252849102,
"reward_std": 0.693414282053709,
"rewards/cosine_scaled_reward": 0.07499059912515804,
"rewards/format_reward": 0.43750001303851604,
"step": 26
},
{
"completion_length": 2998.666702270508,
"epoch": 0.030857142857142857,
"grad_norm": 0.18699905276298523,
"kl": 2.479786053299904e-05,
"learning_rate": 5.4e-07,
"loss": 0.0,
"reward": 0.12037789449095726,
"reward_std": 0.8231884613633156,
"rewards/cosine_scaled_reward": -0.12731105368584394,
"rewards/format_reward": 0.37500000558793545,
"step": 27
},
{
"completion_length": 2880.4375534057617,
"epoch": 0.032,
"grad_norm": 0.1849849373102188,
"kl": 2.5503337383270264e-05,
"learning_rate": 5.6e-07,
"loss": 0.0,
"reward": 0.38755445554852486,
"reward_std": 0.8283951133489609,
"rewards/cosine_scaled_reward": -0.004139451950322837,
"rewards/format_reward": 0.39583334140479565,
"step": 28
},
{
"completion_length": 3422.604217529297,
"epoch": 0.03314285714285714,
"grad_norm": 0.17641501128673553,
"kl": 2.0990148186683655e-05,
"learning_rate": 5.8e-07,
"loss": 0.0,
"reward": -0.2620885446667671,
"reward_std": 0.563602440059185,
"rewards/cosine_scaled_reward": -0.20396093931049109,
"rewards/format_reward": 0.14583333767950535,
"step": 29
},
{
"completion_length": 2927.229217529297,
"epoch": 0.03428571428571429,
"grad_norm": 0.20999222993850708,
"kl": 3.0543655157089233e-05,
"learning_rate": 6e-07,
"loss": 0.0,
"reward": 0.3440352368634194,
"reward_std": 0.9646327681839466,
"rewards/cosine_scaled_reward": -0.0467323949560523,
"rewards/format_reward": 0.43750001676380634,
"step": 30
},
{
"completion_length": 2953.437530517578,
"epoch": 0.03542857142857143,
"grad_norm": 0.18049225211143494,
"kl": 4.493445158004761e-05,
"learning_rate": 6.2e-07,
"loss": 0.0,
"reward": 0.20754884742200375,
"reward_std": 0.8056708425283432,
"rewards/cosine_scaled_reward": -0.08372558187693357,
"rewards/format_reward": 0.37500000931322575,
"step": 31
},
{
"completion_length": 3194.937545776367,
"epoch": 0.036571428571428574,
"grad_norm": 0.15885497629642487,
"kl": 2.6959925889968872e-05,
"learning_rate": 6.4e-07,
"loss": 0.0,
"reward": 0.04864098597317934,
"reward_std": 0.6213513053953648,
"rewards/cosine_scaled_reward": -0.12151284422725439,
"rewards/format_reward": 0.2916666716337204,
"step": 32
},
{
"completion_length": 3312.2291870117188,
"epoch": 0.037714285714285714,
"grad_norm": 0.15803882479667664,
"kl": 4.449114203453064e-05,
"learning_rate": 6.6e-07,
"loss": 0.0,
"reward": 0.20429637841880322,
"reward_std": 0.770494450815022,
"rewards/cosine_scaled_reward": -0.03326847730204463,
"rewards/format_reward": 0.27083334140479565,
"step": 33
},
{
"completion_length": 2483.666702270508,
"epoch": 0.038857142857142854,
"grad_norm": 0.22648921608924866,
"kl": 7.438473403453827e-05,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0,
"reward": 0.7040222510695457,
"reward_std": 0.8860063031315804,
"rewards/cosine_scaled_reward": 0.07076112227514386,
"rewards/format_reward": 0.5625000074505806,
"step": 34
},
{
"completion_length": 2937.750015258789,
"epoch": 0.04,
"grad_norm": 0.20356310904026031,
"kl": 4.690885543823242e-05,
"learning_rate": 7e-07,
"loss": 0.0,
"reward": 0.31086206063628197,
"reward_std": 0.8583472333848476,
"rewards/cosine_scaled_reward": -0.032068971544504166,
"rewards/format_reward": 0.37500000558793545,
"step": 35
},
{
"completion_length": 3399.6458435058594,
"epoch": 0.04114285714285714,
"grad_norm": 0.17643427848815918,
"kl": 4.9777328968048096e-05,
"learning_rate": 7.2e-07,
"loss": 0.0,
"reward": -0.3302987515926361,
"reward_std": 0.4607627494260669,
"rewards/cosine_scaled_reward": -0.22764937952160835,
"rewards/format_reward": 0.1250000037252903,
"step": 36
},
{
"completion_length": 3218.916702270508,
"epoch": 0.04228571428571429,
"grad_norm": 0.16882504522800446,
"kl": 3.8273632526397705e-05,
"learning_rate": 7.4e-07,
"loss": 0.0,
"reward": -0.16163601353764534,
"reward_std": 0.4668765738606453,
"rewards/cosine_scaled_reward": -0.21623468212783337,
"rewards/format_reward": 0.2708333395421505,
"step": 37
},
{
"completion_length": 3279.5416870117188,
"epoch": 0.04342857142857143,
"grad_norm": 0.15908610820770264,
"kl": 4.918128252029419e-05,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0,
"reward": -0.20695911906659603,
"reward_std": 0.5721107684075832,
"rewards/cosine_scaled_reward": -0.176396232098341,
"rewards/format_reward": 0.1458333395421505,
"step": 38
},
{
"completion_length": 2879.375045776367,
"epoch": 0.044571428571428574,
"grad_norm": 0.2147342413663864,
"kl": 7.034093141555786e-05,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0,
"reward": 0.29712690226733685,
"reward_std": 0.6135409390553832,
"rewards/cosine_scaled_reward": -0.04935322143137455,
"rewards/format_reward": 0.3958333432674408,
"step": 39
},
{
"completion_length": 2469.687530517578,
"epoch": 0.045714285714285714,
"grad_norm": 0.19995440542697906,
"kl": 0.00012910738587379456,
"learning_rate": 8e-07,
"loss": 0.0,
"reward": 0.2901697149500251,
"reward_std": 0.5493389200419188,
"rewards/cosine_scaled_reward": -0.13616514671593904,
"rewards/format_reward": 0.5625000074505806,
"step": 40
},
{
"completion_length": 3059.3333740234375,
"epoch": 0.046857142857142854,
"grad_norm": 0.1593063771724701,
"kl": 5.719810724258423e-05,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0,
"reward": -0.08104978175833821,
"reward_std": 0.537400247529149,
"rewards/cosine_scaled_reward": -0.21760822273790836,
"rewards/format_reward": 0.3541666716337204,
"step": 41
},
{
"completion_length": 2785.979190826416,
"epoch": 0.048,
"grad_norm": 0.24719306826591492,
"kl": 5.231797695159912e-05,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0,
"reward": -0.2765159234404564,
"reward_std": 0.3159199729561806,
"rewards/cosine_scaled_reward": -0.29450796730816364,
"rewards/format_reward": 0.3125,
"step": 42
},
{
"completion_length": 3143.4583435058594,
"epoch": 0.04914285714285714,
"grad_norm": 0.17697609961032867,
"kl": 5.420297384262085e-05,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0,
"reward": -0.027741093188524246,
"reward_std": 0.5976252444088459,
"rewards/cosine_scaled_reward": -0.14928722102195024,
"rewards/format_reward": 0.2708333358168602,
"step": 43
},
{
"completion_length": 2617.750045776367,
"epoch": 0.05028571428571429,
"grad_norm": 0.30272603034973145,
"kl": 0.00017141178250312805,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0,
"reward": 0.54763038828969,
"reward_std": 0.6350081358104944,
"rewards/cosine_scaled_reward": -0.007434792350977659,
"rewards/format_reward": 0.5625000055879354,
"step": 44
},
{
"completion_length": 3381.1458740234375,
"epoch": 0.05142857142857143,
"grad_norm": 0.14729151129722595,
"kl": 5.987286567687988e-05,
"learning_rate": 9e-07,
"loss": 0.0,
"reward": 0.1284105316735804,
"reward_std": 0.6408776678144932,
"rewards/cosine_scaled_reward": -0.06079472857527435,
"rewards/format_reward": 0.25,
"step": 45
},
{
"completion_length": 3190.1458740234375,
"epoch": 0.052571428571428575,
"grad_norm": 0.18984851241111755,
"kl": 0.0001230388879776001,
"learning_rate": 9.2e-07,
"loss": 0.0,
"reward": -0.2710073352791369,
"reward_std": 0.37906504422426224,
"rewards/cosine_scaled_reward": -0.22925367206335068,
"rewards/format_reward": 0.18750000186264515,
"step": 46
},
{
"completion_length": 2773.666748046875,
"epoch": 0.053714285714285714,
"grad_norm": 0.22614669799804688,
"kl": 7.295981049537659e-05,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0,
"reward": 0.6267627328634262,
"reward_std": 1.0455659702420235,
"rewards/cosine_scaled_reward": 0.0633813701570034,
"rewards/format_reward": 0.5000000111758709,
"step": 47
},
{
"completion_length": 2853.6250534057617,
"epoch": 0.054857142857142854,
"grad_norm": 0.2049134522676468,
"kl": 0.00029557570815086365,
"learning_rate": 9.6e-07,
"loss": 0.0,
"reward": 0.1033776430413127,
"reward_std": 0.891391895711422,
"rewards/cosine_scaled_reward": -0.11497785244137049,
"rewards/format_reward": 0.33333334140479565,
"step": 48
},
{
"completion_length": 2345.6250610351562,
"epoch": 0.056,
"grad_norm": 0.21897369623184204,
"kl": 0.00014978647232055664,
"learning_rate": 9.8e-07,
"loss": 0.0,
"reward": 0.6087969962973148,
"reward_std": 0.7800232023000717,
"rewards/cosine_scaled_reward": -0.008101530373096466,
"rewards/format_reward": 0.6250000037252903,
"step": 49
},
{
"completion_length": 3047.5000228881836,
"epoch": 0.05714285714285714,
"grad_norm": 0.17249740660190582,
"kl": 0.0002017766237258911,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 0.43377939984202385,
"reward_std": 0.7017503455281258,
"rewards/cosine_scaled_reward": 0.029389701783657074,
"rewards/format_reward": 0.37500000931322575,
"step": 50
},
{
"completion_length": 2343.145881652832,
"epoch": 0.05828571428571429,
"grad_norm": 0.2304115891456604,
"kl": 0.0005070865154266357,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0,
"reward": 0.30895326659083366,
"reward_std": 0.47219153563492,
"rewards/cosine_scaled_reward": -0.10594002925790846,
"rewards/format_reward": 0.5208333395421505,
"step": 51
},
{
"completion_length": 3017.1042098999023,
"epoch": 0.05942857142857143,
"grad_norm": 0.22551533579826355,
"kl": 0.0004254281520843506,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0,
"reward": 0.3134525269269943,
"reward_std": 0.7696055620908737,
"rewards/cosine_scaled_reward": -0.020357078406959772,
"rewards/format_reward": 0.35416667349636555,
"step": 52
},
{
"completion_length": 2933.2084197998047,
"epoch": 0.060571428571428575,
"grad_norm": 0.1710565686225891,
"kl": 0.00019855797290802002,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0,
"reward": 0.4190669923555106,
"reward_std": 0.9724426595494151,
"rewards/cosine_scaled_reward": -0.030049838591367006,
"rewards/format_reward": 0.4791666753590107,
"step": 53
},
{
"completion_length": 2728.2083740234375,
"epoch": 0.061714285714285715,
"grad_norm": 0.1685902327299118,
"kl": 8.179806172847748e-05,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0,
"reward": 0.9252486824989319,
"reward_std": 1.0930374152958393,
"rewards/cosine_scaled_reward": 0.16054099425673485,
"rewards/format_reward": 0.6041666772216558,
"step": 54
},
{
"completion_length": 2987.250030517578,
"epoch": 0.06285714285714286,
"grad_norm": 0.1715623438358307,
"kl": 0.0003556758165359497,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0,
"reward": 0.34154442069120705,
"reward_std": 0.7232516668736935,
"rewards/cosine_scaled_reward": -0.03756110556423664,
"rewards/format_reward": 0.41666667349636555,
"step": 55
},
{
"completion_length": 3029.250045776367,
"epoch": 0.064,
"grad_norm": 0.15501734614372253,
"kl": 0.00012418627738952637,
"learning_rate": 9.996052735444862e-07,
"loss": 0.0,
"reward": 0.25998372526373714,
"reward_std": 0.6867683958262205,
"rewards/cosine_scaled_reward": -0.047091471031308174,
"rewards/format_reward": 0.3541666716337204,
"step": 56
},
{
"completion_length": 3392.9791870117188,
"epoch": 0.06514285714285714,
"grad_norm": 0.1358395218849182,
"kl": 5.584210157394409e-05,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0,
"reward": -0.13618062436580658,
"reward_std": 0.6030133794993162,
"rewards/cosine_scaled_reward": -0.18267363915219903,
"rewards/format_reward": 0.2291666679084301,
"step": 57
},
{
"completion_length": 2349.08341217041,
"epoch": 0.06628571428571428,
"grad_norm": 0.22305431962013245,
"kl": 0.0013600699603557587,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0001,
"reward": 0.5223081162257586,
"reward_std": 0.9040666744112968,
"rewards/cosine_scaled_reward": -0.030512610450387,
"rewards/format_reward": 0.5833333395421505,
"step": 58
},
{
"completion_length": 2854.125030517578,
"epoch": 0.06742857142857143,
"grad_norm": 0.1728586107492447,
"kl": 7.890164852142334e-05,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0,
"reward": 0.1912415586411953,
"reward_std": 0.6860835738480091,
"rewards/cosine_scaled_reward": -0.06062923185527325,
"rewards/format_reward": 0.31250000558793545,
"step": 59
},
{
"completion_length": 3058.0208587646484,
"epoch": 0.06857142857142857,
"grad_norm": 0.15029877424240112,
"kl": 0.0001646280288696289,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0,
"reward": -0.1796177178621292,
"reward_std": 0.5032580140978098,
"rewards/cosine_scaled_reward": -0.2356421989388764,
"rewards/format_reward": 0.29166667349636555,
"step": 60
},
{
"completion_length": 3021.9375915527344,
"epoch": 0.06971428571428571,
"grad_norm": 0.18782874941825867,
"kl": 0.00031859055161476135,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0,
"reward": 0.2405167557299137,
"reward_std": 0.8873464465141296,
"rewards/cosine_scaled_reward": -0.11932496633380651,
"rewards/format_reward": 0.4791666679084301,
"step": 61
},
{
"completion_length": 2797.437515258789,
"epoch": 0.07085714285714285,
"grad_norm": 0.19510804116725922,
"kl": 0.0028433650732040405,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0001,
"reward": 0.6466732956469059,
"reward_std": 1.1665390692651272,
"rewards/cosine_scaled_reward": 0.06291996827349067,
"rewards/format_reward": 0.520833345130086,
"step": 62
},
{
"completion_length": 2213.6458740234375,
"epoch": 0.072,
"grad_norm": 0.21749359369277954,
"kl": 0.0012581497430801392,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0001,
"reward": 0.9055937305092812,
"reward_std": 0.8001900352537632,
"rewards/cosine_scaled_reward": 0.08821351453661919,
"rewards/format_reward": 0.7291666716337204,
"step": 63
},
{
"completion_length": 2778.3333435058594,
"epoch": 0.07314285714285715,
"grad_norm": 0.18261142075061798,
"kl": 0.0003489851951599121,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0,
"reward": 0.38868435472249985,
"reward_std": 0.6305454671382904,
"rewards/cosine_scaled_reward": -0.02440785290673375,
"rewards/format_reward": 0.4375000074505806,
"step": 64
},
{
"completion_length": 2673.687530517578,
"epoch": 0.07428571428571429,
"grad_norm": 0.1992972046136856,
"kl": 0.00033433735370635986,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0,
"reward": 0.3928167396225035,
"reward_std": 0.7381406147032976,
"rewards/cosine_scaled_reward": -0.07442497462034225,
"rewards/format_reward": 0.5416666734963655,
"step": 65
},
{
"completion_length": 2147.3750038146973,
"epoch": 0.07542857142857143,
"grad_norm": 0.22494584321975708,
"kl": 0.0011640042066574097,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0,
"reward": 0.42546659521758556,
"reward_std": 0.5198768675327301,
"rewards/cosine_scaled_reward": -0.03726671263575554,
"rewards/format_reward": 0.5,
"step": 66
},
{
"completion_length": 3342.7708740234375,
"epoch": 0.07657142857142857,
"grad_norm": 0.14088153839111328,
"kl": 0.00046347081661224365,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0,
"reward": -0.42950472608208656,
"reward_std": 0.378866882994771,
"rewards/cosine_scaled_reward": -0.31891903653740883,
"rewards/format_reward": 0.2083333395421505,
"step": 67
},
{
"completion_length": 2170.7083854675293,
"epoch": 0.07771428571428571,
"grad_norm": 0.29666775465011597,
"kl": 0.0014674067497253418,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0001,
"reward": 0.4676838363520801,
"reward_std": 0.7767233625054359,
"rewards/cosine_scaled_reward": -0.03699141927063465,
"rewards/format_reward": 0.5416666679084301,
"step": 68
},
{
"completion_length": 2446.1458740234375,
"epoch": 0.07885714285714286,
"grad_norm": 0.22960010170936584,
"kl": 0.0015052556991577148,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0001,
"reward": 0.21741511672735214,
"reward_std": 0.8107622135430574,
"rewards/cosine_scaled_reward": -0.1412924542091787,
"rewards/format_reward": 0.500000013038516,
"step": 69
},
{
"completion_length": 3091.6041717529297,
"epoch": 0.08,
"grad_norm": 0.19971789419651031,
"kl": 0.0011597275733947754,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0,
"reward": 0.12162317335605621,
"reward_std": 0.5483939237892628,
"rewards/cosine_scaled_reward": -0.11627176497131586,
"rewards/format_reward": 0.35416666977107525,
"step": 70
},
{
"completion_length": 2609.0208587646484,
"epoch": 0.08114285714285714,
"grad_norm": 0.23139537870883942,
"kl": 0.0016418173909187317,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0001,
"reward": 0.2501606810837984,
"reward_std": 0.5798324253410101,
"rewards/cosine_scaled_reward": -0.0832529878243804,
"rewards/format_reward": 0.4166666716337204,
"step": 71
},
{
"completion_length": 3072.9583740234375,
"epoch": 0.08228571428571428,
"grad_norm": 0.22307687997817993,
"kl": 0.0010285675525665283,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0,
"reward": -0.06439175084233284,
"reward_std": 0.7740810364484787,
"rewards/cosine_scaled_reward": -0.16761254286393523,
"rewards/format_reward": 0.2708333395421505,
"step": 72
},
{
"completion_length": 3455.875030517578,
"epoch": 0.08342857142857144,
"grad_norm": 0.150093212723732,
"kl": 0.00021230429410934448,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0,
"reward": -0.021114151924848557,
"reward_std": 0.7219380252063274,
"rewards/cosine_scaled_reward": -0.10430707037448883,
"rewards/format_reward": 0.18750000558793545,
"step": 73
},
{
"completion_length": 3296.250045776367,
"epoch": 0.08457142857142858,
"grad_norm": 0.17310041189193726,
"kl": 0.0007840991020202637,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0,
"reward": -0.0612453930079937,
"reward_std": 0.5310056991875172,
"rewards/cosine_scaled_reward": -0.14520604407880455,
"rewards/format_reward": 0.22916666977107525,
"step": 74
},
{
"completion_length": 3076.0833740234375,
"epoch": 0.08571428571428572,
"grad_norm": 0.14123524725437164,
"kl": 0.0008448450826108456,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0,
"reward": 0.21019641309976578,
"reward_std": 0.4503815546631813,
"rewards/cosine_scaled_reward": -0.05115179833956063,
"rewards/format_reward": 0.31250000186264515,
"step": 75
},
{
"completion_length": 2814.2500228881836,
"epoch": 0.08685714285714285,
"grad_norm": 0.20728114247322083,
"kl": 0.0001634061336517334,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0,
"reward": 0.06560860201716423,
"reward_std": 0.5740169659256935,
"rewards/cosine_scaled_reward": -0.17552903201431036,
"rewards/format_reward": 0.41666666977107525,
"step": 76
},
{
"completion_length": 3305.1041870117188,
"epoch": 0.088,
"grad_norm": 0.14447131752967834,
"kl": 0.0003515356220304966,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0,
"reward": -0.06417501904070377,
"reward_std": 0.5662869866937399,
"rewards/cosine_scaled_reward": -0.15708751417696476,
"rewards/format_reward": 0.25000000186264515,
"step": 77
},
{
"completion_length": 3341.875,
"epoch": 0.08914285714285715,
"grad_norm": 0.14708709716796875,
"kl": 0.00022222846746444702,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0,
"reward": 0.07691416330635548,
"reward_std": 0.7588405385613441,
"rewards/cosine_scaled_reward": -0.09695957787334919,
"rewards/format_reward": 0.27083334140479565,
"step": 78
},
{
"completion_length": 2390.979179382324,
"epoch": 0.09028571428571429,
"grad_norm": 0.2777462303638458,
"kl": 0.0015588998794555664,
"learning_rate": 9.908088623197048e-07,
"loss": 0.0001,
"reward": 0.19228350277990103,
"reward_std": 0.7328854473307729,
"rewards/cosine_scaled_reward": -0.1746915839612484,
"rewards/format_reward": 0.5416666734963655,
"step": 79
},
{
"completion_length": 3182.437530517578,
"epoch": 0.09142857142857143,
"grad_norm": 0.18599167466163635,
"kl": 0.0007250010967254639,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0,
"reward": 0.44475090876221657,
"reward_std": 0.6929970532655716,
"rewards/cosine_scaled_reward": 0.024458803236484528,
"rewards/format_reward": 0.3958333358168602,
"step": 80
},
{
"completion_length": 2998.7500534057617,
"epoch": 0.09257142857142857,
"grad_norm": 0.23842793703079224,
"kl": 0.0027131736278533936,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0001,
"reward": -0.0324825718998909,
"reward_std": 0.6861623711884022,
"rewards/cosine_scaled_reward": -0.16207461850717664,
"rewards/format_reward": 0.29166667349636555,
"step": 81
},
{
"completion_length": 2781.791717529297,
"epoch": 0.09371428571428571,
"grad_norm": 0.19417299330234528,
"kl": 0.0011616647243499756,
"learning_rate": 9.888172094375033e-07,
"loss": 0.0,
"reward": 0.3426995016634464,
"reward_std": 0.756939671933651,
"rewards/cosine_scaled_reward": -0.03698359243571758,
"rewards/format_reward": 0.41666666977107525,
"step": 82
},
{
"completion_length": 2735.4791946411133,
"epoch": 0.09485714285714286,
"grad_norm": 0.23736146092414856,
"kl": 0.001489013433456421,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0001,
"reward": 0.022416603518649936,
"reward_std": 0.33409483451396227,
"rewards/cosine_scaled_reward": -0.14504170510917902,
"rewards/format_reward": 0.3125,
"step": 83
},
{
"completion_length": 3006.7708587646484,
"epoch": 0.096,
"grad_norm": 0.16109317541122437,
"kl": 0.0031517744064331055,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0001,
"reward": 0.2841636799275875,
"reward_std": 0.9676609244197607,
"rewards/cosine_scaled_reward": -0.055834827944636345,
"rewards/format_reward": 0.39583334140479565,
"step": 84
},
{
"completion_length": 2945.0416870117188,
"epoch": 0.09714285714285714,
"grad_norm": 0.14022095501422882,
"kl": 0.00030300021171569824,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0,
"reward": 0.3169367350637913,
"reward_std": 0.8734622374176979,
"rewards/cosine_scaled_reward": -0.08111499063670635,
"rewards/format_reward": 0.47916667722165585,
"step": 85
},
{
"completion_length": 2773.3333740234375,
"epoch": 0.09828571428571428,
"grad_norm": 0.20840014517307281,
"kl": 0.0014702677726745605,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0001,
"reward": 0.3505563363432884,
"reward_std": 0.50963762588799,
"rewards/cosine_scaled_reward": -0.06430518021807075,
"rewards/format_reward": 0.47916666977107525,
"step": 86
},
{
"completion_length": 2771.8333740234375,
"epoch": 0.09942857142857142,
"grad_norm": 0.2091607004404068,
"kl": 0.0014937222003936768,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0001,
"reward": 0.21895797760225832,
"reward_std": 0.8198871687054634,
"rewards/cosine_scaled_reward": -0.11968768760561943,
"rewards/format_reward": 0.45833334513008595,
"step": 87
},
{
"completion_length": 2588.979248046875,
"epoch": 0.10057142857142858,
"grad_norm": 0.20578493177890778,
"kl": 0.003347158432006836,
"learning_rate": 9.8425742251254e-07,
"loss": 0.0001,
"reward": 0.8585769310593605,
"reward_std": 1.0248939394950867,
"rewards/cosine_scaled_reward": 0.11678845560527407,
"rewards/format_reward": 0.6250000093132257,
"step": 88
},
{
"completion_length": 3318.6459045410156,
"epoch": 0.10171428571428572,
"grad_norm": 0.198954775929451,
"kl": 0.0015476346015930176,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0001,
"reward": 0.03354086447507143,
"reward_std": 0.9196067973971367,
"rewards/cosine_scaled_reward": -0.10822956170886755,
"rewards/format_reward": 0.2500000074505806,
"step": 89
},
{
"completion_length": 2533.125045776367,
"epoch": 0.10285714285714286,
"grad_norm": 0.30491241812705994,
"kl": 0.0027846097946166992,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0001,
"reward": 0.04410627228207886,
"reward_std": 0.5515282340347767,
"rewards/cosine_scaled_reward": -0.21753019373863935,
"rewards/format_reward": 0.47916667722165585,
"step": 90
},
{
"completion_length": 3007.5000610351562,
"epoch": 0.104,
"grad_norm": 0.1628594547510147,
"kl": 0.0011910051107406616,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0,
"reward": 0.4382549002766609,
"reward_std": 0.8723863288760185,
"rewards/cosine_scaled_reward": 0.010794118046760559,
"rewards/format_reward": 0.416666679084301,
"step": 91
},
{
"completion_length": 2512.5833587646484,
"epoch": 0.10514285714285715,
"grad_norm": 0.19360074400901794,
"kl": 0.0010070949792861938,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0,
"reward": 0.32041475689038634,
"reward_std": 0.6855251621454954,
"rewards/cosine_scaled_reward": -0.11062596645206213,
"rewards/format_reward": 0.5416666697710752,
"step": 92
},
{
"completion_length": 3344.4166870117188,
"epoch": 0.10628571428571429,
"grad_norm": 0.1997271329164505,
"kl": 0.0018374919891357422,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0001,
"reward": -0.34033770859241486,
"reward_std": 0.5492488071322441,
"rewards/cosine_scaled_reward": -0.2535021901130676,
"rewards/format_reward": 0.1666666716337204,
"step": 93
},
{
"completion_length": 3007.375015258789,
"epoch": 0.10742857142857143,
"grad_norm": 0.18923112750053406,
"kl": 0.0023823007941246033,
"learning_rate": 9.78935800506826e-07,
"loss": 0.0001,
"reward": 0.13416577805764973,
"reward_std": 0.4046405693516135,
"rewards/cosine_scaled_reward": -0.06833377946168184,
"rewards/format_reward": 0.27083333395421505,
"step": 94
},
{
"completion_length": 3444.0416870117188,
"epoch": 0.10857142857142857,
"grad_norm": 0.14268328249454498,
"kl": 0.0009976625442504883,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0,
"reward": -0.09058984462171793,
"reward_std": 0.7453176453709602,
"rewards/cosine_scaled_reward": -0.15987825905904174,
"rewards/format_reward": 0.2291666716337204,
"step": 95
},
{
"completion_length": 2678.1875228881836,
"epoch": 0.10971428571428571,
"grad_norm": 0.19071771204471588,
"kl": 0.0018395986407995224,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0001,
"reward": 0.348697304725647,
"reward_std": 0.6102097555994987,
"rewards/cosine_scaled_reward": -0.04440134949982166,
"rewards/format_reward": 0.43750000186264515,
"step": 96
},
{
"completion_length": 2990.000030517578,
"epoch": 0.11085714285714286,
"grad_norm": 0.20323379337787628,
"kl": 0.001025363802909851,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0,
"reward": 0.41594838351011276,
"reward_std": 0.7984455898404121,
"rewards/cosine_scaled_reward": 0.02047417126595974,
"rewards/format_reward": 0.3750000111758709,
"step": 97
},
{
"completion_length": 2679.2083587646484,
"epoch": 0.112,
"grad_norm": 0.19446168839931488,
"kl": 0.0004923343658447266,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0,
"reward": 0.38869317155331373,
"reward_std": 0.564236119389534,
"rewards/cosine_scaled_reward": -0.07648676075041294,
"rewards/format_reward": 0.541666679084301,
"step": 98
},
{
"completion_length": 2801.7708435058594,
"epoch": 0.11314285714285714,
"grad_norm": 0.20274780690670013,
"kl": 0.0009057521820068359,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0,
"reward": 0.04034796729683876,
"reward_std": 0.43001002445816994,
"rewards/cosine_scaled_reward": -0.1152426817570813,
"rewards/format_reward": 0.27083333395421505,
"step": 99
},
{
"completion_length": 2591.125030517578,
"epoch": 0.11428571428571428,
"grad_norm": 0.19084717333316803,
"kl": 0.001154184341430664,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0,
"reward": 0.6981350090354681,
"reward_std": 0.9793934002518654,
"rewards/cosine_scaled_reward": 0.06781747564673424,
"rewards/format_reward": 0.5625000111758709,
"step": 100
},
{
"completion_length": 2723.312530517578,
"epoch": 0.11542857142857142,
"grad_norm": 0.22346965968608856,
"kl": 0.001171112060546875,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0,
"reward": 0.2876786937122233,
"reward_std": 0.4630406089127064,
"rewards/cosine_scaled_reward": -0.05407731421291828,
"rewards/format_reward": 0.39583333395421505,
"step": 101
},
{
"completion_length": 2054.187545776367,
"epoch": 0.11657142857142858,
"grad_norm": 0.23816390335559845,
"kl": 0.002776503562927246,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0001,
"reward": 0.6102155670523643,
"reward_std": 0.7314882390201092,
"rewards/cosine_scaled_reward": -0.059475560672581196,
"rewards/format_reward": 0.7291666865348816,
"step": 102
},
{
"completion_length": 2666.833396911621,
"epoch": 0.11771428571428572,
"grad_norm": 0.2530282139778137,
"kl": 0.0017848014831542969,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0001,
"reward": 0.2429911457002163,
"reward_std": 0.6969506703317165,
"rewards/cosine_scaled_reward": -0.11808777041733265,
"rewards/format_reward": 0.479166679084301,
"step": 103
},
{
"completion_length": 2740.9791870117188,
"epoch": 0.11885714285714286,
"grad_norm": 0.20160914957523346,
"kl": 0.0023827552795410156,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0001,
"reward": 0.20196924358606339,
"reward_std": 0.5999510791152716,
"rewards/cosine_scaled_reward": -0.0865153931081295,
"rewards/format_reward": 0.3750000037252903,
"step": 104
},
{
"completion_length": 2462.1041870117188,
"epoch": 0.12,
"grad_norm": 0.24578723311424255,
"kl": 0.0011974573135375977,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0,
"reward": 0.29713789327070117,
"reward_std": 0.8025108277797699,
"rewards/cosine_scaled_reward": -0.080597716383636,
"rewards/format_reward": 0.45833334140479565,
"step": 105
},
{
"completion_length": 2199.854179382324,
"epoch": 0.12114285714285715,
"grad_norm": 0.19964368641376495,
"kl": 0.002254962921142578,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0001,
"reward": 0.8605472417548299,
"reward_std": 0.7241987711749971,
"rewards/cosine_scaled_reward": 0.11777362413704395,
"rewards/format_reward": 0.6250000037252903,
"step": 106
},
{
"completion_length": 2900.125030517578,
"epoch": 0.12228571428571429,
"grad_norm": 0.1984090954065323,
"kl": 0.0013356208801269531,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0001,
"reward": 0.3114801459014416,
"reward_std": 0.5947921872138977,
"rewards/cosine_scaled_reward": -0.08384328025022114,
"rewards/format_reward": 0.47916666977107525,
"step": 107
},
{
"completion_length": 2570.6041870117188,
"epoch": 0.12342857142857143,
"grad_norm": 0.2010282427072525,
"kl": 0.0011126399040222168,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0,
"reward": 0.6098845191299915,
"reward_std": 0.8683119043707848,
"rewards/cosine_scaled_reward": 0.02369226049631834,
"rewards/format_reward": 0.5625000074505806,
"step": 108
},
{
"completion_length": 2960.645896911621,
"epoch": 0.12457142857142857,
"grad_norm": 0.19048207998275757,
"kl": 0.0006784200668334961,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0,
"reward": 0.38045269437134266,
"reward_std": 0.6336373277008533,
"rewards/cosine_scaled_reward": 0.002726326696574688,
"rewards/format_reward": 0.3750000037252903,
"step": 109
},
{
"completion_length": 2726.1042098999023,
"epoch": 0.12571428571428572,
"grad_norm": 0.20842185616493225,
"kl": 0.0014406442642211914,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0001,
"reward": 0.2915392220020294,
"reward_std": 0.891454428434372,
"rewards/cosine_scaled_reward": -0.10423038713634014,
"rewards/format_reward": 0.5000000093132257,
"step": 110
},
{
"completion_length": 2772.9375534057617,
"epoch": 0.12685714285714286,
"grad_norm": 0.20696447789669037,
"kl": 0.0015645027160644531,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0001,
"reward": 0.20833251508884132,
"reward_std": 0.7267865724861622,
"rewards/cosine_scaled_reward": -0.12500040791928768,
"rewards/format_reward": 0.4583333395421505,
"step": 111
},
{
"completion_length": 2978.0208892822266,
"epoch": 0.128,
"grad_norm": 0.16811113059520721,
"kl": 0.000897526741027832,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0,
"reward": 0.3418322764337063,
"reward_std": 0.65217125415802,
"rewards/cosine_scaled_reward": -0.05825053807348013,
"rewards/format_reward": 0.4583333358168602,
"step": 112
},
{
"completion_length": 2559.7083740234375,
"epoch": 0.12914285714285714,
"grad_norm": 0.20631085336208344,
"kl": 0.0014824867248535156,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0001,
"reward": 0.43655770644545555,
"reward_std": 0.7786016501486301,
"rewards/cosine_scaled_reward": -0.03172116680070758,
"rewards/format_reward": 0.5000000093132257,
"step": 113
},
{
"completion_length": 2495.7917137145996,
"epoch": 0.13028571428571428,
"grad_norm": 0.22188131511211395,
"kl": 0.0031752586364746094,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0001,
"reward": 0.20582257118076086,
"reward_std": 0.7198268510401249,
"rewards/cosine_scaled_reward": -0.18875539442524314,
"rewards/format_reward": 0.5833333469927311,
"step": 114
},
{
"completion_length": 2754.6667098999023,
"epoch": 0.13142857142857142,
"grad_norm": 0.19496048986911774,
"kl": 0.0033437013626098633,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0001,
"reward": 0.02730092965066433,
"reward_std": 0.5944252610206604,
"rewards/cosine_scaled_reward": -0.2050995440222323,
"rewards/format_reward": 0.4375000074505806,
"step": 115
},
{
"completion_length": 3369.375030517578,
"epoch": 0.13257142857142856,
"grad_norm": 0.1768704503774643,
"kl": 0.001748800277709961,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0001,
"reward": -0.17860433738678694,
"reward_std": 0.7088249325752258,
"rewards/cosine_scaled_reward": -0.15180217241868377,
"rewards/format_reward": 0.1250000037252903,
"step": 116
},
{
"completion_length": 3029.687530517578,
"epoch": 0.1337142857142857,
"grad_norm": 0.22546739876270294,
"kl": 0.0027513504028320312,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0001,
"reward": -0.09248590935021639,
"reward_std": 0.6045599468052387,
"rewards/cosine_scaled_reward": -0.19207628909498453,
"rewards/format_reward": 0.2916666753590107,
"step": 117
},
{
"completion_length": 2870.8958740234375,
"epoch": 0.13485714285714287,
"grad_norm": 0.16772252321243286,
"kl": 0.0015900135040283203,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0001,
"reward": 0.46789007633924484,
"reward_std": 0.8386504454538226,
"rewards/cosine_scaled_reward": 0.015195020474493504,
"rewards/format_reward": 0.4375000074505806,
"step": 118
},
{
"completion_length": 2562.3125610351562,
"epoch": 0.136,
"grad_norm": 0.21363668143749237,
"kl": 0.004290223121643066,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0002,
"reward": 0.7369925267994404,
"reward_std": 0.7727241404354572,
"rewards/cosine_scaled_reward": 0.0768295917659998,
"rewards/format_reward": 0.5833333395421505,
"step": 119
},
{
"completion_length": 2289.1667289733887,
"epoch": 0.13714285714285715,
"grad_norm": 0.2960655391216278,
"kl": 0.0022792816162109375,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0001,
"reward": 0.513806963339448,
"reward_std": 0.7356006652116776,
"rewards/cosine_scaled_reward": -0.045179841690696776,
"rewards/format_reward": 0.6041666772216558,
"step": 120
},
{
"completion_length": 1657.1041870117188,
"epoch": 0.1382857142857143,
"grad_norm": 0.27528315782546997,
"kl": 0.00405430793762207,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0002,
"reward": 0.7713621072471142,
"reward_std": 0.6179003790020943,
"rewards/cosine_scaled_reward": -0.010152293369174004,
"rewards/format_reward": 0.7916666716337204,
"step": 121
},
{
"completion_length": 2898.375030517578,
"epoch": 0.13942857142857143,
"grad_norm": 0.20366472005844116,
"kl": 0.0014312267303466797,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0001,
"reward": 0.47324386797845364,
"reward_std": 0.8266710359603167,
"rewards/cosine_scaled_reward": 0.02828860469162464,
"rewards/format_reward": 0.4166666753590107,
"step": 122
},
{
"completion_length": 2754.791717529297,
"epoch": 0.14057142857142857,
"grad_norm": 0.17461104691028595,
"kl": 0.0015136003494262695,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0001,
"reward": 0.28382661822251976,
"reward_std": 0.5956688185688108,
"rewards/cosine_scaled_reward": -0.12892001681029797,
"rewards/format_reward": 0.541666679084301,
"step": 123
},
{
"completion_length": 2220.0833587646484,
"epoch": 0.1417142857142857,
"grad_norm": 0.2452717274427414,
"kl": 0.0047130584716796875,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0002,
"reward": 0.5766889279475436,
"reward_std": 0.7685924731194973,
"rewards/cosine_scaled_reward": -0.03457220923155546,
"rewards/format_reward": 0.6458333414047956,
"step": 124
},
{
"completion_length": 2875.895866394043,
"epoch": 0.14285714285714285,
"grad_norm": 0.17229042947292328,
"kl": 0.0020313262939453125,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0001,
"reward": 0.23444998264312744,
"reward_std": 0.5397426411509514,
"rewards/cosine_scaled_reward": -0.028608346357941628,
"rewards/format_reward": 0.2916666679084301,
"step": 125
},
{
"completion_length": 2846.5833587646484,
"epoch": 0.144,
"grad_norm": 0.1771528422832489,
"kl": 0.0011348724365234375,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0,
"reward": 0.2827608957886696,
"reward_std": 0.6769377700984478,
"rewards/cosine_scaled_reward": -0.11903620883822441,
"rewards/format_reward": 0.5208333414047956,
"step": 126
},
{
"completion_length": 3026.437545776367,
"epoch": 0.14514285714285713,
"grad_norm": 0.23673483729362488,
"kl": 0.0022172927856445312,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0001,
"reward": -0.08590636402368546,
"reward_std": 0.6378746330738068,
"rewards/cosine_scaled_reward": -0.18878651410341263,
"rewards/format_reward": 0.2916666716337204,
"step": 127
},
{
"completion_length": 2956.1458892822266,
"epoch": 0.1462857142857143,
"grad_norm": 0.1844361275434494,
"kl": 0.002949953079223633,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0001,
"reward": 0.39643347449600697,
"reward_std": 0.7595263011753559,
"rewards/cosine_scaled_reward": 0.010716758668422699,
"rewards/format_reward": 0.37500000186264515,
"step": 128
},
{
"completion_length": 3379.1666870117188,
"epoch": 0.14742857142857144,
"grad_norm": 0.40960681438446045,
"kl": 0.02291560173034668,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0009,
"reward": -0.011747716460376978,
"reward_std": 0.9172870293259621,
"rewards/cosine_scaled_reward": -0.09962387196719646,
"rewards/format_reward": 0.1875000037252903,
"step": 129
},
{
"completion_length": 2683.5416870117188,
"epoch": 0.14857142857142858,
"grad_norm": 0.18997545540332794,
"kl": 0.0017414093017578125,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0001,
"reward": 0.2956298217177391,
"reward_std": 0.7938947193324566,
"rewards/cosine_scaled_reward": -0.05010176869109273,
"rewards/format_reward": 0.39583334140479565,
"step": 130
},
{
"completion_length": 2938.958396911621,
"epoch": 0.14971428571428572,
"grad_norm": 0.1829785704612732,
"kl": 0.004005908966064453,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0002,
"reward": 0.6047782748937607,
"reward_std": 0.8474699303042144,
"rewards/cosine_scaled_reward": 0.10447247140109539,
"rewards/format_reward": 0.39583334140479565,
"step": 131
},
{
"completion_length": 2620.854179382324,
"epoch": 0.15085714285714286,
"grad_norm": 0.17881910502910614,
"kl": 0.0018241405487060547,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0001,
"reward": 0.3893936946988106,
"reward_std": 0.687925798818469,
"rewards/cosine_scaled_reward": -0.013636493356898427,
"rewards/format_reward": 0.41666667349636555,
"step": 132
},
{
"completion_length": 3095.3958740234375,
"epoch": 0.152,
"grad_norm": 0.20941434800624847,
"kl": 0.0031735897064208984,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0001,
"reward": 0.05856896564364433,
"reward_std": 0.7605378944426775,
"rewards/cosine_scaled_reward": -0.12696552043780684,
"rewards/format_reward": 0.31250000558793545,
"step": 133
},
{
"completion_length": 2441.6458740234375,
"epoch": 0.15314285714285714,
"grad_norm": 0.18976929783821106,
"kl": 0.0040435791015625,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0002,
"reward": 0.5501680783927441,
"reward_std": 0.618685107678175,
"rewards/cosine_scaled_reward": -0.026999298483133316,
"rewards/format_reward": 0.6041666697710752,
"step": 134
},
{
"completion_length": 2050.6041870117188,
"epoch": 0.15428571428571428,
"grad_norm": 0.2555489242076874,
"kl": 0.004626750946044922,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0002,
"reward": 1.1215206906199455,
"reward_std": 0.8544092262163758,
"rewards/cosine_scaled_reward": 0.22742698714137077,
"rewards/format_reward": 0.6666666772216558,
"step": 135
},
{
"completion_length": 2620.791717529297,
"epoch": 0.15542857142857142,
"grad_norm": 0.18313182890415192,
"kl": 0.003920078277587891,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0002,
"reward": 0.5208159852772951,
"reward_std": 0.9554536268115044,
"rewards/cosine_scaled_reward": 0.010407987982034683,
"rewards/format_reward": 0.5000000111758709,
"step": 136
},
{
"completion_length": 2811.666702270508,
"epoch": 0.15657142857142858,
"grad_norm": 0.17304006218910217,
"kl": 0.0026001930236816406,
"learning_rate": 9.195171441101668e-07,
"loss": 0.0001,
"reward": 0.03694445453584194,
"reward_std": 0.6093163676559925,
"rewards/cosine_scaled_reward": -0.15861110761761665,
"rewards/format_reward": 0.35416666977107525,
"step": 137
},
{
"completion_length": 2228.2500762939453,
"epoch": 0.15771428571428572,
"grad_norm": 0.19729673862457275,
"kl": 0.0018553733825683594,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0001,
"reward": 0.4917922643944621,
"reward_std": 0.5503780655562878,
"rewards/cosine_scaled_reward": -0.08743719174526632,
"rewards/format_reward": 0.6666666734963655,
"step": 138
},
{
"completion_length": 3121.8750915527344,
"epoch": 0.15885714285714286,
"grad_norm": 0.17099516093730927,
"kl": 0.0035991668701171875,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0001,
"reward": 0.20771604776382446,
"reward_std": 0.8068908657878637,
"rewards/cosine_scaled_reward": -0.08364198234630749,
"rewards/format_reward": 0.37500000931322575,
"step": 139
},
{
"completion_length": 2868.979217529297,
"epoch": 0.16,
"grad_norm": 0.22809311747550964,
"kl": 0.0057888031005859375,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0002,
"reward": 0.10171525273472071,
"reward_std": 0.42831680551171303,
"rewards/cosine_scaled_reward": -0.11580904200673103,
"rewards/format_reward": 0.33333333767950535,
"step": 140
},
{
"completion_length": 2534.354263305664,
"epoch": 0.16114285714285714,
"grad_norm": 0.1993127316236496,
"kl": 0.0041599273681640625,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0002,
"reward": 0.5355201922357082,
"reward_std": 0.7652903571724892,
"rewards/cosine_scaled_reward": -0.04473992623388767,
"rewards/format_reward": 0.6250000149011612,
"step": 141
},
{
"completion_length": 2745.250015258789,
"epoch": 0.16228571428571428,
"grad_norm": 0.20162560045719147,
"kl": 0.0030961036682128906,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0001,
"reward": 0.3946248684078455,
"reward_std": 0.7829636707901955,
"rewards/cosine_scaled_reward": -0.09435425186529756,
"rewards/format_reward": 0.583333345130086,
"step": 142
},
{
"completion_length": 2417.6250610351562,
"epoch": 0.16342857142857142,
"grad_norm": 0.4018979072570801,
"kl": 0.004683494567871094,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0002,
"reward": 0.19592272117733955,
"reward_std": 0.5982137080281973,
"rewards/cosine_scaled_reward": -0.1832886370830238,
"rewards/format_reward": 0.5625000111758709,
"step": 143
},
{
"completion_length": 2747.145881652832,
"epoch": 0.16457142857142856,
"grad_norm": 0.2238093614578247,
"kl": 0.0034952163696289062,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0001,
"reward": 0.3479926884174347,
"reward_std": 0.8791730478405952,
"rewards/cosine_scaled_reward": -0.04475365893449634,
"rewards/format_reward": 0.43750001303851604,
"step": 144
},
{
"completion_length": 2055.895866394043,
"epoch": 0.1657142857142857,
"grad_norm": 0.2830770015716553,
"kl": 0.0055332183837890625,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0002,
"reward": 0.5167977176606655,
"reward_std": 0.5320884976536036,
"rewards/cosine_scaled_reward": -0.043684473261237144,
"rewards/format_reward": 0.6041666716337204,
"step": 145
},
{
"completion_length": 1917.2917251586914,
"epoch": 0.16685714285714287,
"grad_norm": 0.18457695841789246,
"kl": 0.0024001598358154297,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0001,
"reward": 0.5474055055528879,
"reward_std": 0.7067083790898323,
"rewards/cosine_scaled_reward": -0.11171392910182476,
"rewards/format_reward": 0.7708333395421505,
"step": 146
},
{
"completion_length": 2394.2500534057617,
"epoch": 0.168,
"grad_norm": 0.26179954409599304,
"kl": 0.004405975341796875,
"learning_rate": 9.007020842191634e-07,
"loss": 0.0002,
"reward": 0.39791956916451454,
"reward_std": 1.0572543032467365,
"rewards/cosine_scaled_reward": -0.07187354937195778,
"rewards/format_reward": 0.5416666697710752,
"step": 147
},
{
"completion_length": 1997.3125534057617,
"epoch": 0.16914285714285715,
"grad_norm": 0.22162283957004547,
"kl": 0.0032930374145507812,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0001,
"reward": 0.6760393833974376,
"reward_std": 0.7666322588920593,
"rewards/cosine_scaled_reward": -0.01614698488265276,
"rewards/format_reward": 0.7083333358168602,
"step": 148
},
{
"completion_length": 2594.416717529297,
"epoch": 0.1702857142857143,
"grad_norm": 0.17498500645160675,
"kl": 0.0034232139587402344,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0001,
"reward": 0.5365147553384304,
"reward_std": 0.6859856657683849,
"rewards/cosine_scaled_reward": -0.012992626056075096,
"rewards/format_reward": 0.5625000186264515,
"step": 149
},
{
"completion_length": 2656.104202270508,
"epoch": 0.17142857142857143,
"grad_norm": 0.22585879266262054,
"kl": 0.0047016143798828125,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0002,
"reward": 0.2979818880558014,
"reward_std": 0.7468250878155231,
"rewards/cosine_scaled_reward": -0.08017573575489223,
"rewards/format_reward": 0.45833333767950535,
"step": 150
},
{
"completion_length": 2433.208396911621,
"epoch": 0.17257142857142857,
"grad_norm": 0.2490771859884262,
"kl": 0.0044155120849609375,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0002,
"reward": 0.5584549466148019,
"reward_std": 0.8048359379172325,
"rewards/cosine_scaled_reward": -0.022855868563055992,
"rewards/format_reward": 0.6041666809469461,
"step": 151
},
{
"completion_length": 2724.979217529297,
"epoch": 0.1737142857142857,
"grad_norm": 0.2223489135503769,
"kl": 0.003587007522583008,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0001,
"reward": 0.020049065351486206,
"reward_std": 0.6065222397446632,
"rewards/cosine_scaled_reward": -0.1774754635989666,
"rewards/format_reward": 0.37500000558793545,
"step": 152
},
{
"completion_length": 2770.2708740234375,
"epoch": 0.17485714285714285,
"grad_norm": 0.19104580581188202,
"kl": 0.006191253662109375,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0002,
"reward": 0.004709841683506966,
"reward_std": 0.47944022715091705,
"rewards/cosine_scaled_reward": -0.19556174334138632,
"rewards/format_reward": 0.3958333358168602,
"step": 153
},
{
"completion_length": 3214.062530517578,
"epoch": 0.176,
"grad_norm": 0.186995267868042,
"kl": 0.0031766891479492188,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0001,
"reward": 0.3578900098800659,
"reward_std": 1.006507821381092,
"rewards/cosine_scaled_reward": 0.03311167098581791,
"rewards/format_reward": 0.29166666977107525,
"step": 154
},
{
"completion_length": 2554.666732788086,
"epoch": 0.17714285714285713,
"grad_norm": 0.20163290202617645,
"kl": 0.004076957702636719,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0002,
"reward": 0.5557569041848183,
"reward_std": 0.6795310117304325,
"rewards/cosine_scaled_reward": 0.02787843905389309,
"rewards/format_reward": 0.5000000037252903,
"step": 155
},
{
"completion_length": 2689.2292098999023,
"epoch": 0.1782857142857143,
"grad_norm": 0.18030290305614471,
"kl": 0.003159046173095703,
"learning_rate": 8.823049032816478e-07,
"loss": 0.0001,
"reward": 0.4783413279801607,
"reward_std": 0.8905918002128601,
"rewards/cosine_scaled_reward": 0.02042064256966114,
"rewards/format_reward": 0.4375,
"step": 156
},
{
"completion_length": 2461.875015258789,
"epoch": 0.17942857142857144,
"grad_norm": 0.22876004874706268,
"kl": 0.003749370574951172,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0001,
"reward": 0.15547512285411358,
"reward_std": 0.5757812671363354,
"rewards/cosine_scaled_reward": -0.19309578835964203,
"rewards/format_reward": 0.541666679084301,
"step": 157
},
{
"completion_length": 2797.8333740234375,
"epoch": 0.18057142857142858,
"grad_norm": 0.2126486450433731,
"kl": 0.0034036636352539062,
"learning_rate": 8.780358823396352e-07,
"loss": 0.0001,
"reward": 0.5735968193039298,
"reward_std": 0.8958301469683647,
"rewards/cosine_scaled_reward": 0.02638173568993807,
"rewards/format_reward": 0.5208333469927311,
"step": 158
},
{
"completion_length": 2499.937515258789,
"epoch": 0.18171428571428572,
"grad_norm": 0.17875580489635468,
"kl": 0.003962516784667969,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0002,
"reward": 0.11497697234153748,
"reward_std": 0.49865414947271347,
"rewards/cosine_scaled_reward": -0.1820948626846075,
"rewards/format_reward": 0.4791666753590107,
"step": 159
},
{
"completion_length": 2463.708381652832,
"epoch": 0.18285714285714286,
"grad_norm": 0.23183724284172058,
"kl": 0.006978034973144531,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0003,
"reward": 0.7383107729256153,
"reward_std": 0.9244415387511253,
"rewards/cosine_scaled_reward": 0.05665536457672715,
"rewards/format_reward": 0.6250000055879354,
"step": 160
},
{
"completion_length": 2109.3333740234375,
"epoch": 0.184,
"grad_norm": 0.2291431725025177,
"kl": 0.0051670074462890625,
"learning_rate": 8.715127058347614e-07,
"loss": 0.0002,
"reward": 0.43655015155673027,
"reward_std": 0.5879729185253382,
"rewards/cosine_scaled_reward": -0.11505825724452734,
"rewards/format_reward": 0.6666666772216558,
"step": 161
},
{
"completion_length": 2962.1458740234375,
"epoch": 0.18514285714285714,
"grad_norm": 0.18190893530845642,
"kl": 0.008568763732910156,
"learning_rate": 8.693068314414344e-07,
"loss": 0.0003,
"reward": -0.018576149828732014,
"reward_std": 0.6142776571214199,
"rewards/cosine_scaled_reward": -0.1759547544643283,
"rewards/format_reward": 0.3333333432674408,
"step": 162
},
{
"completion_length": 2259.770851135254,
"epoch": 0.18628571428571428,
"grad_norm": 0.2146298736333847,
"kl": 0.004612922668457031,
"learning_rate": 8.670853944836176e-07,
"loss": 0.0002,
"reward": 0.9779267348349094,
"reward_std": 0.5618617758154869,
"rewards/cosine_scaled_reward": 0.18687998969107866,
"rewards/format_reward": 0.6041666716337204,
"step": 163
},
{
"completion_length": 2226.6042098999023,
"epoch": 0.18742857142857142,
"grad_norm": 0.2856714725494385,
"kl": 0.0065479278564453125,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0003,
"reward": 0.5088155679404736,
"reward_std": 0.6196571066975594,
"rewards/cosine_scaled_reward": -0.05809224210679531,
"rewards/format_reward": 0.6250000167638063,
"step": 164
},
{
"completion_length": 2562.2083892822266,
"epoch": 0.18857142857142858,
"grad_norm": 0.2433558702468872,
"kl": 0.0056896209716796875,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0002,
"reward": 0.17038288246840239,
"reward_std": 0.8414329029619694,
"rewards/cosine_scaled_reward": -0.14397523301886395,
"rewards/format_reward": 0.4583333432674408,
"step": 165
},
{
"completion_length": 2418.8958892822266,
"epoch": 0.18971428571428572,
"grad_norm": 0.1904870867729187,
"kl": 0.0036516189575195312,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0001,
"reward": 0.37238264083862305,
"reward_std": 0.7455781847238541,
"rewards/cosine_scaled_reward": -0.08464201167225838,
"rewards/format_reward": 0.5416666772216558,
"step": 166
},
{
"completion_length": 1871.6042175292969,
"epoch": 0.19085714285714286,
"grad_norm": 0.21718546748161316,
"kl": 0.003131866455078125,
"learning_rate": 8.580461976679099e-07,
"loss": 0.0001,
"reward": 0.5296564288437366,
"reward_std": 0.6462862379848957,
"rewards/cosine_scaled_reward": -0.1414217846468091,
"rewards/format_reward": 0.8125000204890966,
"step": 167
},
{
"completion_length": 2663.8750534057617,
"epoch": 0.192,
"grad_norm": 0.22513794898986816,
"kl": 0.0043582916259765625,
"learning_rate": 8.557485869176825e-07,
"loss": 0.0002,
"reward": 0.5514265485107899,
"reward_std": 0.9005202539265156,
"rewards/cosine_scaled_reward": -0.01595340482890606,
"rewards/format_reward": 0.5833333432674408,
"step": 168
},
{
"completion_length": 1646.375015258789,
"epoch": 0.19314285714285714,
"grad_norm": 0.26874470710754395,
"kl": 0.0048503875732421875,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0002,
"reward": 1.3640152756124735,
"reward_std": 0.753107562661171,
"rewards/cosine_scaled_reward": 0.2549243066459894,
"rewards/format_reward": 0.8541666753590107,
"step": 169
},
{
"completion_length": 2181.6667289733887,
"epoch": 0.19428571428571428,
"grad_norm": 0.2248026579618454,
"kl": 0.0046710968017578125,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0002,
"reward": 0.5494927279651165,
"reward_std": 0.5512229539453983,
"rewards/cosine_scaled_reward": -0.006503628566861153,
"rewards/format_reward": 0.5625000111758709,
"step": 170
},
{
"completion_length": 2032.6458587646484,
"epoch": 0.19542857142857142,
"grad_norm": 0.2114606499671936,
"kl": 0.0031061172485351562,
"learning_rate": 8.487667956935087e-07,
"loss": 0.0001,
"reward": 0.5898782406002283,
"reward_std": 0.7169837113469839,
"rewards/cosine_scaled_reward": -0.0175608959980309,
"rewards/format_reward": 0.6250000037252903,
"step": 171
},
{
"completion_length": 2407.6875610351562,
"epoch": 0.19657142857142856,
"grad_norm": 0.295926034450531,
"kl": 0.0061702728271484375,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0002,
"reward": 0.8869556821882725,
"reward_std": 0.7517024762928486,
"rewards/cosine_scaled_reward": 0.1518111675977707,
"rewards/format_reward": 0.583333345130086,
"step": 172
},
{
"completion_length": 1608.0833702087402,
"epoch": 0.1977142857142857,
"grad_norm": 0.28166893124580383,
"kl": 0.00473785400390625,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0002,
"reward": 0.49864733405411243,
"reward_std": 0.6368578001856804,
"rewards/cosine_scaled_reward": -0.12567635625600815,
"rewards/format_reward": 0.7500000037252903,
"step": 173
},
{
"completion_length": 1789.645866394043,
"epoch": 0.19885714285714284,
"grad_norm": 0.2832661271095276,
"kl": 0.006890296936035156,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0003,
"reward": 0.717628687620163,
"reward_std": 0.6769092865288258,
"rewards/cosine_scaled_reward": -0.026602333411574364,
"rewards/format_reward": 0.7708333414047956,
"step": 174
},
{
"completion_length": 2314.1042098999023,
"epoch": 0.2,
"grad_norm": 0.2392614483833313,
"kl": 0.004840850830078125,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0002,
"reward": 0.7497927155345678,
"reward_std": 0.7398151978850365,
"rewards/cosine_scaled_reward": 0.0728130005300045,
"rewards/format_reward": 0.6041666753590107,
"step": 175
},
{
"completion_length": 1971.0417213439941,
"epoch": 0.20114285714285715,
"grad_norm": 0.23342494666576385,
"kl": 0.006039619445800781,
"learning_rate": 8.368407953869103e-07,
"loss": 0.0002,
"reward": 0.5884247794747353,
"reward_std": 0.859587823972106,
"rewards/cosine_scaled_reward": -0.10162095539271832,
"rewards/format_reward": 0.7916666716337204,
"step": 176
},
{
"completion_length": 2159.5417098999023,
"epoch": 0.2022857142857143,
"grad_norm": 0.24055302143096924,
"kl": 0.004790306091308594,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0002,
"reward": 0.6115904222242534,
"reward_std": 0.7329309619963169,
"rewards/cosine_scaled_reward": -0.04837145563215017,
"rewards/format_reward": 0.7083333432674408,
"step": 177
},
{
"completion_length": 1962.1875457763672,
"epoch": 0.20342857142857143,
"grad_norm": 0.24433988332748413,
"kl": 0.007831573486328125,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0003,
"reward": 0.5400892496109009,
"reward_std": 0.5809228383004665,
"rewards/cosine_scaled_reward": -0.07370538869872689,
"rewards/format_reward": 0.6875000111758709,
"step": 178
},
{
"completion_length": 2294.9375228881836,
"epoch": 0.20457142857142857,
"grad_norm": 0.21765747666358948,
"kl": 0.00481414794921875,
"learning_rate": 8.295165011252396e-07,
"loss": 0.0002,
"reward": 0.3706574998795986,
"reward_std": 0.711703471839428,
"rewards/cosine_scaled_reward": -0.08550458867102861,
"rewards/format_reward": 0.5416666679084301,
"step": 179
},
{
"completion_length": 1782.8333587646484,
"epoch": 0.2057142857142857,
"grad_norm": 0.2759692668914795,
"kl": 0.009118080139160156,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0004,
"reward": 0.8141425922513008,
"reward_std": 0.8202670086175203,
"rewards/cosine_scaled_reward": 0.06332127377390862,
"rewards/format_reward": 0.6875000018626451,
"step": 180
},
{
"completion_length": 2508.3750534057617,
"epoch": 0.20685714285714285,
"grad_norm": 0.2136908620595932,
"kl": 0.005702972412109375,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0002,
"reward": 0.53153170022415,
"reward_std": 0.6961524914950132,
"rewards/cosine_scaled_reward": -0.02590082644019276,
"rewards/format_reward": 0.5833333469927311,
"step": 181
},
{
"completion_length": 1820.1875228881836,
"epoch": 0.208,
"grad_norm": 0.1944979727268219,
"kl": 0.0024585723876953125,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0001,
"reward": 0.5819809623062611,
"reward_std": 0.6004299521446228,
"rewards/cosine_scaled_reward": -0.07359286700375378,
"rewards/format_reward": 0.7291666716337204,
"step": 182
},
{
"completion_length": 1418.458351135254,
"epoch": 0.20914285714285713,
"grad_norm": 0.2479531317949295,
"kl": 0.00787353515625,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0003,
"reward": 1.0037191323935986,
"reward_std": 0.7833016626536846,
"rewards/cosine_scaled_reward": 0.04352622898295522,
"rewards/format_reward": 0.916666679084301,
"step": 183
},
{
"completion_length": 2061.06254196167,
"epoch": 0.2102857142857143,
"grad_norm": 0.2752915620803833,
"kl": 0.0069942474365234375,
"learning_rate": 8.170384989716657e-07,
"loss": 0.0003,
"reward": 0.39811624586582184,
"reward_std": 0.612327728420496,
"rewards/cosine_scaled_reward": -0.11344187799841166,
"rewards/format_reward": 0.6250000093132257,
"step": 184
},
{
"completion_length": 1844.6667098999023,
"epoch": 0.21142857142857144,
"grad_norm": 0.2239396721124649,
"kl": 0.0045299530029296875,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0002,
"reward": 0.32189831510186195,
"reward_std": 0.5676768328994513,
"rewards/cosine_scaled_reward": -0.18280084058642387,
"rewards/format_reward": 0.6875000149011612,
"step": 185
},
{
"completion_length": 2218.791702270508,
"epoch": 0.21257142857142858,
"grad_norm": 0.20360872149467468,
"kl": 0.0063877105712890625,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0003,
"reward": 0.48006572760641575,
"reward_std": 0.5926500409841537,
"rewards/cosine_scaled_reward": -0.030800477601587772,
"rewards/format_reward": 0.5416666734963655,
"step": 186
},
{
"completion_length": 1770.9791870117188,
"epoch": 0.21371428571428572,
"grad_norm": 0.23867206275463104,
"kl": 0.0055904388427734375,
"learning_rate": 8.093945422764069e-07,
"loss": 0.0002,
"reward": 0.6042033806443214,
"reward_std": 0.4485644996166229,
"rewards/cosine_scaled_reward": -0.08331498829647899,
"rewards/format_reward": 0.7708333395421505,
"step": 187
},
{
"completion_length": 2462.854202270508,
"epoch": 0.21485714285714286,
"grad_norm": 0.24813704192638397,
"kl": 0.0071868896484375,
"learning_rate": 8.068211054579943e-07,
"loss": 0.0003,
"reward": 0.014021937269717455,
"reward_std": 0.5901096761226654,
"rewards/cosine_scaled_reward": -0.22215570323169231,
"rewards/format_reward": 0.4583333469927311,
"step": 188
},
{
"completion_length": 1708.5000534057617,
"epoch": 0.216,
"grad_norm": 0.2578124403953552,
"kl": 0.006272315979003906,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0003,
"reward": 0.6445069871842861,
"reward_std": 0.7020072378218174,
"rewards/cosine_scaled_reward": -0.052746512461453676,
"rewards/format_reward": 0.7500000111758709,
"step": 189
},
{
"completion_length": 1291.5833892822266,
"epoch": 0.21714285714285714,
"grad_norm": 0.2704342007637024,
"kl": 0.0067882537841796875,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0003,
"reward": 0.9361954592168331,
"reward_std": 0.6467648409307003,
"rewards/cosine_scaled_reward": -0.0006522866897284985,
"rewards/format_reward": 0.9375000149011612,
"step": 190
},
{
"completion_length": 1299.5833892822266,
"epoch": 0.21828571428571428,
"grad_norm": 0.24868877232074738,
"kl": 0.0057735443115234375,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0002,
"reward": 1.0458338633179665,
"reward_std": 0.7703647427260876,
"rewards/cosine_scaled_reward": 0.08541689871344715,
"rewards/format_reward": 0.8750000149011612,
"step": 191
},
{
"completion_length": 1983.1042404174805,
"epoch": 0.21942857142857142,
"grad_norm": 0.24434901773929596,
"kl": 0.006023406982421875,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0002,
"reward": 0.5690425429493189,
"reward_std": 0.7882693633437157,
"rewards/cosine_scaled_reward": -0.09047873830422759,
"rewards/format_reward": 0.750000013038516,
"step": 192
},
{
"completion_length": 2579.333366394043,
"epoch": 0.22057142857142858,
"grad_norm": 0.26767823100090027,
"kl": 0.008405685424804688,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0003,
"reward": 0.08483336865901947,
"reward_std": 0.5648258291184902,
"rewards/cosine_scaled_reward": -0.19716665521264076,
"rewards/format_reward": 0.4791666753590107,
"step": 193
},
{
"completion_length": 2240.562568664551,
"epoch": 0.22171428571428572,
"grad_norm": 0.19335110485553741,
"kl": 0.00666046142578125,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0003,
"reward": 1.0503641180694103,
"reward_std": 0.5451640971004963,
"rewards/cosine_scaled_reward": 0.16059872414916754,
"rewards/format_reward": 0.7291666772216558,
"step": 194
},
{
"completion_length": 1826.2708892822266,
"epoch": 0.22285714285714286,
"grad_norm": 0.24450555443763733,
"kl": 0.006824493408203125,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0003,
"reward": 0.7163921128958464,
"reward_std": 1.0118936747312546,
"rewards/cosine_scaled_reward": -0.03763728140620515,
"rewards/format_reward": 0.7916666828095913,
"step": 195
},
{
"completion_length": 2620.041778564453,
"epoch": 0.224,
"grad_norm": 0.2696687877178192,
"kl": 0.00728607177734375,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0003,
"reward": 0.21067072823643684,
"reward_std": 0.859990905970335,
"rewards/cosine_scaled_reward": -0.13424798846244812,
"rewards/format_reward": 0.4791666716337204,
"step": 196
},
{
"completion_length": 1049.0208587646484,
"epoch": 0.22514285714285714,
"grad_norm": 0.27546316385269165,
"kl": 0.00786590576171875,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0003,
"reward": 0.9739863537251949,
"reward_std": 0.8307300806045532,
"rewards/cosine_scaled_reward": 0.01824316382408142,
"rewards/format_reward": 0.9375,
"step": 197
},
{
"completion_length": 1475.4583892822266,
"epoch": 0.22628571428571428,
"grad_norm": 0.2635370194911957,
"kl": 0.008056640625,
"learning_rate": 7.804192891917571e-07,
"loss": 0.0003,
"reward": 0.8781713657081127,
"reward_std": 0.8930405229330063,
"rewards/cosine_scaled_reward": 0.012002333998680115,
"rewards/format_reward": 0.8541666679084301,
"step": 198
},
{
"completion_length": 1526.4583587646484,
"epoch": 0.22742857142857142,
"grad_norm": 0.29261499643325806,
"kl": 0.007879257202148438,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0003,
"reward": 0.42517440766096115,
"reward_std": 0.6142569780349731,
"rewards/cosine_scaled_reward": -0.22491280548274517,
"rewards/format_reward": 0.8750000149011612,
"step": 199
},
{
"completion_length": 1371.458366394043,
"epoch": 0.22857142857142856,
"grad_norm": 0.23450589179992676,
"kl": 0.0063762664794921875,
"learning_rate": 7.75e-07,
"loss": 0.0003,
"reward": 1.134487384930253,
"reward_std": 0.7592474110424519,
"rewards/cosine_scaled_reward": 0.10891036130487919,
"rewards/format_reward": 0.9166666679084301,
"step": 200
},
{
"completion_length": 1873.7500686645508,
"epoch": 0.2297142857142857,
"grad_norm": 0.20057973265647888,
"kl": 0.0047245025634765625,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0002,
"reward": 1.3440011143684387,
"reward_std": 0.6584898792207241,
"rewards/cosine_scaled_reward": 0.2657505361130461,
"rewards/format_reward": 0.8125000074505806,
"step": 201
},
{
"completion_length": 1296.6041870117188,
"epoch": 0.23085714285714284,
"grad_norm": 0.2126869261264801,
"kl": 0.006069183349609375,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0002,
"reward": 1.2371138632297516,
"reward_std": 0.45892489701509476,
"rewards/cosine_scaled_reward": 0.18105687946081161,
"rewards/format_reward": 0.8750000055879354,
"step": 202
},
{
"completion_length": 1661.5000457763672,
"epoch": 0.232,
"grad_norm": 0.25018230080604553,
"kl": 0.008678436279296875,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0003,
"reward": 0.8724356088787317,
"reward_std": 0.8623361866921186,
"rewards/cosine_scaled_reward": 0.040384437423199415,
"rewards/format_reward": 0.7916666939854622,
"step": 203
},
{
"completion_length": 1537.708381652832,
"epoch": 0.23314285714285715,
"grad_norm": 0.34546995162963867,
"kl": 0.007953643798828125,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0003,
"reward": 0.7893814034759998,
"reward_std": 0.7430329732596874,
"rewards/cosine_scaled_reward": -0.03239264711737633,
"rewards/format_reward": 0.8541666697710752,
"step": 204
},
{
"completion_length": 1645.1250305175781,
"epoch": 0.2342857142857143,
"grad_norm": 0.24868625402450562,
"kl": 0.0064334869384765625,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0003,
"reward": 1.1130757508799434,
"reward_std": 0.9551872611045837,
"rewards/cosine_scaled_reward": 0.15028784982860088,
"rewards/format_reward": 0.8125000055879354,
"step": 205
},
{
"completion_length": 2075.187545776367,
"epoch": 0.23542857142857143,
"grad_norm": 0.25428035855293274,
"kl": 0.0065708160400390625,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0003,
"reward": 0.233387497253716,
"reward_std": 0.5259231552481651,
"rewards/cosine_scaled_reward": -0.22705625742673874,
"rewards/format_reward": 0.6875000260770321,
"step": 206
},
{
"completion_length": 1656.3750686645508,
"epoch": 0.23657142857142857,
"grad_norm": 0.26431310176849365,
"kl": 0.0085601806640625,
"learning_rate": 7.556940671764124e-07,
"loss": 0.0003,
"reward": 0.6532822176814079,
"reward_std": 0.6114402338862419,
"rewards/cosine_scaled_reward": -0.13169224187731743,
"rewards/format_reward": 0.9166666716337204,
"step": 207
},
{
"completion_length": 1265.3542022705078,
"epoch": 0.2377142857142857,
"grad_norm": 0.241045743227005,
"kl": 0.00737762451171875,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0003,
"reward": 0.9077612403780222,
"reward_std": 0.5354608930647373,
"rewards/cosine_scaled_reward": -0.01486940123140812,
"rewards/format_reward": 0.9375000074505806,
"step": 208
},
{
"completion_length": 1344.3750305175781,
"epoch": 0.23885714285714285,
"grad_norm": 0.3023705780506134,
"kl": 0.010385513305664062,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0004,
"reward": 1.0339510310441256,
"reward_std": 0.7570701129734516,
"rewards/cosine_scaled_reward": 0.07947551319375634,
"rewards/format_reward": 0.8750000149011612,
"step": 209
},
{
"completion_length": 1667.3541793823242,
"epoch": 0.24,
"grad_norm": 0.18748997151851654,
"kl": 0.006000518798828125,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0002,
"reward": 0.666348340921104,
"reward_std": 0.5869894400238991,
"rewards/cosine_scaled_reward": -0.09390917886048555,
"rewards/format_reward": 0.8541666716337204,
"step": 210
},
{
"completion_length": 1571.8958740234375,
"epoch": 0.24114285714285713,
"grad_norm": 0.25766894221305847,
"kl": 0.0088043212890625,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0004,
"reward": 0.9365372620522976,
"reward_std": 0.5268308343365788,
"rewards/cosine_scaled_reward": 0.07243526913225651,
"rewards/format_reward": 0.7916666734963655,
"step": 211
},
{
"completion_length": 967.6458549499512,
"epoch": 0.2422857142857143,
"grad_norm": 0.2648831009864807,
"kl": 0.00879669189453125,
"learning_rate": 7.416006812042827e-07,
"loss": 0.0004,
"reward": 1.0848271660506725,
"reward_std": 0.44052685238420963,
"rewards/cosine_scaled_reward": 0.08408024348318577,
"rewards/format_reward": 0.9166666679084301,
"step": 212
},
{
"completion_length": 1431.3959121704102,
"epoch": 0.24342857142857144,
"grad_norm": 0.33580857515335083,
"kl": 0.0103302001953125,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0004,
"reward": 1.239082669839263,
"reward_std": 0.9362837933003902,
"rewards/cosine_scaled_reward": 0.1820413067471236,
"rewards/format_reward": 0.8750000074505806,
"step": 213
},
{
"completion_length": 1762.3125457763672,
"epoch": 0.24457142857142858,
"grad_norm": 0.24269923567771912,
"kl": 0.00745391845703125,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0003,
"reward": 1.0624176412820816,
"reward_std": 0.7310810908675194,
"rewards/cosine_scaled_reward": 0.11454213625984266,
"rewards/format_reward": 0.8333333432674408,
"step": 214
},
{
"completion_length": 1427.3542022705078,
"epoch": 0.24571428571428572,
"grad_norm": 0.25518307089805603,
"kl": 0.0060749053955078125,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0002,
"reward": 0.5486622415482998,
"reward_std": 0.42411663569509983,
"rewards/cosine_scaled_reward": -0.14233557134866714,
"rewards/format_reward": 0.8333333432674408,
"step": 215
},
{
"completion_length": 1129.395851135254,
"epoch": 0.24685714285714286,
"grad_norm": 0.28439247608184814,
"kl": 0.007411956787109375,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0003,
"reward": 1.0450292900204659,
"reward_std": 0.6436006706207991,
"rewards/cosine_scaled_reward": 0.08501462638378143,
"rewards/format_reward": 0.8750000055879354,
"step": 216
},
{
"completion_length": 1375.4791946411133,
"epoch": 0.248,
"grad_norm": 0.22866062819957733,
"kl": 0.0072784423828125,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0003,
"reward": 1.0294950436800718,
"reward_std": 0.7613518834114075,
"rewards/cosine_scaled_reward": 0.08766416925936937,
"rewards/format_reward": 0.8541666679084301,
"step": 217
},
{
"completion_length": 1546.6041946411133,
"epoch": 0.24914285714285714,
"grad_norm": 0.2400980442762375,
"kl": 0.00803375244140625,
"learning_rate": 7.243820139034464e-07,
"loss": 0.0003,
"reward": 0.6402788404375315,
"reward_std": 0.7851009257137775,
"rewards/cosine_scaled_reward": -0.10694391108700074,
"rewards/format_reward": 0.8541666716337204,
"step": 218
},
{
"completion_length": 1293.1042098999023,
"epoch": 0.2502857142857143,
"grad_norm": 0.3005291521549225,
"kl": 0.0071582794189453125,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0003,
"reward": 1.2072506174445152,
"reward_std": 0.8155215214937925,
"rewards/cosine_scaled_reward": 0.1452919525327161,
"rewards/format_reward": 0.9166666679084301,
"step": 219
},
{
"completion_length": 1585.4583740234375,
"epoch": 0.25142857142857145,
"grad_norm": 0.29073673486709595,
"kl": 0.009313583374023438,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0004,
"reward": 0.25497015565633774,
"reward_std": 0.43767623975872993,
"rewards/cosine_scaled_reward": -0.2787649389356375,
"rewards/format_reward": 0.8125000149011612,
"step": 220
},
{
"completion_length": 1091.9583625793457,
"epoch": 0.25257142857142856,
"grad_norm": 0.2504833936691284,
"kl": 0.006580352783203125,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0003,
"reward": 1.1241591908037663,
"reward_std": 0.3971919184550643,
"rewards/cosine_scaled_reward": 0.09332958236336708,
"rewards/format_reward": 0.9375,
"step": 221
},
{
"completion_length": 1431.5000305175781,
"epoch": 0.2537142857142857,
"grad_norm": 0.2770059108734131,
"kl": 0.008672714233398438,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0003,
"reward": 1.0303391478955746,
"reward_std": 0.711064089089632,
"rewards/cosine_scaled_reward": 0.09850288555026054,
"rewards/format_reward": 0.8333333395421505,
"step": 222
},
{
"completion_length": 1270.583351135254,
"epoch": 0.25485714285714284,
"grad_norm": 0.22273501753807068,
"kl": 0.00675201416015625,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0003,
"reward": 0.8901128210127354,
"reward_std": 0.560223001986742,
"rewards/cosine_scaled_reward": 0.0075564137659966946,
"rewards/format_reward": 0.875,
"step": 223
},
{
"completion_length": 1732.1458587646484,
"epoch": 0.256,
"grad_norm": 0.2018723040819168,
"kl": 0.008203506469726562,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0003,
"reward": 0.7687601521611214,
"reward_std": 0.7839281968772411,
"rewards/cosine_scaled_reward": -0.03228660812601447,
"rewards/format_reward": 0.8333333432674408,
"step": 224
},
{
"completion_length": 2117.18758392334,
"epoch": 0.2571428571428571,
"grad_norm": 0.2617863416671753,
"kl": 0.01264190673828125,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0005,
"reward": 0.8388488199561834,
"reward_std": 0.7986405938863754,
"rewards/cosine_scaled_reward": 0.044424411840736866,
"rewards/format_reward": 0.750000013038516,
"step": 225
},
{
"completion_length": 1414.020881652832,
"epoch": 0.2582857142857143,
"grad_norm": 0.19753257930278778,
"kl": 0.0067138671875,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0003,
"reward": 1.1793978083878756,
"reward_std": 0.5940953148528934,
"rewards/cosine_scaled_reward": 0.11053222604095936,
"rewards/format_reward": 0.9583333432674408,
"step": 226
},
{
"completion_length": 1173.1666870117188,
"epoch": 0.25942857142857145,
"grad_norm": 0.3516033887863159,
"kl": 0.01033782958984375,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0004,
"reward": 0.7449045367538929,
"reward_std": 0.5567011050879955,
"rewards/cosine_scaled_reward": -0.10671441350132227,
"rewards/format_reward": 0.9583333432674408,
"step": 227
},
{
"completion_length": 1190.1250305175781,
"epoch": 0.26057142857142856,
"grad_norm": 0.31133556365966797,
"kl": 0.00760650634765625,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0003,
"reward": 1.191257143393159,
"reward_std": 0.6371707431972027,
"rewards/cosine_scaled_reward": 0.1581285521388054,
"rewards/format_reward": 0.875,
"step": 228
},
{
"completion_length": 1375.6250305175781,
"epoch": 0.26171428571428573,
"grad_norm": 0.2500562369823456,
"kl": 0.0087432861328125,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0003,
"reward": 0.7764038797467947,
"reward_std": 0.5216951882466674,
"rewards/cosine_scaled_reward": -0.04929806664586067,
"rewards/format_reward": 0.8750000055879354,
"step": 229
},
{
"completion_length": 1970.1458587646484,
"epoch": 0.26285714285714284,
"grad_norm": 0.23503738641738892,
"kl": 0.009944915771484375,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0004,
"reward": 0.531811726745218,
"reward_std": 0.7926243953406811,
"rewards/cosine_scaled_reward": -0.10909414570778608,
"rewards/format_reward": 0.7500000204890966,
"step": 230
},
{
"completion_length": 1569.3750457763672,
"epoch": 0.264,
"grad_norm": 0.2320818156003952,
"kl": 0.008174896240234375,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0003,
"reward": 1.0276812557131052,
"reward_std": 0.789770107716322,
"rewards/cosine_scaled_reward": 0.05550729110836983,
"rewards/format_reward": 0.9166666716337204,
"step": 231
},
{
"completion_length": 1607.0625305175781,
"epoch": 0.2651428571428571,
"grad_norm": 0.23815037310123444,
"kl": 0.008876800537109375,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0004,
"reward": 0.352162616327405,
"reward_std": 0.5658496394753456,
"rewards/cosine_scaled_reward": -0.2405853734817356,
"rewards/format_reward": 0.8333333414047956,
"step": 232
},
{
"completion_length": 1084.5833740234375,
"epoch": 0.2662857142857143,
"grad_norm": 0.2646292746067047,
"kl": 0.007411956787109375,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0003,
"reward": 0.6558086425065994,
"reward_std": 0.7230100817978382,
"rewards/cosine_scaled_reward": -0.1408456964418292,
"rewards/format_reward": 0.9375000149011612,
"step": 233
},
{
"completion_length": 1680.520881652832,
"epoch": 0.2674285714285714,
"grad_norm": 0.2598845660686493,
"kl": 0.009510040283203125,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0004,
"reward": 0.6849129311740398,
"reward_std": 0.5895892381668091,
"rewards/cosine_scaled_reward": -0.042960209771990776,
"rewards/format_reward": 0.7708333432674408,
"step": 234
},
{
"completion_length": 1212.5208549499512,
"epoch": 0.26857142857142857,
"grad_norm": 0.281758576631546,
"kl": 0.007808685302734375,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0003,
"reward": 1.2133875098079443,
"reward_std": 0.634877560660243,
"rewards/cosine_scaled_reward": 0.14836042001843452,
"rewards/format_reward": 0.9166666716337204,
"step": 235
},
{
"completion_length": 2002.2500534057617,
"epoch": 0.26971428571428574,
"grad_norm": 0.21561594307422638,
"kl": 0.008035659790039062,
"learning_rate": 6.710139192768694e-07,
"loss": 0.0003,
"reward": 0.5754090845584869,
"reward_std": 0.681129951030016,
"rewards/cosine_scaled_reward": -0.07687881146557629,
"rewards/format_reward": 0.7291666716337204,
"step": 236
},
{
"completion_length": 1571.7291946411133,
"epoch": 0.27085714285714285,
"grad_norm": 0.23634596168994904,
"kl": 0.006927490234375,
"learning_rate": 6.679851303883891e-07,
"loss": 0.0003,
"reward": 0.8946835342794657,
"reward_std": 0.5508107021450996,
"rewards/cosine_scaled_reward": 0.03067508526146412,
"rewards/format_reward": 0.8333333432674408,
"step": 237
},
{
"completion_length": 1162.2292098999023,
"epoch": 0.272,
"grad_norm": 0.23457442224025726,
"kl": 0.0085906982421875,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0003,
"reward": 1.158431712538004,
"reward_std": 0.7775964550673962,
"rewards/cosine_scaled_reward": 0.08963251765817404,
"rewards/format_reward": 0.9791666716337204,
"step": 238
},
{
"completion_length": 1398.8750457763672,
"epoch": 0.27314285714285713,
"grad_norm": 0.22179792821407318,
"kl": 0.005764007568359375,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0002,
"reward": 1.289793978095986,
"reward_std": 0.5327765932306647,
"rewards/cosine_scaled_reward": 0.23864697851240635,
"rewards/format_reward": 0.8125,
"step": 239
},
{
"completion_length": 1420.9375381469727,
"epoch": 0.2742857142857143,
"grad_norm": 0.33549872040748596,
"kl": 0.01104736328125,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0004,
"reward": 0.3914220330771059,
"reward_std": 0.6205890811979771,
"rewards/cosine_scaled_reward": -0.2209556633606553,
"rewards/format_reward": 0.8333333395421505,
"step": 240
},
{
"completion_length": 1621.7292098999023,
"epoch": 0.2754285714285714,
"grad_norm": 0.28596657514572144,
"kl": 0.010402679443359375,
"learning_rate": 6.558139508961654e-07,
"loss": 0.0004,
"reward": 0.44276779890060425,
"reward_std": 0.5154628418385983,
"rewards/cosine_scaled_reward": -0.1848661107942462,
"rewards/format_reward": 0.8125000111758709,
"step": 241
},
{
"completion_length": 1138.5000457763672,
"epoch": 0.2765714285714286,
"grad_norm": 0.4309399724006653,
"kl": 0.0121612548828125,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0005,
"reward": 0.7130568971624598,
"reward_std": 0.4758261194219813,
"rewards/cosine_scaled_reward": -0.12263822788372636,
"rewards/format_reward": 0.9583333432674408,
"step": 242
},
{
"completion_length": 1494.687515258789,
"epoch": 0.2777142857142857,
"grad_norm": 0.21594373881816864,
"kl": 0.00745391845703125,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0003,
"reward": 0.7156112641096115,
"reward_std": 0.6859831623733044,
"rewards/cosine_scaled_reward": -0.06927772145718336,
"rewards/format_reward": 0.8541666772216558,
"step": 243
},
{
"completion_length": 1628.1667137145996,
"epoch": 0.27885714285714286,
"grad_norm": 0.2714637219905853,
"kl": 0.007848739624023438,
"learning_rate": 6.466308972251785e-07,
"loss": 0.0003,
"reward": 0.8751203082501888,
"reward_std": 0.9190315119922161,
"rewards/cosine_scaled_reward": 0.010476819472387433,
"rewards/format_reward": 0.8541666753590107,
"step": 244
},
{
"completion_length": 1648.875015258789,
"epoch": 0.28,
"grad_norm": 0.2101169377565384,
"kl": 0.0067119598388671875,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0003,
"reward": 1.1979175508022308,
"reward_std": 0.7641258873045444,
"rewards/cosine_scaled_reward": 0.1718754144385457,
"rewards/format_reward": 0.8541666697710752,
"step": 245
},
{
"completion_length": 1389.895881652832,
"epoch": 0.28114285714285714,
"grad_norm": 0.21795134246349335,
"kl": 0.009159088134765625,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0004,
"reward": 0.7879314236342907,
"reward_std": 0.6879578605294228,
"rewards/cosine_scaled_reward": -0.07478431053459644,
"rewards/format_reward": 0.9375,
"step": 246
},
{
"completion_length": 2069.1458892822266,
"epoch": 0.2822857142857143,
"grad_norm": 0.2720467150211334,
"kl": 0.012157440185546875,
"learning_rate": 6.374054580489873e-07,
"loss": 0.0005,
"reward": 0.3278281930834055,
"reward_std": 0.6504724733531475,
"rewards/cosine_scaled_reward": -0.16941926488652825,
"rewards/format_reward": 0.6666666753590107,
"step": 247
},
{
"completion_length": 1404.020866394043,
"epoch": 0.2834285714285714,
"grad_norm": 0.29227957129478455,
"kl": 0.009120941162109375,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0004,
"reward": 1.3239782322198153,
"reward_std": 0.6399974799714983,
"rewards/cosine_scaled_reward": 0.24532245565205812,
"rewards/format_reward": 0.8333333395421505,
"step": 248
},
{
"completion_length": 1421.7500610351562,
"epoch": 0.2845714285714286,
"grad_norm": 0.25874021649360657,
"kl": 0.010471343994140625,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0004,
"reward": 1.373143790755421,
"reward_std": 0.5737661384046078,
"rewards/cosine_scaled_reward": 0.23865519277751446,
"rewards/format_reward": 0.8958333432674408,
"step": 249
},
{
"completion_length": 1404.6667022705078,
"epoch": 0.2857142857142857,
"grad_norm": 0.6512049436569214,
"kl": 0.011989593505859375,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0005,
"reward": 0.8663649614900351,
"reward_std": 0.6013601124286652,
"rewards/cosine_scaled_reward": -0.035567532293498516,
"rewards/format_reward": 0.9375000074505806,
"step": 250
},
{
"completion_length": 1101.1250305175781,
"epoch": 0.28685714285714287,
"grad_norm": 0.40293389558792114,
"kl": 0.013065338134765625,
"learning_rate": 6.25045936022246e-07,
"loss": 0.0005,
"reward": 0.7222131416201591,
"reward_std": 0.6657759360969067,
"rewards/cosine_scaled_reward": -0.07639344967901707,
"rewards/format_reward": 0.8750000074505806,
"step": 251
},
{
"completion_length": 1508.4583702087402,
"epoch": 0.288,
"grad_norm": 0.2395544797182083,
"kl": 0.010028839111328125,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0004,
"reward": 0.7061197310686111,
"reward_std": 0.5144665259867907,
"rewards/cosine_scaled_reward": -0.07402347587049007,
"rewards/format_reward": 0.8541666697710752,
"step": 252
},
{
"completion_length": 1397.2291984558105,
"epoch": 0.28914285714285715,
"grad_norm": 0.32397717237472534,
"kl": 0.013702392578125,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0005,
"reward": 0.6706481170840561,
"reward_std": 0.8627937883138657,
"rewards/cosine_scaled_reward": -0.09175930079072714,
"rewards/format_reward": 0.8541666716337204,
"step": 253
},
{
"completion_length": 1515.0000839233398,
"epoch": 0.29028571428571426,
"grad_norm": 0.3457793891429901,
"kl": 0.010593414306640625,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0004,
"reward": 0.5583736216649413,
"reward_std": 0.7457092814147472,
"rewards/cosine_scaled_reward": -0.12706319894641638,
"rewards/format_reward": 0.8125000074505806,
"step": 254
},
{
"completion_length": 2169.166702270508,
"epoch": 0.2914285714285714,
"grad_norm": 0.2815548777580261,
"kl": 0.0111541748046875,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0004,
"reward": 0.2357357144355774,
"reward_std": 0.6888550817966461,
"rewards/cosine_scaled_reward": -0.20504882326349616,
"rewards/format_reward": 0.6458333358168602,
"step": 255
},
{
"completion_length": 1219.937515258789,
"epoch": 0.2925714285714286,
"grad_norm": 0.2393861711025238,
"kl": 0.00894927978515625,
"learning_rate": 6.095153756157051e-07,
"loss": 0.0004,
"reward": 1.0311391949653625,
"reward_std": 0.6041121408343315,
"rewards/cosine_scaled_reward": 0.036402929574251175,
"rewards/format_reward": 0.9583333432674408,
"step": 256
},
{
"completion_length": 1972.0000228881836,
"epoch": 0.2937142857142857,
"grad_norm": 0.18130330741405487,
"kl": 0.009616851806640625,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0004,
"reward": 0.9688311172649264,
"reward_std": 0.8995218388736248,
"rewards/cosine_scaled_reward": 0.10941554605960846,
"rewards/format_reward": 0.7500000055879354,
"step": 257
},
{
"completion_length": 1788.7917098999023,
"epoch": 0.2948571428571429,
"grad_norm": 0.24320876598358154,
"kl": 0.00939178466796875,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0004,
"reward": 0.7169995531439781,
"reward_std": 0.7408483605831861,
"rewards/cosine_scaled_reward": -0.05816690996289253,
"rewards/format_reward": 0.8333333432674408,
"step": 258
},
{
"completion_length": 1297.375015258789,
"epoch": 0.296,
"grad_norm": 0.30073729157447815,
"kl": 0.011287689208984375,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0005,
"reward": 0.8080965355038643,
"reward_std": 0.5971200875937939,
"rewards/cosine_scaled_reward": -0.0230350773781538,
"rewards/format_reward": 0.8541666772216558,
"step": 259
},
{
"completion_length": 1015.0000190734863,
"epoch": 0.29714285714285715,
"grad_norm": 0.29709815979003906,
"kl": 0.00806427001953125,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0003,
"reward": 1.2091553770005703,
"reward_std": 0.7044631829485297,
"rewards/cosine_scaled_reward": 0.12541100312955678,
"rewards/format_reward": 0.9583333432674408,
"step": 260
},
{
"completion_length": 2179.1666870117188,
"epoch": 0.29828571428571427,
"grad_norm": 0.20508523285388947,
"kl": 0.01059722900390625,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0004,
"reward": 0.47496978752315044,
"reward_std": 0.5501383896917105,
"rewards/cosine_scaled_reward": -0.07501510810106993,
"rewards/format_reward": 0.625,
"step": 261
},
{
"completion_length": 1553.3542175292969,
"epoch": 0.29942857142857143,
"grad_norm": 0.41438964009284973,
"kl": 0.01168060302734375,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0005,
"reward": 0.3764358746702783,
"reward_std": 0.6005188822746277,
"rewards/cosine_scaled_reward": -0.20761540438979864,
"rewards/format_reward": 0.7916666753590107,
"step": 262
},
{
"completion_length": 1263.2917098999023,
"epoch": 0.30057142857142854,
"grad_norm": 0.26419222354888916,
"kl": 0.007190704345703125,
"learning_rate": 5.87655029499542e-07,
"loss": 0.0003,
"reward": 0.5402636826038361,
"reward_std": 0.5199831649661064,
"rewards/cosine_scaled_reward": -0.1986181689426303,
"rewards/format_reward": 0.9375000074505806,
"step": 263
},
{
"completion_length": 1310.7292404174805,
"epoch": 0.3017142857142857,
"grad_norm": 0.22596046328544617,
"kl": 0.008066177368164062,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0003,
"reward": 0.8366705775260925,
"reward_std": 0.7488792147487402,
"rewards/cosine_scaled_reward": -0.0504147283063503,
"rewards/format_reward": 0.9375000074505806,
"step": 264
},
{
"completion_length": 1436.1458587646484,
"epoch": 0.3028571428571429,
"grad_norm": 0.24842660129070282,
"kl": 0.009662628173828125,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0004,
"reward": 1.0776810441166162,
"reward_std": 0.6058933921158314,
"rewards/cosine_scaled_reward": 0.07009050995111465,
"rewards/format_reward": 0.9375000074505806,
"step": 265
},
{
"completion_length": 1513.9167022705078,
"epoch": 0.304,
"grad_norm": 0.22984397411346436,
"kl": 0.0084075927734375,
"learning_rate": 5.78255733788191e-07,
"loss": 0.0003,
"reward": 0.9025900475680828,
"reward_std": 0.5533245950937271,
"rewards/cosine_scaled_reward": -0.007038334384560585,
"rewards/format_reward": 0.9166666679084301,
"step": 266
},
{
"completion_length": 1806.5000228881836,
"epoch": 0.30514285714285716,
"grad_norm": 0.34077852964401245,
"kl": 0.013782501220703125,
"learning_rate": 5.751196772469237e-07,
"loss": 0.0006,
"reward": 0.22909173252992332,
"reward_std": 0.6235861741006374,
"rewards/cosine_scaled_reward": -0.2292041452601552,
"rewards/format_reward": 0.6875000074505806,
"step": 267
},
{
"completion_length": 1182.5625228881836,
"epoch": 0.3062857142857143,
"grad_norm": 0.29332637786865234,
"kl": 0.01444244384765625,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0006,
"reward": 0.6916986927390099,
"reward_std": 0.5937565844506025,
"rewards/cosine_scaled_reward": -0.11248400900512934,
"rewards/format_reward": 0.9166666716337204,
"step": 268
},
{
"completion_length": 1484.3750228881836,
"epoch": 0.30742857142857144,
"grad_norm": 0.28696972131729126,
"kl": 0.010364532470703125,
"learning_rate": 5.688440441781398e-07,
"loss": 0.0004,
"reward": 0.6794960014522076,
"reward_std": 0.5427012406289577,
"rewards/cosine_scaled_reward": -0.08733535185456276,
"rewards/format_reward": 0.8541666772216558,
"step": 269
},
{
"completion_length": 1590.645851135254,
"epoch": 0.30857142857142855,
"grad_norm": 0.22989951074123383,
"kl": 0.01123046875,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0004,
"reward": 0.8947062492370605,
"reward_std": 1.0069415792822838,
"rewards/cosine_scaled_reward": 0.0202697841450572,
"rewards/format_reward": 0.8541666716337204,
"step": 270
},
{
"completion_length": 1183.0000457763672,
"epoch": 0.3097142857142857,
"grad_norm": 0.29653027653694153,
"kl": 0.0086669921875,
"learning_rate": 5.625647374256061e-07,
"loss": 0.0003,
"reward": 1.1815486252307892,
"reward_std": 0.7092056274414062,
"rewards/cosine_scaled_reward": 0.11160763050429523,
"rewards/format_reward": 0.9583333358168602,
"step": 271
},
{
"completion_length": 1719.6875457763672,
"epoch": 0.31085714285714283,
"grad_norm": 0.2264571636915207,
"kl": 0.010009765625,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0004,
"reward": 0.6460251696407795,
"reward_std": 0.6881718635559082,
"rewards/cosine_scaled_reward": -0.0832374356687069,
"rewards/format_reward": 0.8125000149011612,
"step": 272
},
{
"completion_length": 1260.9166870117188,
"epoch": 0.312,
"grad_norm": 0.2590281367301941,
"kl": 0.01006317138671875,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0004,
"reward": 1.0608440730720758,
"reward_std": 0.49590713158249855,
"rewards/cosine_scaled_reward": 0.06167200347408652,
"rewards/format_reward": 0.9375000074505806,
"step": 273
},
{
"completion_length": 1077.4583587646484,
"epoch": 0.31314285714285717,
"grad_norm": 0.29721972346305847,
"kl": 0.012844085693359375,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0005,
"reward": 1.1142795570194721,
"reward_std": 0.61124661937356,
"rewards/cosine_scaled_reward": 0.057139765471220016,
"rewards/format_reward": 1.0,
"step": 274
},
{
"completion_length": 1554.9167175292969,
"epoch": 0.3142857142857143,
"grad_norm": 0.23796315491199493,
"kl": 0.0102996826171875,
"learning_rate": 5.5e-07,
"loss": 0.0004,
"reward": 0.974131946451962,
"reward_std": 0.6788717601448298,
"rewards/cosine_scaled_reward": 0.08081597136333585,
"rewards/format_reward": 0.8125,
"step": 275
},
{
"completion_length": 1287.2916870117188,
"epoch": 0.31542857142857145,
"grad_norm": 0.35233473777770996,
"kl": 0.012027740478515625,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0005,
"reward": 1.0395971853286028,
"reward_std": 0.770504854619503,
"rewards/cosine_scaled_reward": 0.07188189588487148,
"rewards/format_reward": 0.8958333432674408,
"step": 276
},
{
"completion_length": 1217.3958549499512,
"epoch": 0.31657142857142856,
"grad_norm": 0.423069030046463,
"kl": 0.01209259033203125,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0005,
"reward": 1.1905713304877281,
"reward_std": 0.7023467533290386,
"rewards/cosine_scaled_reward": 0.1369522949680686,
"rewards/format_reward": 0.9166666865348816,
"step": 277
},
{
"completion_length": 1309.5417098999023,
"epoch": 0.3177142857142857,
"grad_norm": 0.28700801730155945,
"kl": 0.007678985595703125,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0003,
"reward": 1.08343615103513,
"reward_std": 0.47201682440936565,
"rewards/cosine_scaled_reward": 0.07296805875375867,
"rewards/format_reward": 0.9375000074505806,
"step": 278
},
{
"completion_length": 1422.8958740234375,
"epoch": 0.31885714285714284,
"grad_norm": 0.48463860154151917,
"kl": 0.01280975341796875,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0005,
"reward": 0.8561514802277088,
"reward_std": 0.6483948938548565,
"rewards/cosine_scaled_reward": -0.04067427571862936,
"rewards/format_reward": 0.9375000074505806,
"step": 279
},
{
"completion_length": 1659.3333778381348,
"epoch": 0.32,
"grad_norm": 0.289205402135849,
"kl": 0.016315460205078125,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0007,
"reward": 1.3687419444322586,
"reward_std": 0.9038915932178497,
"rewards/cosine_scaled_reward": 0.27812093193642795,
"rewards/format_reward": 0.8125000186264515,
"step": 280
},
{
"completion_length": 2221.645851135254,
"epoch": 0.3211428571428571,
"grad_norm": 0.2718718945980072,
"kl": 0.015529632568359375,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0006,
"reward": 0.3475176487118006,
"reward_std": 0.7998633496463299,
"rewards/cosine_scaled_reward": -0.11790786002529785,
"rewards/format_reward": 0.5833333414047956,
"step": 281
},
{
"completion_length": 1343.0625381469727,
"epoch": 0.3222857142857143,
"grad_norm": 0.28914305567741394,
"kl": 0.010540008544921875,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0004,
"reward": 1.117010936141014,
"reward_std": 0.6344310864806175,
"rewards/cosine_scaled_reward": 0.0897554587572813,
"rewards/format_reward": 0.9375000074505806,
"step": 282
},
{
"completion_length": 1945.6250686645508,
"epoch": 0.32342857142857145,
"grad_norm": 0.1946076899766922,
"kl": 0.0126190185546875,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0005,
"reward": 1.111644510179758,
"reward_std": 0.7122819889336824,
"rewards/cosine_scaled_reward": 0.15998891461640596,
"rewards/format_reward": 0.7916666716337204,
"step": 283
},
{
"completion_length": 1269.0000381469727,
"epoch": 0.32457142857142857,
"grad_norm": 0.26954689621925354,
"kl": 0.006649017333984375,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0003,
"reward": 0.9363188669085503,
"reward_std": 0.4825965305790305,
"rewards/cosine_scaled_reward": -0.011007236316800117,
"rewards/format_reward": 0.9583333432674408,
"step": 284
},
{
"completion_length": 881.7916946411133,
"epoch": 0.32571428571428573,
"grad_norm": 0.3140123784542084,
"kl": 0.01016998291015625,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0004,
"reward": 1.102295933291316,
"reward_std": 0.8087750803679228,
"rewards/cosine_scaled_reward": 0.061564626172184944,
"rewards/format_reward": 0.9791666716337204,
"step": 285
},
{
"completion_length": 1551.270881652832,
"epoch": 0.32685714285714285,
"grad_norm": 0.4618990421295166,
"kl": 0.01885223388671875,
"learning_rate": 5.154764373429315e-07,
"loss": 0.0008,
"reward": 0.8153204774716869,
"reward_std": 0.8457888886332512,
"rewards/cosine_scaled_reward": 0.0014102212153375149,
"rewards/format_reward": 0.8125000111758709,
"step": 286
},
{
"completion_length": 1342.687515258789,
"epoch": 0.328,
"grad_norm": 0.3015505373477936,
"kl": 0.01461029052734375,
"learning_rate": 5.123449705004581e-07,
"loss": 0.0006,
"reward": 0.8659841865301132,
"reward_std": 0.7081486638635397,
"rewards/cosine_scaled_reward": 0.047575398966728244,
"rewards/format_reward": 0.7708333358168602,
"step": 287
},
{
"completion_length": 1528.8542251586914,
"epoch": 0.3291428571428571,
"grad_norm": 0.33641308546066284,
"kl": 0.014873504638671875,
"learning_rate": 5.09215338910999e-07,
"loss": 0.0006,
"reward": 0.765910281566903,
"reward_std": 0.7056640759110451,
"rewards/cosine_scaled_reward": -0.07537820562720299,
"rewards/format_reward": 0.916666679084301,
"step": 288
},
{
"completion_length": 1399.9583587646484,
"epoch": 0.3302857142857143,
"grad_norm": 0.4806149899959564,
"kl": 0.017198562622070312,
"learning_rate": 5.060876951083828e-07,
"loss": 0.0007,
"reward": 0.7978585977107286,
"reward_std": 0.40477965772151947,
"rewards/cosine_scaled_reward": -0.038570704869925976,
"rewards/format_reward": 0.8750000074505806,
"step": 289
},
{
"completion_length": 1012.0625228881836,
"epoch": 0.3314285714285714,
"grad_norm": 0.335376113653183,
"kl": 0.009782791137695312,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0004,
"reward": 1.1996897123754025,
"reward_std": 0.8074947744607925,
"rewards/cosine_scaled_reward": 0.09984485851600766,
"rewards/format_reward": 1.0,
"step": 290
},
{
"completion_length": 1318.7500457763672,
"epoch": 0.3325714285714286,
"grad_norm": 0.24284976720809937,
"kl": 0.014574050903320312,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0006,
"reward": 0.9642973355948925,
"reward_std": 0.7317453175783157,
"rewards/cosine_scaled_reward": 0.034231980331242085,
"rewards/format_reward": 0.895833333954215,
"step": 291
},
{
"completion_length": 1687.9166946411133,
"epoch": 0.33371428571428574,
"grad_norm": 0.28374290466308594,
"kl": 0.017087936401367188,
"learning_rate": 4.967182142620745e-07,
"loss": 0.0007,
"reward": 0.6350435484200716,
"reward_std": 0.6143127456307411,
"rewards/cosine_scaled_reward": -0.1199782375479117,
"rewards/format_reward": 0.8750000111758709,
"step": 292
},
{
"completion_length": 1271.3750381469727,
"epoch": 0.33485714285714285,
"grad_norm": 0.38170552253723145,
"kl": 0.023435592651367188,
"learning_rate": 4.93600044896063e-07,
"loss": 0.0009,
"reward": 0.8408510126173496,
"reward_std": 0.6498479042202234,
"rewards/cosine_scaled_reward": -0.03790782764554024,
"rewards/format_reward": 0.916666679084301,
"step": 293
},
{
"completion_length": 1540.2708740234375,
"epoch": 0.336,
"grad_norm": 0.29528045654296875,
"kl": 0.012912750244140625,
"learning_rate": 4.904846243842949e-07,
"loss": 0.0005,
"reward": 0.8604166656732559,
"reward_std": 0.5733331702649593,
"rewards/cosine_scaled_reward": 0.003124975599348545,
"rewards/format_reward": 0.8541666716337204,
"step": 294
},
{
"completion_length": 1394.4792098999023,
"epoch": 0.33714285714285713,
"grad_norm": 0.3902963399887085,
"kl": 0.014739990234375,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0006,
"reward": 1.0252277310937643,
"reward_std": 0.5337934233248234,
"rewards/cosine_scaled_reward": 0.06469716504216194,
"rewards/format_reward": 0.8958333432674408,
"step": 295
},
{
"completion_length": 1718.8958892822266,
"epoch": 0.3382857142857143,
"grad_norm": 0.609682559967041,
"kl": 0.02099609375,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0008,
"reward": 0.6263405880890787,
"reward_std": 0.6606453433632851,
"rewards/cosine_scaled_reward": -0.10349638154730201,
"rewards/format_reward": 0.833333358168602,
"step": 296
},
{
"completion_length": 1873.7083892822266,
"epoch": 0.3394285714285714,
"grad_norm": 0.29421982169151306,
"kl": 0.01636505126953125,
"learning_rate": 4.811563736721829e-07,
"loss": 0.0007,
"reward": 0.8774769939482212,
"reward_std": 0.8447716347873211,
"rewards/cosine_scaled_reward": 0.04290514811873436,
"rewards/format_reward": 0.791666679084301,
"step": 297
},
{
"completion_length": 1471.958381652832,
"epoch": 0.3405714285714286,
"grad_norm": 0.37182632088661194,
"kl": 0.017574310302734375,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0007,
"reward": 0.7408417947590351,
"reward_std": 0.7095592878758907,
"rewards/cosine_scaled_reward": -0.0462457868270576,
"rewards/format_reward": 0.8333333488553762,
"step": 298
},
{
"completion_length": 1818.8958892822266,
"epoch": 0.3417142857142857,
"grad_norm": 0.4747914671897888,
"kl": 0.03339385986328125,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0013,
"reward": 0.6715311715379357,
"reward_std": 0.638892836868763,
"rewards/cosine_scaled_reward": -0.02881775365676731,
"rewards/format_reward": 0.7291666846722364,
"step": 299
},
{
"completion_length": 1775.7083740234375,
"epoch": 0.34285714285714286,
"grad_norm": 0.8008535504341125,
"kl": 0.03324127197265625,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0013,
"reward": 0.503076022490859,
"reward_std": 0.6447076573967934,
"rewards/cosine_scaled_reward": -0.08179534040391445,
"rewards/format_reward": 0.6666666846722364,
"step": 300
},
{
"completion_length": 1681.7500457763672,
"epoch": 0.344,
"grad_norm": 0.6473574042320251,
"kl": 0.0345306396484375,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0014,
"reward": 0.47819951456040144,
"reward_std": 0.7737650983035564,
"rewards/cosine_scaled_reward": -0.16715026053134352,
"rewards/format_reward": 0.8125000074505806,
"step": 301
},
{
"completion_length": 1704.6250228881836,
"epoch": 0.34514285714285714,
"grad_norm": 0.32545921206474304,
"kl": 0.03167724609375,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0013,
"reward": 0.9550731834024191,
"reward_std": 0.7380726649425924,
"rewards/cosine_scaled_reward": 0.11295327357947826,
"rewards/format_reward": 0.7291666716337204,
"step": 302
},
{
"completion_length": 1360.1458740234375,
"epoch": 0.3462857142857143,
"grad_norm": 0.35140302777290344,
"kl": 0.02618408203125,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.001,
"reward": 0.7957173548638821,
"reward_std": 0.7143728174269199,
"rewards/cosine_scaled_reward": -0.018808012828230858,
"rewards/format_reward": 0.8333333358168602,
"step": 303
},
{
"completion_length": 1244.3750381469727,
"epoch": 0.3474285714285714,
"grad_norm": 0.38197869062423706,
"kl": 0.0144195556640625,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0006,
"reward": 0.6606247052550316,
"reward_std": 0.5714793428778648,
"rewards/cosine_scaled_reward": -0.11760433949530125,
"rewards/format_reward": 0.8958333395421505,
"step": 304
},
{
"completion_length": 1325.6667022705078,
"epoch": 0.3485714285714286,
"grad_norm": 0.3902123272418976,
"kl": 0.016117095947265625,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0006,
"reward": 0.6409200690686703,
"reward_std": 0.6203774958848953,
"rewards/cosine_scaled_reward": -0.1587066389620304,
"rewards/format_reward": 0.9583333432674408,
"step": 305
},
{
"completion_length": 1282.3542175292969,
"epoch": 0.3497142857142857,
"grad_norm": 0.6988428235054016,
"kl": 0.026388168334960938,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0011,
"reward": 1.0727143473923206,
"reward_std": 0.6053726552054286,
"rewards/cosine_scaled_reward": 0.11969051510095596,
"rewards/format_reward": 0.833333333954215,
"step": 306
},
{
"completion_length": 1716.9583702087402,
"epoch": 0.35085714285714287,
"grad_norm": 0.36837834119796753,
"kl": 0.040142059326171875,
"learning_rate": 4.503031760712397e-07,
"loss": 0.0016,
"reward": 0.8561188094317913,
"reward_std": 0.9516715090721846,
"rewards/cosine_scaled_reward": 0.011392734944820404,
"rewards/format_reward": 0.8333333395421505,
"step": 307
},
{
"completion_length": 2273.4584045410156,
"epoch": 0.352,
"grad_norm": 0.5795905590057373,
"kl": 0.03869056701660156,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.0015,
"reward": 0.7666318230330944,
"reward_std": 0.9787959046661854,
"rewards/cosine_scaled_reward": -0.002100769430398941,
"rewards/format_reward": 0.7708333432674408,
"step": 308
},
{
"completion_length": 2063.3542556762695,
"epoch": 0.35314285714285715,
"grad_norm": 0.3654479384422302,
"kl": 0.05999183654785156,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0024,
"reward": 0.6858750740066171,
"reward_std": 0.760811198502779,
"rewards/cosine_scaled_reward": -0.0528958085924387,
"rewards/format_reward": 0.7916666734963655,
"step": 309
},
{
"completion_length": 1340.1250305175781,
"epoch": 0.35428571428571426,
"grad_norm": 0.49283814430236816,
"kl": 0.033687591552734375,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0013,
"reward": 0.7110094074159861,
"reward_std": 0.7351307831704617,
"rewards/cosine_scaled_reward": -0.06116198655217886,
"rewards/format_reward": 0.8333333395421505,
"step": 310
},
{
"completion_length": 1418.187515258789,
"epoch": 0.3554285714285714,
"grad_norm": 0.44990670680999756,
"kl": 0.04097747802734375,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.0016,
"reward": 1.2232839856296778,
"reward_std": 0.9500982649624348,
"rewards/cosine_scaled_reward": 0.15330865047872066,
"rewards/format_reward": 0.9166666716337204,
"step": 311
},
{
"completion_length": 1471.708381652832,
"epoch": 0.3565714285714286,
"grad_norm": 0.6391323804855347,
"kl": 0.04137420654296875,
"learning_rate": 4.350494089288943e-07,
"loss": 0.0017,
"reward": 1.1680905930697918,
"reward_std": 0.37351681664586067,
"rewards/cosine_scaled_reward": 0.16737862676382065,
"rewards/format_reward": 0.8333333395421505,
"step": 312
},
{
"completion_length": 2079.6667137145996,
"epoch": 0.3577142857142857,
"grad_norm": 0.9562482833862305,
"kl": 0.07365036010742188,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0029,
"reward": 0.6807579882442951,
"reward_std": 0.8096479428932071,
"rewards/cosine_scaled_reward": 0.017462321557104588,
"rewards/format_reward": 0.6458333488553762,
"step": 313
},
{
"completion_length": 1646.2500305175781,
"epoch": 0.3588571428571429,
"grad_norm": 0.582167387008667,
"kl": 0.06851959228515625,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0027,
"reward": 0.823919128626585,
"reward_std": 0.5355783794075251,
"rewards/cosine_scaled_reward": 0.0369595680385828,
"rewards/format_reward": 0.7500000111758709,
"step": 314
},
{
"completion_length": 2116.229217529297,
"epoch": 0.36,
"grad_norm": 0.6685879230499268,
"kl": 0.1165008544921875,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0047,
"reward": 0.8504741322249174,
"reward_std": 0.5491553768515587,
"rewards/cosine_scaled_reward": 0.06065371725708246,
"rewards/format_reward": 0.7291666734963655,
"step": 315
},
{
"completion_length": 2242.4375610351562,
"epoch": 0.36114285714285715,
"grad_norm": 0.7561622262001038,
"kl": 0.11125946044921875,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0045,
"reward": -0.009940480813384056,
"reward_std": 0.5052176639437675,
"rewards/cosine_scaled_reward": -0.22372024692595005,
"rewards/format_reward": 0.43750000931322575,
"step": 316
},
{
"completion_length": 1841.1875457763672,
"epoch": 0.36228571428571427,
"grad_norm": 1.0075013637542725,
"kl": 0.0679473876953125,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0027,
"reward": 0.5096511642332189,
"reward_std": 0.8513825722038746,
"rewards/cosine_scaled_reward": -0.057674430310726166,
"rewards/format_reward": 0.625000013038516,
"step": 317
},
{
"completion_length": 1029.7916870117188,
"epoch": 0.36342857142857143,
"grad_norm": 0.38250845670700073,
"kl": 0.039661407470703125,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.0016,
"reward": 0.8797074742615223,
"reward_std": 0.4655795283615589,
"rewards/cosine_scaled_reward": -0.0601462684571743,
"rewards/format_reward": 1.0,
"step": 318
},
{
"completion_length": 1612.9167098999023,
"epoch": 0.36457142857142855,
"grad_norm": 0.7825446724891663,
"kl": 0.05585479736328125,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0022,
"reward": 0.5894158203154802,
"reward_std": 0.7856029607355595,
"rewards/cosine_scaled_reward": -0.13237543310970068,
"rewards/format_reward": 0.854166679084301,
"step": 319
},
{
"completion_length": 1328.7916831970215,
"epoch": 0.3657142857142857,
"grad_norm": 0.7866749167442322,
"kl": 0.07510757446289062,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.003,
"reward": 1.0613620709627867,
"reward_std": 0.6303130388259888,
"rewards/cosine_scaled_reward": 0.07234767638146877,
"rewards/format_reward": 0.916666679084301,
"step": 320
},
{
"completion_length": 1117.3125457763672,
"epoch": 0.3668571428571429,
"grad_norm": 0.5064549446105957,
"kl": 0.04396820068359375,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0018,
"reward": 1.1430763825774193,
"reward_std": 0.6700709462165833,
"rewards/cosine_scaled_reward": 0.1340381633490324,
"rewards/format_reward": 0.8750000111758709,
"step": 321
},
{
"completion_length": 1828.0417175292969,
"epoch": 0.368,
"grad_norm": 0.7172619104385376,
"kl": 0.133880615234375,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0054,
"reward": 0.645214811898768,
"reward_std": 1.0024632290005684,
"rewards/cosine_scaled_reward": -0.04197592940181494,
"rewards/format_reward": 0.7291666753590107,
"step": 322
},
{
"completion_length": 1734.2916946411133,
"epoch": 0.36914285714285716,
"grad_norm": 0.7203247547149658,
"kl": 0.1142730712890625,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0046,
"reward": 0.6648443900048733,
"reward_std": 0.6985526494681835,
"rewards/cosine_scaled_reward": 0.009505534544587135,
"rewards/format_reward": 0.6458333414047956,
"step": 323
},
{
"completion_length": 1589.1667022705078,
"epoch": 0.3702857142857143,
"grad_norm": 0.8060963153839111,
"kl": 0.10352706909179688,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0041,
"reward": 0.27571332873776555,
"reward_std": 0.6113120466470718,
"rewards/cosine_scaled_reward": -0.20589334331452847,
"rewards/format_reward": 0.6875000167638063,
"step": 324
},
{
"completion_length": 1642.0625686645508,
"epoch": 0.37142857142857144,
"grad_norm": 0.603640079498291,
"kl": 0.059417724609375,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0024,
"reward": 0.9450785778462887,
"reward_std": 0.8715181350708008,
"rewards/cosine_scaled_reward": 0.045455962885171175,
"rewards/format_reward": 0.8541666865348816,
"step": 325
},
{
"completion_length": 1383.208351135254,
"epoch": 0.37257142857142855,
"grad_norm": 0.8735449314117432,
"kl": 0.08353424072265625,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0033,
"reward": 1.0190029181540012,
"reward_std": 0.7654371298849583,
"rewards/cosine_scaled_reward": 0.07200142601504922,
"rewards/format_reward": 0.8750000111758709,
"step": 326
},
{
"completion_length": 1609.208366394043,
"epoch": 0.3737142857142857,
"grad_norm": 0.40844622254371643,
"kl": 0.056499481201171875,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0023,
"reward": 1.1053630914539099,
"reward_std": 0.6595817804336548,
"rewards/cosine_scaled_reward": 0.1360148610547185,
"rewards/format_reward": 0.8333333358168602,
"step": 327
},
{
"completion_length": 1788.4375534057617,
"epoch": 0.37485714285714283,
"grad_norm": 0.6934894919395447,
"kl": 0.11262893676757812,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0045,
"reward": 0.47454674541950226,
"reward_std": 0.7443269528448582,
"rewards/cosine_scaled_reward": -0.12730997893959284,
"rewards/format_reward": 0.7291666753590107,
"step": 328
},
{
"completion_length": 1103.7083587646484,
"epoch": 0.376,
"grad_norm": 0.9421954154968262,
"kl": 0.05080413818359375,
"learning_rate": 3.843439512918949e-07,
"loss": 0.002,
"reward": 1.2518079336732626,
"reward_std": 0.5773205179721117,
"rewards/cosine_scaled_reward": 0.15715394588187337,
"rewards/format_reward": 0.9375000149011612,
"step": 329
},
{
"completion_length": 1171.9791946411133,
"epoch": 0.37714285714285717,
"grad_norm": 0.7696932554244995,
"kl": 0.10460662841796875,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0042,
"reward": 0.5345208197832108,
"reward_std": 0.7594005465507507,
"rewards/cosine_scaled_reward": -0.15982293151319027,
"rewards/format_reward": 0.8541666716337204,
"step": 330
},
{
"completion_length": 1748.4167098999023,
"epoch": 0.3782857142857143,
"grad_norm": 1.497854232788086,
"kl": 0.12451934814453125,
"learning_rate": 3.785183306423767e-07,
"loss": 0.005,
"reward": 0.5829995409585536,
"reward_std": 0.8618629835546017,
"rewards/cosine_scaled_reward": -0.0626668983604759,
"rewards/format_reward": 0.7083333432674408,
"step": 331
},
{
"completion_length": 1661.1667022705078,
"epoch": 0.37942857142857145,
"grad_norm": 0.7761502861976624,
"kl": 0.08023452758789062,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0032,
"reward": 0.5321935811080039,
"reward_std": 0.5660249888896942,
"rewards/cosine_scaled_reward": -0.11931989248842001,
"rewards/format_reward": 0.7708333432674408,
"step": 332
},
{
"completion_length": 1305.7500228881836,
"epoch": 0.38057142857142856,
"grad_norm": 0.7130544185638428,
"kl": 0.0748291015625,
"learning_rate": 3.72726140684072e-07,
"loss": 0.003,
"reward": 0.7820851001888514,
"reward_std": 0.671183954924345,
"rewards/cosine_scaled_reward": -0.06729080062359571,
"rewards/format_reward": 0.9166666865348816,
"step": 333
},
{
"completion_length": 2046.4375610351562,
"epoch": 0.38171428571428573,
"grad_norm": 0.9568099975585938,
"kl": 0.18133544921875,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0072,
"reward": 0.355112224817276,
"reward_std": 0.6977374590933323,
"rewards/cosine_scaled_reward": -0.1870272308588028,
"rewards/format_reward": 0.7291666828095913,
"step": 334
},
{
"completion_length": 1462.2292022705078,
"epoch": 0.38285714285714284,
"grad_norm": 1.2614269256591797,
"kl": 0.07938385009765625,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0032,
"reward": 0.9349308745004237,
"reward_std": 0.6556266993284225,
"rewards/cosine_scaled_reward": 0.04038208909332752,
"rewards/format_reward": 0.854166679084301,
"step": 335
},
{
"completion_length": 1641.4375534057617,
"epoch": 0.384,
"grad_norm": 0.8289753198623657,
"kl": 0.11359786987304688,
"learning_rate": 3.641030065789562e-07,
"loss": 0.0045,
"reward": 0.8081842958927155,
"reward_std": 0.931253258138895,
"rewards/cosine_scaled_reward": 0.08117546886205673,
"rewards/format_reward": 0.6458333469927311,
"step": 336
},
{
"completion_length": 1747.2500534057617,
"epoch": 0.3851428571428571,
"grad_norm": 1.1650760173797607,
"kl": 0.12990570068359375,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0052,
"reward": 0.7834707293659449,
"reward_std": 0.8634283617138863,
"rewards/cosine_scaled_reward": -0.04576464742422104,
"rewards/format_reward": 0.8750000074505806,
"step": 337
},
{
"completion_length": 1280.7708740234375,
"epoch": 0.3862857142857143,
"grad_norm": 0.5302877426147461,
"kl": 0.062366485595703125,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0025,
"reward": 1.1318869441747665,
"reward_std": 0.7834238409996033,
"rewards/cosine_scaled_reward": 0.09719344391487539,
"rewards/format_reward": 0.9375,
"step": 338
},
{
"completion_length": 1575.9375381469727,
"epoch": 0.38742857142857146,
"grad_norm": 3.0021309852600098,
"kl": 0.14781951904296875,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0059,
"reward": 0.435087047284469,
"reward_std": 0.43532489985227585,
"rewards/cosine_scaled_reward": -0.13662315905094147,
"rewards/format_reward": 0.7083333563059568,
"step": 339
},
{
"completion_length": 1496.6250534057617,
"epoch": 0.38857142857142857,
"grad_norm": 1.7084987163543701,
"kl": 0.12076950073242188,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0048,
"reward": 0.7966768400510773,
"reward_std": 0.5643632663413882,
"rewards/cosine_scaled_reward": -0.007911591790616512,
"rewards/format_reward": 0.8125000111758709,
"step": 340
},
{
"completion_length": 1304.9792098999023,
"epoch": 0.38971428571428574,
"grad_norm": 5.948554039001465,
"kl": 0.22769927978515625,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0091,
"reward": 1.1839299397543073,
"reward_std": 0.643942728638649,
"rewards/cosine_scaled_reward": 0.175298273563385,
"rewards/format_reward": 0.8333333507180214,
"step": 341
},
{
"completion_length": 1663.6875610351562,
"epoch": 0.39085714285714285,
"grad_norm": 1.0961518287658691,
"kl": 0.18028640747070312,
"learning_rate": 3.471051066897562e-07,
"loss": 0.0072,
"reward": 0.8878756612539291,
"reward_std": 0.9449864365160465,
"rewards/cosine_scaled_reward": 0.06893783865962178,
"rewards/format_reward": 0.750000013038516,
"step": 342
},
{
"completion_length": 1583.0000686645508,
"epoch": 0.392,
"grad_norm": 1.2760441303253174,
"kl": 0.15594482421875,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.0062,
"reward": 0.8547459337860346,
"reward_std": 0.6034443583339453,
"rewards/cosine_scaled_reward": 0.0002896404330385849,
"rewards/format_reward": 0.8541666939854622,
"step": 343
},
{
"completion_length": 1706.4375457763672,
"epoch": 0.3931428571428571,
"grad_norm": 1.06587553024292,
"kl": 0.281158447265625,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.0113,
"reward": 1.1242186180315912,
"reward_std": 0.6546860057860613,
"rewards/cosine_scaled_reward": 0.17669263062998652,
"rewards/format_reward": 0.7708333432674408,
"step": 344
},
{
"completion_length": 1465.4583587646484,
"epoch": 0.3942857142857143,
"grad_norm": 1.2281720638275146,
"kl": 0.1438140869140625,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0058,
"reward": 0.9402041547000408,
"reward_std": 0.6865711808204651,
"rewards/cosine_scaled_reward": 0.011768726049922407,
"rewards/format_reward": 0.916666679084301,
"step": 345
},
{
"completion_length": 1591.0833740234375,
"epoch": 0.3954285714285714,
"grad_norm": 0.8186521530151367,
"kl": 0.10266876220703125,
"learning_rate": 3.359691059183761e-07,
"loss": 0.0041,
"reward": 0.6260932851582766,
"reward_std": 0.5609011054039001,
"rewards/cosine_scaled_reward": -0.15570336702512577,
"rewards/format_reward": 0.9375000074505806,
"step": 346
},
{
"completion_length": 1574.2083892822266,
"epoch": 0.3965714285714286,
"grad_norm": 0.7894213795661926,
"kl": 0.1241302490234375,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.005,
"reward": 0.5836004763841629,
"reward_std": 0.6906316690146923,
"rewards/cosine_scaled_reward": -0.15611644479213282,
"rewards/format_reward": 0.8958333432674408,
"step": 347
},
{
"completion_length": 1624.1667022705078,
"epoch": 0.3977142857142857,
"grad_norm": 1.4086353778839111,
"kl": 0.22664642333984375,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0091,
"reward": 0.7276029635686427,
"reward_std": 0.6486309953033924,
"rewards/cosine_scaled_reward": -0.04244852438569069,
"rewards/format_reward": 0.8125000074505806,
"step": 348
},
{
"completion_length": 1256.1875228881836,
"epoch": 0.39885714285714285,
"grad_norm": 1.4020756483078003,
"kl": 0.1046600341796875,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0042,
"reward": 0.5811667609959841,
"reward_std": 0.7433434501290321,
"rewards/cosine_scaled_reward": -0.09483329905197024,
"rewards/format_reward": 0.7708333469927311,
"step": 349
},
{
"completion_length": 1057.7291946411133,
"epoch": 0.4,
"grad_norm": 0.9083346128463745,
"kl": 0.0474395751953125,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0019,
"reward": 0.8259809445589781,
"reward_std": 0.7229878939688206,
"rewards/cosine_scaled_reward": -0.06617620773613453,
"rewards/format_reward": 0.9583333358168602,
"step": 350
},
{
"completion_length": 1219.6458740234375,
"epoch": 0.40114285714285713,
"grad_norm": 1.6471617221832275,
"kl": 0.12386322021484375,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0049,
"reward": 0.7785749807953835,
"reward_std": 0.8159589394927025,
"rewards/cosine_scaled_reward": -0.0377958663739264,
"rewards/format_reward": 0.854166679084301,
"step": 351
},
{
"completion_length": 1422.1875534057617,
"epoch": 0.4022857142857143,
"grad_norm": 1.3077303171157837,
"kl": 0.2323760986328125,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0093,
"reward": 0.9625241123139858,
"reward_std": 0.8021371066570282,
"rewards/cosine_scaled_reward": 0.06459538266062737,
"rewards/format_reward": 0.8333333469927311,
"step": 352
},
{
"completion_length": 1283.333351135254,
"epoch": 0.4034285714285714,
"grad_norm": 1.1020612716674805,
"kl": 0.125701904296875,
"learning_rate": 3.168878457820915e-07,
"loss": 0.005,
"reward": 1.0344783924520016,
"reward_std": 0.8279353678226471,
"rewards/cosine_scaled_reward": 0.05890584830194712,
"rewards/format_reward": 0.916666679084301,
"step": 353
},
{
"completion_length": 1127.6667022705078,
"epoch": 0.4045714285714286,
"grad_norm": 1.216342568397522,
"kl": 0.12079620361328125,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0048,
"reward": 1.099512368440628,
"reward_std": 0.5979024097323418,
"rewards/cosine_scaled_reward": 0.0914228311739862,
"rewards/format_reward": 0.916666679084301,
"step": 354
},
{
"completion_length": 1001.6458549499512,
"epoch": 0.4057142857142857,
"grad_norm": 0.9325054287910461,
"kl": 0.059314727783203125,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0024,
"reward": 1.2166800498962402,
"reward_std": 0.675847515463829,
"rewards/cosine_scaled_reward": 0.12917334213852882,
"rewards/format_reward": 0.9583333432674408,
"step": 355
},
{
"completion_length": 1459.2292251586914,
"epoch": 0.40685714285714286,
"grad_norm": 2.4190444946289062,
"kl": 0.2304840087890625,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.0092,
"reward": 0.4463031552731991,
"reward_std": 0.6662746425718069,
"rewards/cosine_scaled_reward": -0.17268177028745413,
"rewards/format_reward": 0.7916666865348816,
"step": 356
},
{
"completion_length": 1725.6875457763672,
"epoch": 0.408,
"grad_norm": 2.1936678886413574,
"kl": 0.295989990234375,
"learning_rate": 3.062313053727671e-07,
"loss": 0.0118,
"reward": 0.3668034356087446,
"reward_std": 0.5282239988446236,
"rewards/cosine_scaled_reward": -0.26451496221125126,
"rewards/format_reward": 0.8958333507180214,
"step": 357
},
{
"completion_length": 1472.312515258789,
"epoch": 0.40914285714285714,
"grad_norm": 1.637854814529419,
"kl": 0.2021331787109375,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.0081,
"reward": 1.2237652689218521,
"reward_std": 0.6800592541694641,
"rewards/cosine_scaled_reward": 0.1535492818802595,
"rewards/format_reward": 0.9166666716337204,
"step": 358
},
{
"completion_length": 929.5416793823242,
"epoch": 0.4102857142857143,
"grad_norm": 1.4736838340759277,
"kl": 0.1471099853515625,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0059,
"reward": 0.8410445712506771,
"reward_std": 0.5730547439306974,
"rewards/cosine_scaled_reward": -0.06906107859686017,
"rewards/format_reward": 0.9791666716337204,
"step": 359
},
{
"completion_length": 1209.8125457763672,
"epoch": 0.4114285714285714,
"grad_norm": 2.1532845497131348,
"kl": 0.35944366455078125,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0144,
"reward": 0.769681841135025,
"reward_std": 0.8142721727490425,
"rewards/cosine_scaled_reward": -0.06307575106620789,
"rewards/format_reward": 0.895833358168602,
"step": 360
},
{
"completion_length": 1173.6041946411133,
"epoch": 0.4125714285714286,
"grad_norm": 2.002321481704712,
"kl": 0.2322998046875,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0093,
"reward": 0.8964769653975964,
"reward_std": 0.7383791692554951,
"rewards/cosine_scaled_reward": -0.041344886645674706,
"rewards/format_reward": 0.9791666716337204,
"step": 361
},
{
"completion_length": 1014.0625305175781,
"epoch": 0.4137142857142857,
"grad_norm": 1.2043986320495605,
"kl": 0.2470703125,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0099,
"reward": 1.00144612044096,
"reward_std": 0.47917743772268295,
"rewards/cosine_scaled_reward": 0.03197303228080273,
"rewards/format_reward": 0.9375,
"step": 362
},
{
"completion_length": 896.8541870117188,
"epoch": 0.41485714285714287,
"grad_norm": 1.703730583190918,
"kl": 0.1836700439453125,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.0074,
"reward": 1.373911328613758,
"reward_std": 0.7581704575568438,
"rewards/cosine_scaled_reward": 0.21820560842752457,
"rewards/format_reward": 0.9375000074505806,
"step": 363
},
{
"completion_length": 1474.4167251586914,
"epoch": 0.416,
"grad_norm": 2.327479839324951,
"kl": 0.5921630859375,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0237,
"reward": 0.48526396974921227,
"reward_std": 0.5558421425521374,
"rewards/cosine_scaled_reward": -0.20528469607234,
"rewards/format_reward": 0.895833358168602,
"step": 364
},
{
"completion_length": 1866.6875610351562,
"epoch": 0.41714285714285715,
"grad_norm": 2.2852516174316406,
"kl": 0.80853271484375,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0324,
"reward": 0.7523469850420952,
"reward_std": 0.6609731055796146,
"rewards/cosine_scaled_reward": 0.03242346830666065,
"rewards/format_reward": 0.6875000111758709,
"step": 365
},
{
"completion_length": 1085.2083740234375,
"epoch": 0.41828571428571426,
"grad_norm": 2.795609951019287,
"kl": 0.29659271240234375,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0119,
"reward": 0.9424293376505375,
"reward_std": 0.763162437826395,
"rewards/cosine_scaled_reward": 0.023297980427742004,
"rewards/format_reward": 0.8958333507180214,
"step": 366
},
{
"completion_length": 1567.9167175292969,
"epoch": 0.41942857142857143,
"grad_norm": 1.7122759819030762,
"kl": 0.5317840576171875,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0213,
"reward": 0.8573304824531078,
"reward_std": 0.7073214678093791,
"rewards/cosine_scaled_reward": 0.01199856773018837,
"rewards/format_reward": 0.8333333507180214,
"step": 367
},
{
"completion_length": 1578.2083549499512,
"epoch": 0.4205714285714286,
"grad_norm": 4.118416786193848,
"kl": 0.380859375,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0152,
"reward": 0.5520367622375488,
"reward_std": 0.651011124253273,
"rewards/cosine_scaled_reward": -0.13023164262995124,
"rewards/format_reward": 0.8125000149011612,
"step": 368
},
{
"completion_length": 1570.8333702087402,
"epoch": 0.4217142857142857,
"grad_norm": 3.169609785079956,
"kl": 0.53338623046875,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0213,
"reward": 0.7706317435950041,
"reward_std": 0.9069001339375973,
"rewards/cosine_scaled_reward": -0.020934134838171303,
"rewards/format_reward": 0.8125000223517418,
"step": 369
},
{
"completion_length": 1200.7500228881836,
"epoch": 0.4228571428571429,
"grad_norm": 1.968066930770874,
"kl": 0.415557861328125,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0166,
"reward": 0.761927604675293,
"reward_std": 0.5050818659365177,
"rewards/cosine_scaled_reward": -0.05653620883822441,
"rewards/format_reward": 0.8750000223517418,
"step": 370
},
{
"completion_length": 638.2708549499512,
"epoch": 0.424,
"grad_norm": 1.2254022359848022,
"kl": 0.0458984375,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0018,
"reward": 1.2654454093426466,
"reward_std": 0.4487613644450903,
"rewards/cosine_scaled_reward": 0.13272269815206528,
"rewards/format_reward": 1.0,
"step": 371
},
{
"completion_length": 1610.770896911621,
"epoch": 0.42514285714285716,
"grad_norm": 1.422675371170044,
"kl": 0.4040679931640625,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.0162,
"reward": 0.9225452449172735,
"reward_std": 0.8723856098949909,
"rewards/cosine_scaled_reward": 0.03418927453458309,
"rewards/format_reward": 0.8541666865348816,
"step": 372
},
{
"completion_length": 916.0416870117188,
"epoch": 0.42628571428571427,
"grad_norm": 4.1732659339904785,
"kl": 0.23648834228515625,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0095,
"reward": 0.753505942877382,
"reward_std": 0.5106633007526398,
"rewards/cosine_scaled_reward": -0.1024136976338923,
"rewards/format_reward": 0.9583333432674408,
"step": 373
},
{
"completion_length": 1235.333396911621,
"epoch": 0.42742857142857144,
"grad_norm": 0.9994511008262634,
"kl": 0.1927032470703125,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0077,
"reward": 1.0162720493972301,
"reward_std": 0.4486389271914959,
"rewards/cosine_scaled_reward": 0.03938601026311517,
"rewards/format_reward": 0.9375000149011612,
"step": 374
},
{
"completion_length": 1341.0625305175781,
"epoch": 0.42857142857142855,
"grad_norm": 2.8108980655670166,
"kl": 0.382843017578125,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0153,
"reward": 0.9311719611287117,
"reward_std": 0.6638787761330605,
"rewards/cosine_scaled_reward": 0.038502639159560204,
"rewards/format_reward": 0.8541666865348816,
"step": 375
},
{
"completion_length": 1282.5000534057617,
"epoch": 0.4297142857142857,
"grad_norm": 1.4323093891143799,
"kl": 0.21431732177734375,
"learning_rate": 2.583460445215911e-07,
"loss": 0.0086,
"reward": 0.8344197571277618,
"reward_std": 0.7551028467714787,
"rewards/cosine_scaled_reward": -0.04112347261980176,
"rewards/format_reward": 0.916666679084301,
"step": 376
},
{
"completion_length": 1487.5625495910645,
"epoch": 0.4308571428571429,
"grad_norm": 2.224385976791382,
"kl": 0.4652557373046875,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0186,
"reward": 0.49483706802129745,
"reward_std": 0.7973055504262447,
"rewards/cosine_scaled_reward": -0.13799815066158772,
"rewards/format_reward": 0.7708333432674408,
"step": 377
},
{
"completion_length": 1158.9791851043701,
"epoch": 0.432,
"grad_norm": 1.5131416320800781,
"kl": 0.283233642578125,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.0113,
"reward": 1.1355108618736267,
"reward_std": 0.7891764938831329,
"rewards/cosine_scaled_reward": 0.1198387467302382,
"rewards/format_reward": 0.8958333432674408,
"step": 378
},
{
"completion_length": 1687.2292213439941,
"epoch": 0.43314285714285716,
"grad_norm": 2.515700340270996,
"kl": 0.5447998046875,
"learning_rate": 2.512332043064913e-07,
"loss": 0.0218,
"reward": 0.488259082660079,
"reward_std": 0.6213226802647114,
"rewards/cosine_scaled_reward": -0.16212046705186367,
"rewards/format_reward": 0.8125000186264515,
"step": 379
},
{
"completion_length": 1145.1250343322754,
"epoch": 0.4342857142857143,
"grad_norm": 2.491877555847168,
"kl": 0.34210205078125,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0137,
"reward": 0.6510128956288099,
"reward_std": 0.764140423387289,
"rewards/cosine_scaled_reward": -0.09116021171212196,
"rewards/format_reward": 0.8333333432674408,
"step": 380
},
{
"completion_length": 1576.9791946411133,
"epoch": 0.43542857142857144,
"grad_norm": 2.5349020957946777,
"kl": 0.4387664794921875,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0176,
"reward": 0.45536880288273096,
"reward_std": 0.6865225993096828,
"rewards/cosine_scaled_reward": -0.17856561671942472,
"rewards/format_reward": 0.8125000223517418,
"step": 381
},
{
"completion_length": 1223.1667022705078,
"epoch": 0.43657142857142855,
"grad_norm": 3.171522378921509,
"kl": 0.3052978515625,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.0122,
"reward": 0.47218240704387426,
"reward_std": 0.6197453737258911,
"rewards/cosine_scaled_reward": -0.20140881277620792,
"rewards/format_reward": 0.8750000223517418,
"step": 382
},
{
"completion_length": 1093.8750343322754,
"epoch": 0.4377142857142857,
"grad_norm": 1.554950475692749,
"kl": 0.307342529296875,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0123,
"reward": 1.3121628165245056,
"reward_std": 0.7447218038141727,
"rewards/cosine_scaled_reward": 0.1873313980177045,
"rewards/format_reward": 0.9375000074505806,
"step": 383
},
{
"completion_length": 1102.0833778381348,
"epoch": 0.43885714285714283,
"grad_norm": 3.9803340435028076,
"kl": 0.32332611083984375,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0129,
"reward": 1.2209933521226048,
"reward_std": 0.9811634235084057,
"rewards/cosine_scaled_reward": 0.1834133416414261,
"rewards/format_reward": 0.854166679084301,
"step": 384
},
{
"completion_length": 1301.0625381469727,
"epoch": 0.44,
"grad_norm": 1.6203845739364624,
"kl": 0.3782196044921875,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0151,
"reward": 0.6873492915183306,
"reward_std": 0.7231754045933485,
"rewards/cosine_scaled_reward": -0.09382536727935076,
"rewards/format_reward": 0.8750000111758709,
"step": 385
},
{
"completion_length": 1025.8125305175781,
"epoch": 0.44114285714285717,
"grad_norm": 2.561079978942871,
"kl": 0.351043701171875,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.014,
"reward": 1.1378429010510445,
"reward_std": 0.8463675826787949,
"rewards/cosine_scaled_reward": 0.14183809887617826,
"rewards/format_reward": 0.8541666865348816,
"step": 386
},
{
"completion_length": 1460.0417175292969,
"epoch": 0.4422857142857143,
"grad_norm": 3.437443494796753,
"kl": 0.76416015625,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.0306,
"reward": 0.6168055031448603,
"reward_std": 0.6576487310230732,
"rewards/cosine_scaled_reward": -0.1290972474962473,
"rewards/format_reward": 0.8750000223517418,
"step": 387
},
{
"completion_length": 1255.8958740234375,
"epoch": 0.44342857142857145,
"grad_norm": 3.211789846420288,
"kl": 0.52313232421875,
"learning_rate": 2.306931685585657e-07,
"loss": 0.0209,
"reward": 0.8223461015149951,
"reward_std": 0.6507496982812881,
"rewards/cosine_scaled_reward": -0.015910295769572258,
"rewards/format_reward": 0.8541666939854622,
"step": 388
},
{
"completion_length": 1253.7917022705078,
"epoch": 0.44457142857142856,
"grad_norm": 5.535946846008301,
"kl": 0.503692626953125,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0201,
"reward": 0.7248962745070457,
"reward_std": 0.5912930071353912,
"rewards/cosine_scaled_reward": -0.07505187718197703,
"rewards/format_reward": 0.8750000149011612,
"step": 389
},
{
"completion_length": 1278.0417022705078,
"epoch": 0.44571428571428573,
"grad_norm": 3.257326364517212,
"kl": 0.6408615112304688,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0256,
"reward": 0.6384771540760994,
"reward_std": 0.7342811785638332,
"rewards/cosine_scaled_reward": -0.11826143972575665,
"rewards/format_reward": 0.8750000074505806,
"step": 390
},
{
"completion_length": 1312.6042022705078,
"epoch": 0.44685714285714284,
"grad_norm": 5.830494403839111,
"kl": 1.293975830078125,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.0519,
"reward": 0.8241331037133932,
"reward_std": 0.9150289222598076,
"rewards/cosine_scaled_reward": 0.016233190894126892,
"rewards/format_reward": 0.791666679084301,
"step": 391
},
{
"completion_length": 1468.937557220459,
"epoch": 0.448,
"grad_norm": 1.9504032135009766,
"kl": 0.7166290283203125,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0287,
"reward": 0.5725310668349266,
"reward_std": 0.7125820200890303,
"rewards/cosine_scaled_reward": -0.1512344698421657,
"rewards/format_reward": 0.8750000149011612,
"step": 392
},
{
"completion_length": 1273.1250267028809,
"epoch": 0.4491428571428571,
"grad_norm": 6.959710597991943,
"kl": 1.0224609375,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.0409,
"reward": 0.8387455176562071,
"reward_std": 0.8648201785981655,
"rewards/cosine_scaled_reward": 0.013122743383519264,
"rewards/format_reward": 0.8125000204890966,
"step": 393
},
{
"completion_length": 1102.4583854675293,
"epoch": 0.4502857142857143,
"grad_norm": 1.945556402206421,
"kl": 0.54296875,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0217,
"reward": 0.32192777935415506,
"reward_std": 0.5174819845706224,
"rewards/cosine_scaled_reward": -0.24528613314032555,
"rewards/format_reward": 0.8125000149011612,
"step": 394
},
{
"completion_length": 937.8958549499512,
"epoch": 0.4514285714285714,
"grad_norm": 5.496992588043213,
"kl": 0.67437744140625,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.027,
"reward": 0.9939035659190267,
"reward_std": 0.7032170053571463,
"rewards/cosine_scaled_reward": 0.049035104806534946,
"rewards/format_reward": 0.8958333507180214,
"step": 395
},
{
"completion_length": 987.7292022705078,
"epoch": 0.45257142857142857,
"grad_norm": 4.864074230194092,
"kl": 0.49527740478515625,
"learning_rate": 2.134908592756607e-07,
"loss": 0.0198,
"reward": 0.7741489135660231,
"reward_std": 0.5997458174824715,
"rewards/cosine_scaled_reward": -0.06084221974015236,
"rewards/format_reward": 0.8958333432674408,
"step": 396
},
{
"completion_length": 1146.437515258789,
"epoch": 0.45371428571428574,
"grad_norm": 4.235805511474609,
"kl": 0.8509521484375,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.034,
"reward": 0.8896873220801353,
"reward_std": 0.6099906638264656,
"rewards/cosine_scaled_reward": -0.04473969340324402,
"rewards/format_reward": 0.9791666716337204,
"step": 397
},
{
"completion_length": 1264.4167022705078,
"epoch": 0.45485714285714285,
"grad_norm": 2.6962742805480957,
"kl": 0.738128662109375,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0296,
"reward": 0.5152244120836258,
"reward_std": 0.8842574842274189,
"rewards/cosine_scaled_reward": -0.14863780653104186,
"rewards/format_reward": 0.8125000186264515,
"step": 398
},
{
"completion_length": 1035.7292022705078,
"epoch": 0.456,
"grad_norm": 2.1586830615997314,
"kl": 0.4624786376953125,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0185,
"reward": 0.9472854379564524,
"reward_std": 0.6391190886497498,
"rewards/cosine_scaled_reward": 0.025726054795086384,
"rewards/format_reward": 0.8958333358168602,
"step": 399
},
{
"completion_length": 913.2500343322754,
"epoch": 0.45714285714285713,
"grad_norm": 3.6302056312561035,
"kl": 0.26483154296875,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0106,
"reward": 1.4475815817713737,
"reward_std": 0.7946172095835209,
"rewards/cosine_scaled_reward": 0.26545744470786303,
"rewards/format_reward": 0.916666679084301,
"step": 400
},
{
"completion_length": 1466.1667175292969,
"epoch": 0.4582857142857143,
"grad_norm": 3.406836986541748,
"kl": 0.783599853515625,
"learning_rate": 2.032690407508949e-07,
"loss": 0.0314,
"reward": 0.6450461961794645,
"reward_std": 0.541100338101387,
"rewards/cosine_scaled_reward": -0.1149769127368927,
"rewards/format_reward": 0.8750000149011612,
"step": 401
},
{
"completion_length": 1107.4375228881836,
"epoch": 0.4594285714285714,
"grad_norm": 5.733443737030029,
"kl": 0.68023681640625,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.0272,
"reward": 0.7256474066525698,
"reward_std": 0.6739897206425667,
"rewards/cosine_scaled_reward": -0.07467629760503769,
"rewards/format_reward": 0.8750000223517418,
"step": 402
},
{
"completion_length": 961.5000381469727,
"epoch": 0.4605714285714286,
"grad_norm": 2.457207441329956,
"kl": 0.1911468505859375,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.0076,
"reward": 1.1732404585927725,
"reward_std": 0.589294470846653,
"rewards/cosine_scaled_reward": 0.08662020694464445,
"rewards/format_reward": 1.0,
"step": 403
},
{
"completion_length": 1151.1667175292969,
"epoch": 0.4617142857142857,
"grad_norm": 4.529026508331299,
"kl": 0.5990447998046875,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.024,
"reward": 0.7381823370233178,
"reward_std": 0.6230232007801533,
"rewards/cosine_scaled_reward": -0.04757549986243248,
"rewards/format_reward": 0.8333333507180214,
"step": 404
},
{
"completion_length": 1068.1667251586914,
"epoch": 0.46285714285714286,
"grad_norm": 7.2086262702941895,
"kl": 0.4109954833984375,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0164,
"reward": 1.3128372617065907,
"reward_std": 0.8704881519079208,
"rewards/cosine_scaled_reward": 0.20850196853280067,
"rewards/format_reward": 0.895833358168602,
"step": 405
},
{
"completion_length": 1452.145851135254,
"epoch": 0.464,
"grad_norm": 6.928232192993164,
"kl": 1.5478515625,
"learning_rate": 1.934696604901642e-07,
"loss": 0.062,
"reward": 0.8689113333821297,
"reward_std": 1.0427627116441727,
"rewards/cosine_scaled_reward": 0.017788991099223495,
"rewards/format_reward": 0.8333333432674408,
"step": 406
},
{
"completion_length": 1271.6041984558105,
"epoch": 0.46514285714285714,
"grad_norm": 2.267540216445923,
"kl": 0.8037109375,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0321,
"reward": 0.8099214821122587,
"reward_std": 0.6622566767036915,
"rewards/cosine_scaled_reward": -0.0012892577797174454,
"rewards/format_reward": 0.8125000111758709,
"step": 407
},
{
"completion_length": 1401.8333625793457,
"epoch": 0.4662857142857143,
"grad_norm": 2.2123162746429443,
"kl": 0.6725006103515625,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0269,
"reward": 1.0615033656358719,
"reward_std": 0.8494626618921757,
"rewards/cosine_scaled_reward": 0.10366834327578545,
"rewards/format_reward": 0.8541666865348816,
"step": 408
},
{
"completion_length": 1703.6875610351562,
"epoch": 0.4674285714285714,
"grad_norm": 3.3070085048675537,
"kl": 1.30078125,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.052,
"reward": 0.682604375295341,
"reward_std": 0.7020121356472373,
"rewards/cosine_scaled_reward": -0.0649478193372488,
"rewards/format_reward": 0.8125000149011612,
"step": 409
},
{
"completion_length": 1503.2500343322754,
"epoch": 0.4685714285714286,
"grad_norm": 7.976802349090576,
"kl": 1.97076416015625,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0788,
"reward": 0.46726590394973755,
"reward_std": 0.8286273218691349,
"rewards/cosine_scaled_reward": -0.07886704918928444,
"rewards/format_reward": 0.6250000260770321,
"step": 410
},
{
"completion_length": 1640.1458778381348,
"epoch": 0.4697142857142857,
"grad_norm": 3.6351277828216553,
"kl": 0.9295196533203125,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0372,
"reward": 0.6482336856424809,
"reward_std": 0.7643021754920483,
"rewards/cosine_scaled_reward": -0.0821331706829369,
"rewards/format_reward": 0.8125000186264515,
"step": 411
},
{
"completion_length": 1004.1458587646484,
"epoch": 0.47085714285714286,
"grad_norm": 2.904054641723633,
"kl": 0.413330078125,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0165,
"reward": 0.9712292104959488,
"reward_std": 0.9014318101108074,
"rewards/cosine_scaled_reward": 0.048114595003426075,
"rewards/format_reward": 0.8750000149011612,
"step": 412
},
{
"completion_length": 1190.9583740234375,
"epoch": 0.472,
"grad_norm": 2.611903667449951,
"kl": 0.674652099609375,
"learning_rate": 1.804828558898332e-07,
"loss": 0.027,
"reward": 0.8645992483943701,
"reward_std": 0.8248982280492783,
"rewards/cosine_scaled_reward": 0.005216277204453945,
"rewards/format_reward": 0.854166679084301,
"step": 413
},
{
"completion_length": 1804.0000228881836,
"epoch": 0.47314285714285714,
"grad_norm": 3.9129860401153564,
"kl": 1.2784423828125,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0512,
"reward": 0.5207854583859444,
"reward_std": 0.8007166758179665,
"rewards/cosine_scaled_reward": -0.10419062164146453,
"rewards/format_reward": 0.7291666753590107,
"step": 414
},
{
"completion_length": 1269.1667022705078,
"epoch": 0.4742857142857143,
"grad_norm": 6.273374080657959,
"kl": 0.8948516845703125,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0358,
"reward": 0.6741074454039335,
"reward_std": 0.7499744538217783,
"rewards/cosine_scaled_reward": -0.05877961404621601,
"rewards/format_reward": 0.7916666865348816,
"step": 415
},
{
"completion_length": 1129.3125534057617,
"epoch": 0.4754285714285714,
"grad_norm": 1.7560322284698486,
"kl": 0.2455902099609375,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0098,
"reward": 1.1211753264069557,
"reward_std": 0.7526138452813029,
"rewards/cosine_scaled_reward": 0.07100430876016617,
"rewards/format_reward": 0.9791666716337204,
"step": 416
},
{
"completion_length": 1839.1875762939453,
"epoch": 0.4765714285714286,
"grad_norm": 4.114261627197266,
"kl": 1.20947265625,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0484,
"reward": 0.6709015071392059,
"reward_std": 0.874784380197525,
"rewards/cosine_scaled_reward": -0.049965920858085155,
"rewards/format_reward": 0.7708333544433117,
"step": 417
},
{
"completion_length": 896.5625305175781,
"epoch": 0.4777142857142857,
"grad_norm": 4.675693511962891,
"kl": 0.6626663208007812,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.0265,
"reward": 1.091725105419755,
"reward_std": 0.7902912050485611,
"rewards/cosine_scaled_reward": 0.13961253920570016,
"rewards/format_reward": 0.8125000186264515,
"step": 418
},
{
"completion_length": 1493.6250457763672,
"epoch": 0.47885714285714287,
"grad_norm": 2.2568399906158447,
"kl": 0.5430755615234375,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0217,
"reward": 0.8138027191162109,
"reward_std": 0.8418096788227558,
"rewards/cosine_scaled_reward": -0.04101531347259879,
"rewards/format_reward": 0.8958333507180214,
"step": 419
},
{
"completion_length": 1029.1875228881836,
"epoch": 0.48,
"grad_norm": 3.587470293045044,
"kl": 0.508331298828125,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0204,
"reward": 0.7593546295538545,
"reward_std": 0.41525958105921745,
"rewards/cosine_scaled_reward": -0.057822706177830696,
"rewards/format_reward": 0.8750000149011612,
"step": 420
},
{
"completion_length": 1540.0833625793457,
"epoch": 0.48114285714285715,
"grad_norm": 2.360013484954834,
"kl": 0.88397216796875,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0354,
"reward": 0.4944930747151375,
"reward_std": 0.6833166517317295,
"rewards/cosine_scaled_reward": -0.19025347288697958,
"rewards/format_reward": 0.8750000223517418,
"step": 421
},
{
"completion_length": 1501.7292556762695,
"epoch": 0.48228571428571426,
"grad_norm": 2.635981798171997,
"kl": 0.740234375,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0296,
"reward": 0.9262944171205163,
"reward_std": 0.754150040447712,
"rewards/cosine_scaled_reward": 0.025647209025919437,
"rewards/format_reward": 0.8750000149011612,
"step": 422
},
{
"completion_length": 1964.8750762939453,
"epoch": 0.48342857142857143,
"grad_norm": 4.063174247741699,
"kl": 1.255462646484375,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0503,
"reward": 0.3909978587180376,
"reward_std": 0.8100734055042267,
"rewards/cosine_scaled_reward": -0.13783442322164774,
"rewards/format_reward": 0.6666666865348816,
"step": 423
},
{
"completion_length": 1720.937515258789,
"epoch": 0.4845714285714286,
"grad_norm": 3.836009979248047,
"kl": 0.89697265625,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0358,
"reward": 0.29376337490975857,
"reward_std": 0.5368039496243,
"rewards/cosine_scaled_reward": -0.2489516567438841,
"rewards/format_reward": 0.7916666865348816,
"step": 424
},
{
"completion_length": 1298.8125495910645,
"epoch": 0.4857142857142857,
"grad_norm": 1.9228633642196655,
"kl": 0.457244873046875,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0183,
"reward": 1.3913816176354885,
"reward_std": 0.6065534967929125,
"rewards/cosine_scaled_reward": 0.2477741353213787,
"rewards/format_reward": 0.8958333432674408,
"step": 425
},
{
"completion_length": 973.8541870117188,
"epoch": 0.4868571428571429,
"grad_norm": 2.929983615875244,
"kl": 0.32680511474609375,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.0131,
"reward": 0.8921321593225002,
"reward_std": 0.4427370298653841,
"rewards/cosine_scaled_reward": -0.04351727291941643,
"rewards/format_reward": 0.9791666716337204,
"step": 426
},
{
"completion_length": 1596.7708740234375,
"epoch": 0.488,
"grad_norm": 3.161161422729492,
"kl": 0.484588623046875,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0194,
"reward": 0.973305675201118,
"reward_std": 0.8661460429430008,
"rewards/cosine_scaled_reward": 0.06998615153133869,
"rewards/format_reward": 0.8333333507180214,
"step": 427
},
{
"completion_length": 1573.3125610351562,
"epoch": 0.48914285714285716,
"grad_norm": 2.7298057079315186,
"kl": 0.6797714233398438,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0272,
"reward": 0.833369608346402,
"reward_std": 0.8424164094030857,
"rewards/cosine_scaled_reward": -0.020815202966332436,
"rewards/format_reward": 0.8750000074505806,
"step": 428
},
{
"completion_length": 972.3958435058594,
"epoch": 0.49028571428571427,
"grad_norm": 3.271073579788208,
"kl": 0.5417633056640625,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0216,
"reward": 0.6053271126002073,
"reward_std": 0.7208438105881214,
"rewards/cosine_scaled_reward": -0.15566978510469198,
"rewards/format_reward": 0.9166666865348816,
"step": 429
},
{
"completion_length": 1340.9167098999023,
"epoch": 0.49142857142857144,
"grad_norm": 4.3969879150390625,
"kl": 0.57080078125,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0228,
"reward": 0.9481127467006445,
"reward_std": 0.8026427961885929,
"rewards/cosine_scaled_reward": 0.0365563714876771,
"rewards/format_reward": 0.8750000223517418,
"step": 430
},
{
"completion_length": 1008.6250267028809,
"epoch": 0.49257142857142855,
"grad_norm": 2.9247236251831055,
"kl": 0.3984832763671875,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.016,
"reward": 0.7810343587771058,
"reward_std": 0.6039218865334988,
"rewards/cosine_scaled_reward": -0.06781616434454918,
"rewards/format_reward": 0.9166666865348816,
"step": 431
},
{
"completion_length": 1682.0417022705078,
"epoch": 0.4937142857142857,
"grad_norm": 3.6658926010131836,
"kl": 0.94122314453125,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0376,
"reward": 0.37208714336156845,
"reward_std": 0.7206946834921837,
"rewards/cosine_scaled_reward": -0.17853978439234197,
"rewards/format_reward": 0.7291666846722364,
"step": 432
},
{
"completion_length": 1573.8125610351562,
"epoch": 0.4948571428571429,
"grad_norm": 6.710811138153076,
"kl": 0.98748779296875,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0395,
"reward": 0.84003546833992,
"reward_std": 0.6314461715519428,
"rewards/cosine_scaled_reward": 0.0033510662615299225,
"rewards/format_reward": 0.833333358168602,
"step": 433
},
{
"completion_length": 1738.7292175292969,
"epoch": 0.496,
"grad_norm": 4.559843063354492,
"kl": 1.0400390625,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0415,
"reward": 0.4310444425791502,
"reward_std": 0.6072977893054485,
"rewards/cosine_scaled_reward": -0.15947778831468895,
"rewards/format_reward": 0.7500000223517418,
"step": 434
},
{
"completion_length": 935.7708511352539,
"epoch": 0.49714285714285716,
"grad_norm": 1.8925296068191528,
"kl": 0.5841522216796875,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0234,
"reward": 0.6217253655195236,
"reward_std": 0.5926484689116478,
"rewards/cosine_scaled_reward": -0.1578873231774196,
"rewards/format_reward": 0.9375000149011612,
"step": 435
},
{
"completion_length": 1107.6875495910645,
"epoch": 0.4982857142857143,
"grad_norm": 2.4335594177246094,
"kl": 0.6590576171875,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.0264,
"reward": 1.1079479418694973,
"reward_std": 0.7582566514611244,
"rewards/cosine_scaled_reward": 0.11647394431201974,
"rewards/format_reward": 0.8750000111758709,
"step": 436
},
{
"completion_length": 1200.9375381469727,
"epoch": 0.49942857142857144,
"grad_norm": 3.8739376068115234,
"kl": 0.56396484375,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0225,
"reward": 0.7988617792725563,
"reward_std": 0.6751260980963707,
"rewards/cosine_scaled_reward": -0.04848578106611967,
"rewards/format_reward": 0.895833358168602,
"step": 437
},
{
"completion_length": 1821.7708892822266,
"epoch": 0.5005714285714286,
"grad_norm": 3.8501853942871094,
"kl": 1.15753173828125,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.0462,
"reward": 0.45935247000306845,
"reward_std": 0.6324943359941244,
"rewards/cosine_scaled_reward": -0.16615711338818073,
"rewards/format_reward": 0.7916666846722364,
"step": 438
},
{
"completion_length": 1447.7916793823242,
"epoch": 0.5017142857142857,
"grad_norm": 13.093454360961914,
"kl": 1.1785888671875,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.0471,
"reward": 0.6338995918631554,
"reward_std": 0.6636701337993145,
"rewards/cosine_scaled_reward": -0.12055021477863193,
"rewards/format_reward": 0.8750000223517418,
"step": 439
},
{
"completion_length": 1387.4792022705078,
"epoch": 0.5028571428571429,
"grad_norm": 4.15952205657959,
"kl": 1.02880859375,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0411,
"reward": 0.5482727251946926,
"reward_std": 0.6047806814312935,
"rewards/cosine_scaled_reward": -0.15294698532670736,
"rewards/format_reward": 0.854166679084301,
"step": 440
},
{
"completion_length": 1395.708381652832,
"epoch": 0.504,
"grad_norm": 3.0667598247528076,
"kl": 0.73974609375,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0296,
"reward": 0.9216288132593036,
"reward_std": 0.7844664789736271,
"rewards/cosine_scaled_reward": 0.04414770007133484,
"rewards/format_reward": 0.8333333507180214,
"step": 441
},
{
"completion_length": 1071.7917022705078,
"epoch": 0.5051428571428571,
"grad_norm": 2.833597421646118,
"kl": 0.763397216796875,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0305,
"reward": 0.886376628652215,
"reward_std": 0.8642673939466476,
"rewards/cosine_scaled_reward": 0.02652162907179445,
"rewards/format_reward": 0.8333333469927311,
"step": 442
},
{
"completion_length": 1647.8542404174805,
"epoch": 0.5062857142857143,
"grad_norm": 2.424894332885742,
"kl": 1.24749755859375,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0499,
"reward": 0.5579078826121986,
"reward_std": 0.6206741183996201,
"rewards/cosine_scaled_reward": -0.12729605846107006,
"rewards/format_reward": 0.8125000111758709,
"step": 443
},
{
"completion_length": 1236.9166946411133,
"epoch": 0.5074285714285715,
"grad_norm": 4.010989665985107,
"kl": 0.55133056640625,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0221,
"reward": 0.581955180503428,
"reward_std": 0.7338661774992943,
"rewards/cosine_scaled_reward": -0.1569390781223774,
"rewards/format_reward": 0.8958333507180214,
"step": 444
},
{
"completion_length": 1259.9166793823242,
"epoch": 0.5085714285714286,
"grad_norm": 2.583313465118408,
"kl": 0.74591064453125,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0298,
"reward": 0.5723795741796494,
"reward_std": 0.788989819586277,
"rewards/cosine_scaled_reward": -0.13047689152881503,
"rewards/format_reward": 0.8333333432674408,
"step": 445
},
{
"completion_length": 1332.9792098999023,
"epoch": 0.5097142857142857,
"grad_norm": 16.281204223632812,
"kl": 1.068878173828125,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0428,
"reward": 0.7542771156877279,
"reward_std": 0.7587186098098755,
"rewards/cosine_scaled_reward": -0.008278121706098318,
"rewards/format_reward": 0.770833358168602,
"step": 446
},
{
"completion_length": 1513.3958930969238,
"epoch": 0.5108571428571429,
"grad_norm": 3.205157518386841,
"kl": 1.26171875,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0505,
"reward": 0.7465504482388496,
"reward_std": 0.57644097879529,
"rewards/cosine_scaled_reward": -0.04339144751429558,
"rewards/format_reward": 0.8333333432674408,
"step": 447
},
{
"completion_length": 1335.083366394043,
"epoch": 0.512,
"grad_norm": 3.9793076515197754,
"kl": 1.251495361328125,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.0501,
"reward": 0.6771020290179877,
"reward_std": 0.7358664702624083,
"rewards/cosine_scaled_reward": -0.026032326743006706,
"rewards/format_reward": 0.729166679084301,
"step": 448
},
{
"completion_length": 1115.187515258789,
"epoch": 0.5131428571428571,
"grad_norm": 3.228578805923462,
"kl": 0.47408294677734375,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0189,
"reward": 0.422957434784621,
"reward_std": 0.5728885792195797,
"rewards/cosine_scaled_reward": -0.22602130100131035,
"rewards/format_reward": 0.8750000223517418,
"step": 449
},
{
"completion_length": 1225.1458740234375,
"epoch": 0.5142857142857142,
"grad_norm": 2.933077335357666,
"kl": 0.64776611328125,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0259,
"reward": 0.790303866029717,
"reward_std": 0.622036661952734,
"rewards/cosine_scaled_reward": -0.04234808124601841,
"rewards/format_reward": 0.8750000149011612,
"step": 450
},
{
"completion_length": 1283.5208892822266,
"epoch": 0.5154285714285715,
"grad_norm": 4.34015417098999,
"kl": 0.707855224609375,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0283,
"reward": 0.7061118334531784,
"reward_std": 0.6641687378287315,
"rewards/cosine_scaled_reward": -0.06361076328903437,
"rewards/format_reward": 0.833333358168602,
"step": 451
},
{
"completion_length": 1390.333366394043,
"epoch": 0.5165714285714286,
"grad_norm": 3.791003942489624,
"kl": 0.733154296875,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0294,
"reward": 0.8051364235579967,
"reward_std": 0.9408066868782043,
"rewards/cosine_scaled_reward": -0.014098492218181491,
"rewards/format_reward": 0.8333333507180214,
"step": 452
},
{
"completion_length": 1164.7916984558105,
"epoch": 0.5177142857142857,
"grad_norm": 2.2949931621551514,
"kl": 0.513427734375,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.0205,
"reward": 0.8233426138758659,
"reward_std": 0.5944336298853159,
"rewards/cosine_scaled_reward": -0.06749536748975515,
"rewards/format_reward": 0.9583333432674408,
"step": 453
},
{
"completion_length": 1308.4167022705078,
"epoch": 0.5188571428571429,
"grad_norm": 3.0252280235290527,
"kl": 0.7433929443359375,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0298,
"reward": 0.5274054184556007,
"reward_std": 0.6061475053429604,
"rewards/cosine_scaled_reward": -0.14254729636013508,
"rewards/format_reward": 0.8125000149011612,
"step": 454
},
{
"completion_length": 1739.1458702087402,
"epoch": 0.52,
"grad_norm": 4.987695217132568,
"kl": 1.2000732421875,
"learning_rate": 1.220245676671809e-07,
"loss": 0.048,
"reward": 0.2408284079283476,
"reward_std": 0.6372124627232552,
"rewards/cosine_scaled_reward": -0.27541913744062185,
"rewards/format_reward": 0.7916666865348816,
"step": 455
},
{
"completion_length": 1663.187557220459,
"epoch": 0.5211428571428571,
"grad_norm": 3.197329044342041,
"kl": 0.941253662109375,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0376,
"reward": 0.47842175513505936,
"reward_std": 0.7478823028504848,
"rewards/cosine_scaled_reward": -0.15662246476858854,
"rewards/format_reward": 0.791666679084301,
"step": 456
},
{
"completion_length": 1417.3542289733887,
"epoch": 0.5222857142857142,
"grad_norm": 2.964496612548828,
"kl": 0.938079833984375,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0375,
"reward": 0.6622494223265676,
"reward_std": 0.5692349448800087,
"rewards/cosine_scaled_reward": -0.03345862403512001,
"rewards/format_reward": 0.7291666828095913,
"step": 457
},
{
"completion_length": 1527.2916946411133,
"epoch": 0.5234285714285715,
"grad_norm": 4.469851493835449,
"kl": 1.12396240234375,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.0449,
"reward": 0.5578488986939192,
"reward_std": 0.693300411105156,
"rewards/cosine_scaled_reward": -0.10649222880601883,
"rewards/format_reward": 0.7708333469927311,
"step": 458
},
{
"completion_length": 1051.7708740234375,
"epoch": 0.5245714285714286,
"grad_norm": 3.3350539207458496,
"kl": 0.466766357421875,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0187,
"reward": 0.9517181403934956,
"reward_std": 0.8380660861730576,
"rewards/cosine_scaled_reward": 0.0071090515702962875,
"rewards/format_reward": 0.9375000074505806,
"step": 459
},
{
"completion_length": 1639.1250610351562,
"epoch": 0.5257142857142857,
"grad_norm": 7.242498874664307,
"kl": 1.00091552734375,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.04,
"reward": 0.3730992656201124,
"reward_std": 0.726275160908699,
"rewards/cosine_scaled_reward": -0.16761704441159964,
"rewards/format_reward": 0.7083333432674408,
"step": 460
},
{
"completion_length": 1452.6250305175781,
"epoch": 0.5268571428571428,
"grad_norm": 3.2622721195220947,
"kl": 1.1546630859375,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0461,
"reward": 0.8234957940876484,
"reward_std": 0.6311179846525192,
"rewards/cosine_scaled_reward": -0.0049187901604454964,
"rewards/format_reward": 0.8333333488553762,
"step": 461
},
{
"completion_length": 1285.4166946411133,
"epoch": 0.528,
"grad_norm": 3.8556363582611084,
"kl": 0.8689117431640625,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0348,
"reward": 0.4587779585272074,
"reward_std": 0.6436084322631359,
"rewards/cosine_scaled_reward": -0.17686102783773094,
"rewards/format_reward": 0.8125000223517418,
"step": 462
},
{
"completion_length": 1802.8125381469727,
"epoch": 0.5291428571428571,
"grad_norm": 5.279976844787598,
"kl": 0.799560546875,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.032,
"reward": 0.6291724583134055,
"reward_std": 0.8651260025799274,
"rewards/cosine_scaled_reward": -0.04999712225981057,
"rewards/format_reward": 0.7291666939854622,
"step": 463
},
{
"completion_length": 1047.145851135254,
"epoch": 0.5302857142857142,
"grad_norm": 2.4638097286224365,
"kl": 0.605743408203125,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.0242,
"reward": 1.302383467555046,
"reward_std": 0.4087425358593464,
"rewards/cosine_scaled_reward": 0.1616083886474371,
"rewards/format_reward": 0.9791666716337204,
"step": 464
},
{
"completion_length": 1426.8542022705078,
"epoch": 0.5314285714285715,
"grad_norm": 2.563239336013794,
"kl": 0.8856201171875,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0354,
"reward": 0.514048907905817,
"reward_std": 0.7893399521708488,
"rewards/cosine_scaled_reward": -0.14922555815428495,
"rewards/format_reward": 0.8125000223517418,
"step": 465
},
{
"completion_length": 1289.5625381469727,
"epoch": 0.5325714285714286,
"grad_norm": 2.4319827556610107,
"kl": 0.561614990234375,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0225,
"reward": 1.006679143756628,
"reward_std": 0.8896586894989014,
"rewards/cosine_scaled_reward": 0.045006227446720004,
"rewards/format_reward": 0.9166666865348816,
"step": 466
},
{
"completion_length": 1679.6666870117188,
"epoch": 0.5337142857142857,
"grad_norm": 3.1560564041137695,
"kl": 1.129669189453125,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0453,
"reward": 0.34113773331046104,
"reward_std": 0.6508898884057999,
"rewards/cosine_scaled_reward": -0.19401447381824255,
"rewards/format_reward": 0.7291666846722364,
"step": 467
},
{
"completion_length": 1565.2083778381348,
"epoch": 0.5348571428571428,
"grad_norm": 181.5196990966797,
"kl": 7.007781982421875,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.2808,
"reward": 0.6900721359997988,
"reward_std": 0.8350103311240673,
"rewards/cosine_scaled_reward": -0.07163060246966779,
"rewards/format_reward": 0.8333333432674408,
"step": 468
},
{
"completion_length": 1385.6458892822266,
"epoch": 0.536,
"grad_norm": 3.4314520359039307,
"kl": 0.941375732421875,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.0377,
"reward": 0.5675666080787778,
"reward_std": 0.6128499489277601,
"rewards/cosine_scaled_reward": -0.12246670690365136,
"rewards/format_reward": 0.8125000149011612,
"step": 469
},
{
"completion_length": 1621.4375305175781,
"epoch": 0.5371428571428571,
"grad_norm": 4.546476364135742,
"kl": 1.376953125,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0551,
"reward": 0.30739592137979344,
"reward_std": 0.725900623947382,
"rewards/cosine_scaled_reward": -0.15880204178392887,
"rewards/format_reward": 0.625000013038516,
"step": 470
},
{
"completion_length": 1291.0208587646484,
"epoch": 0.5382857142857143,
"grad_norm": 2.4413740634918213,
"kl": 0.525848388671875,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.021,
"reward": 0.7892501968890429,
"reward_std": 0.7734898887574673,
"rewards/cosine_scaled_reward": -0.03245823457837105,
"rewards/format_reward": 0.8541666939854622,
"step": 471
},
{
"completion_length": 1400.1667022705078,
"epoch": 0.5394285714285715,
"grad_norm": 3.6039347648620605,
"kl": 0.58624267578125,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0234,
"reward": 0.3975699208676815,
"reward_std": 0.6745503656566143,
"rewards/cosine_scaled_reward": -0.22829839028418064,
"rewards/format_reward": 0.8541666865348816,
"step": 472
},
{
"completion_length": 1205.7500381469727,
"epoch": 0.5405714285714286,
"grad_norm": 2.652244806289673,
"kl": 0.7998046875,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.032,
"reward": 0.5399867701344192,
"reward_std": 0.6538946256041527,
"rewards/cosine_scaled_reward": -0.17792330123484135,
"rewards/format_reward": 0.8958333507180214,
"step": 473
},
{
"completion_length": 1451.083351135254,
"epoch": 0.5417142857142857,
"grad_norm": 4.507306098937988,
"kl": 0.9193878173828125,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0368,
"reward": 1.300966864451766,
"reward_std": 0.745520330965519,
"rewards/cosine_scaled_reward": 0.2546501159667969,
"rewards/format_reward": 0.7916666828095913,
"step": 474
},
{
"completion_length": 1491.0000686645508,
"epoch": 0.5428571428571428,
"grad_norm": 2.09909987449646,
"kl": 0.9030914306640625,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0361,
"reward": 0.6428233720362186,
"reward_std": 0.8779257349669933,
"rewards/cosine_scaled_reward": -0.06400499166920781,
"rewards/format_reward": 0.7708333469927311,
"step": 475
},
{
"completion_length": 1502.0208740234375,
"epoch": 0.544,
"grad_norm": 3.001065969467163,
"kl": 0.85467529296875,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0342,
"reward": 0.9168616086244583,
"reward_std": 0.9465513862669468,
"rewards/cosine_scaled_reward": 0.020930795930325985,
"rewards/format_reward": 0.8750000074505806,
"step": 476
},
{
"completion_length": 1313.4791946411133,
"epoch": 0.5451428571428572,
"grad_norm": 4.386512756347656,
"kl": 0.9460906982421875,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0379,
"reward": 0.8154587242752314,
"reward_std": 0.8511052504181862,
"rewards/cosine_scaled_reward": 0.022312658838927746,
"rewards/format_reward": 0.7708333544433117,
"step": 477
},
{
"completion_length": 1496.3125305175781,
"epoch": 0.5462857142857143,
"grad_norm": 3.565664768218994,
"kl": 0.8566741943359375,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0342,
"reward": 0.8052469007670879,
"reward_std": 0.7007171474397182,
"rewards/cosine_scaled_reward": -0.003626542165875435,
"rewards/format_reward": 0.8125000223517418,
"step": 478
},
{
"completion_length": 1708.4167098999023,
"epoch": 0.5474285714285714,
"grad_norm": 2.6371641159057617,
"kl": 1.12823486328125,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0451,
"reward": 0.5828341413289309,
"reward_std": 0.6526912562549114,
"rewards/cosine_scaled_reward": -0.1148329358547926,
"rewards/format_reward": 0.8125000074505806,
"step": 479
},
{
"completion_length": 1512.4166946411133,
"epoch": 0.5485714285714286,
"grad_norm": 4.293049335479736,
"kl": 0.984771728515625,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0394,
"reward": 0.8314212337136269,
"reward_std": 0.5310982428491116,
"rewards/cosine_scaled_reward": -0.03220607154071331,
"rewards/format_reward": 0.895833358168602,
"step": 480
},
{
"completion_length": 1671.5417022705078,
"epoch": 0.5497142857142857,
"grad_norm": 5.241688251495361,
"kl": 0.995025634765625,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0397,
"reward": 0.5629774704575539,
"reward_std": 0.6768053583800793,
"rewards/cosine_scaled_reward": -0.1351779391989112,
"rewards/format_reward": 0.8333333432674408,
"step": 481
},
{
"completion_length": 1726.7500267028809,
"epoch": 0.5508571428571428,
"grad_norm": 4.3753886222839355,
"kl": 1.4754638671875,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.059,
"reward": 0.5926886834204197,
"reward_std": 0.6844369061291218,
"rewards/cosine_scaled_reward": -0.01615567714907229,
"rewards/format_reward": 0.6250000279396772,
"step": 482
},
{
"completion_length": 1887.2500610351562,
"epoch": 0.552,
"grad_norm": 3.9196157455444336,
"kl": 1.394287109375,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0558,
"reward": 0.5968103185296059,
"reward_std": 0.8982166573405266,
"rewards/cosine_scaled_reward": -0.08701153006404638,
"rewards/format_reward": 0.770833358168602,
"step": 483
},
{
"completion_length": 1416.1458930969238,
"epoch": 0.5531428571428572,
"grad_norm": 4.831650257110596,
"kl": 0.6915283203125,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.0276,
"reward": 0.6882726345211267,
"reward_std": 0.781242698431015,
"rewards/cosine_scaled_reward": -0.06211370480014011,
"rewards/format_reward": 0.8125000223517418,
"step": 484
},
{
"completion_length": 1175.083351135254,
"epoch": 0.5542857142857143,
"grad_norm": 2.7525644302368164,
"kl": 0.708404541015625,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0283,
"reward": 0.5069214683026075,
"reward_std": 0.4130475576967001,
"rewards/cosine_scaled_reward": -0.16320594353601336,
"rewards/format_reward": 0.8333333414047956,
"step": 485
},
{
"completion_length": 945.3750343322754,
"epoch": 0.5554285714285714,
"grad_norm": 1.641010046005249,
"kl": 0.423370361328125,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0169,
"reward": 0.7414621282368898,
"reward_std": 0.5828515980392694,
"rewards/cosine_scaled_reward": -0.0771856140345335,
"rewards/format_reward": 0.8958333432674408,
"step": 486
},
{
"completion_length": 1054.7708625793457,
"epoch": 0.5565714285714286,
"grad_norm": 2.9195919036865234,
"kl": 0.4309539794921875,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.0173,
"reward": 1.2239743052050471,
"reward_std": 0.5242457445710897,
"rewards/cosine_scaled_reward": 0.14323717169463634,
"rewards/format_reward": 0.9375000149011612,
"step": 487
},
{
"completion_length": 1278.1667022705078,
"epoch": 0.5577142857142857,
"grad_norm": 2.495598316192627,
"kl": 0.9759674072265625,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.039,
"reward": 0.6595983803272247,
"reward_std": 0.5137137211859226,
"rewards/cosine_scaled_reward": -0.08686749078333378,
"rewards/format_reward": 0.8333333544433117,
"step": 488
},
{
"completion_length": 1564.6250381469727,
"epoch": 0.5588571428571428,
"grad_norm": 3.082076072692871,
"kl": 1.235137939453125,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0494,
"reward": 0.09155605779960752,
"reward_std": 0.5432869009673595,
"rewards/cosine_scaled_reward": -0.28755532018840313,
"rewards/format_reward": 0.6666666828095913,
"step": 489
},
{
"completion_length": 1278.1250381469727,
"epoch": 0.56,
"grad_norm": 1.8155015707015991,
"kl": 0.6814727783203125,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0273,
"reward": 0.6778741236776114,
"reward_std": 0.7777648419141769,
"rewards/cosine_scaled_reward": -0.08814628981053829,
"rewards/format_reward": 0.854166679084301,
"step": 490
},
{
"completion_length": 1448.9375381469727,
"epoch": 0.5611428571428572,
"grad_norm": 2.183570623397827,
"kl": 0.358306884765625,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0143,
"reward": 1.1492541544139385,
"reward_std": 1.02406694740057,
"rewards/cosine_scaled_reward": 0.12671040603891015,
"rewards/format_reward": 0.8958333432674408,
"step": 491
},
{
"completion_length": 1325.7708740234375,
"epoch": 0.5622857142857143,
"grad_norm": 3.939805030822754,
"kl": 0.6011505126953125,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.024,
"reward": 0.8053325146902353,
"reward_std": 0.5690103769302368,
"rewards/cosine_scaled_reward": -0.045250434428453445,
"rewards/format_reward": 0.8958333507180214,
"step": 492
},
{
"completion_length": 1136.2291946411133,
"epoch": 0.5634285714285714,
"grad_norm": 3.2057812213897705,
"kl": 0.6187744140625,
"learning_rate": 1.005372381963547e-07,
"loss": 0.0248,
"reward": 0.8366872314363718,
"reward_std": 0.8825880065560341,
"rewards/cosine_scaled_reward": -0.019156392896547914,
"rewards/format_reward": 0.8750000074505806,
"step": 493
},
{
"completion_length": 1201.3125305175781,
"epoch": 0.5645714285714286,
"grad_norm": 3.475240707397461,
"kl": 0.5660400390625,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0226,
"reward": 0.8446337506175041,
"reward_std": 0.9212733060121536,
"rewards/cosine_scaled_reward": -0.025599811924621463,
"rewards/format_reward": 0.8958333432674408,
"step": 494
},
{
"completion_length": 1859.8333892822266,
"epoch": 0.5657142857142857,
"grad_norm": 5.380873680114746,
"kl": 1.025390625,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0411,
"reward": 0.7559722196310759,
"reward_std": 1.0365862026810646,
"rewards/cosine_scaled_reward": -0.007430561818182468,
"rewards/format_reward": 0.7708333656191826,
"step": 495
},
{
"completion_length": 1412.4375610351562,
"epoch": 0.5668571428571428,
"grad_norm": 3.017749547958374,
"kl": 1.0056610107421875,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0403,
"reward": 0.8477246034890413,
"reward_std": 0.7844705618917942,
"rewards/cosine_scaled_reward": 0.038445642217993736,
"rewards/format_reward": 0.7708333488553762,
"step": 496
},
{
"completion_length": 1259.4583702087402,
"epoch": 0.568,
"grad_norm": 3.237330436706543,
"kl": 0.850830078125,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.034,
"reward": 1.0958917308598757,
"reward_std": 0.5949588976800442,
"rewards/cosine_scaled_reward": 0.12086252495646477,
"rewards/format_reward": 0.854166679084301,
"step": 497
},
{
"completion_length": 1587.083351135254,
"epoch": 0.5691428571428572,
"grad_norm": 4.424108028411865,
"kl": 1.174072265625,
"learning_rate": 1.000438641958131e-07,
"loss": 0.047,
"reward": 0.5771038420498371,
"reward_std": 0.7162478044629097,
"rewards/cosine_scaled_reward": -0.08644808363169432,
"rewards/format_reward": 0.7500000149011612,
"step": 498
},
{
"completion_length": 1515.1667022705078,
"epoch": 0.5702857142857143,
"grad_norm": 1.4978034496307373,
"kl": 0.6841812133789062,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.0274,
"reward": 0.8714520921930671,
"reward_std": 0.7796698864549398,
"rewards/cosine_scaled_reward": -0.012190633453428745,
"rewards/format_reward": 0.8958333507180214,
"step": 499
},
{
"completion_length": 1229.7083740234375,
"epoch": 0.5714285714285714,
"grad_norm": 2.3225278854370117,
"kl": 0.565399169921875,
"learning_rate": 1e-07,
"loss": 0.0227,
"reward": 0.6700912415981293,
"reward_std": 0.7157665528357029,
"rewards/cosine_scaled_reward": -0.1337043906096369,
"rewards/format_reward": 0.9375000149011612,
"step": 500
},
{
"epoch": 0.5714285714285714,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.009290046402068698,
"train_runtime": 55313.9986,
"train_samples_per_second": 0.434,
"train_steps_per_second": 0.009
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}