OpenRS-DR_GRPO / trainer_state.json
xiwenc1's picture
Model save
1eef637 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5714285714285714,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 3001.9584350585938,
"epoch": 0.001142857142857143,
"grad_norm": 0.11473917961120605,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.034,
"reward": -0.010712452232837677,
"reward_std": 0.48354096710681915,
"rewards/cosine_scaled_reward": -0.1928562317043543,
"rewards/format_reward": 0.37500000558793545,
"step": 1
},
{
"completion_length": 2822.541717529297,
"epoch": 0.002285714285714286,
"grad_norm": 0.17855221033096313,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": 0.1095,
"reward": 0.4385625521535985,
"reward_std": 0.8208381980657578,
"rewards/cosine_scaled_reward": -0.009885392151772976,
"rewards/format_reward": 0.4583333432674408,
"step": 2
},
{
"completion_length": 2903.604248046875,
"epoch": 0.0034285714285714284,
"grad_norm": 0.05400172621011734,
"kl": 3.629922866821289e-05,
"learning_rate": 6e-08,
"loss": 0.0166,
"reward": -0.3212598990648985,
"reward_std": 0.36036985367536545,
"rewards/cosine_scaled_reward": -0.3168799467384815,
"rewards/format_reward": 0.31250000186264515,
"step": 3
},
{
"completion_length": 2924.8958740234375,
"epoch": 0.004571428571428572,
"grad_norm": 0.1298418492078781,
"kl": 3.390759229660034e-05,
"learning_rate": 8e-08,
"loss": 0.0193,
"reward": 0.11002232693135738,
"reward_std": 0.5668230727314949,
"rewards/cosine_scaled_reward": -0.12207217514514923,
"rewards/format_reward": 0.3541666865348816,
"step": 4
},
{
"completion_length": 2699.4793090820312,
"epoch": 0.005714285714285714,
"grad_norm": 0.11395805329084396,
"kl": 2.8192996978759766e-05,
"learning_rate": 1e-07,
"loss": 0.0509,
"reward": 0.5249291565269232,
"reward_std": 0.7597299069166183,
"rewards/cosine_scaled_reward": 0.033297897316515446,
"rewards/format_reward": 0.4583333544433117,
"step": 5
},
{
"completion_length": 2660.5001220703125,
"epoch": 0.006857142857142857,
"grad_norm": 0.15824902057647705,
"kl": 4.559755325317383e-05,
"learning_rate": 1.2e-07,
"loss": 0.04,
"reward": 0.42945386096835136,
"reward_std": 0.6760371923446655,
"rewards/cosine_scaled_reward": -0.05610641464591026,
"rewards/format_reward": 0.541666679084301,
"step": 6
},
{
"completion_length": 2458.479217529297,
"epoch": 0.008,
"grad_norm": 0.10866966843605042,
"kl": 2.4110078811645508e-05,
"learning_rate": 1.4e-07,
"loss": 0.0529,
"reward": 0.7580276802182198,
"reward_std": 0.6385035738348961,
"rewards/cosine_scaled_reward": 0.09776384383440018,
"rewards/format_reward": 0.5625000149011612,
"step": 7
},
{
"completion_length": 2977.8126220703125,
"epoch": 0.009142857142857144,
"grad_norm": 0.22230574488639832,
"kl": 3.574788570404053e-05,
"learning_rate": 1.6e-07,
"loss": 0.0993,
"reward": 0.06304685212671757,
"reward_std": 0.8850619196891785,
"rewards/cosine_scaled_reward": -0.16639323788695037,
"rewards/format_reward": 0.3958333395421505,
"step": 8
},
{
"completion_length": 3034.5416870117188,
"epoch": 0.010285714285714285,
"grad_norm": 0.17408320307731628,
"kl": 3.820657730102539e-05,
"learning_rate": 1.8e-07,
"loss": 0.0851,
"reward": 0.06854809075593948,
"reward_std": 0.8176102936267853,
"rewards/cosine_scaled_reward": -0.10114264115691185,
"rewards/format_reward": 0.2708333432674408,
"step": 9
},
{
"completion_length": 2121.2500610351562,
"epoch": 0.011428571428571429,
"grad_norm": 0.07089601457118988,
"kl": 2.7008354663848877e-05,
"learning_rate": 2e-07,
"loss": 0.026,
"reward": 0.6565612219274044,
"reward_std": 0.6731352433562279,
"rewards/cosine_scaled_reward": 0.026197269558906555,
"rewards/format_reward": 0.6041666716337204,
"step": 10
},
{
"completion_length": 2388.166748046875,
"epoch": 0.012571428571428572,
"grad_norm": 0.17368823289871216,
"kl": 2.911686897277832e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.09,
"reward": 0.7517527863383293,
"reward_std": 1.0614946484565735,
"rewards/cosine_scaled_reward": 0.07379304803907871,
"rewards/format_reward": 0.6041666865348816,
"step": 11
},
{
"completion_length": 2672.5834350585938,
"epoch": 0.013714285714285714,
"grad_norm": 0.09804865717887878,
"kl": 3.5643577575683594e-05,
"learning_rate": 2.4e-07,
"loss": 0.0423,
"reward": 0.46549332328140736,
"reward_std": 0.59340400993824,
"rewards/cosine_scaled_reward": -0.006836682558059692,
"rewards/format_reward": 0.479166679084301,
"step": 12
},
{
"completion_length": 2250.187530517578,
"epoch": 0.014857142857142857,
"grad_norm": 0.10080444812774658,
"kl": 3.0308961868286133e-05,
"learning_rate": 2.6e-07,
"loss": 0.0188,
"reward": 0.6889139215054456,
"reward_std": 0.8085261583328247,
"rewards/cosine_scaled_reward": 0.06320697697810829,
"rewards/format_reward": 0.5625000111758709,
"step": 13
},
{
"completion_length": 2936.9375610351562,
"epoch": 0.016,
"grad_norm": 0.1032668873667717,
"kl": 4.1931867599487305e-05,
"learning_rate": 2.8e-07,
"loss": 0.0156,
"reward": 0.10788557305932045,
"reward_std": 0.6920560002326965,
"rewards/cosine_scaled_reward": -0.11272389208897948,
"rewards/format_reward": 0.3333333432674408,
"step": 14
},
{
"completion_length": 3221.666748046875,
"epoch": 0.017142857142857144,
"grad_norm": 0.10653272271156311,
"kl": 3.7223100662231445e-05,
"learning_rate": 3e-07,
"loss": 0.0008,
"reward": -0.2332199066877365,
"reward_std": 0.63228340446949,
"rewards/cosine_scaled_reward": -0.21035997135186335,
"rewards/format_reward": 0.1875000111758709,
"step": 15
},
{
"completion_length": 2321.3750610351562,
"epoch": 0.018285714285714287,
"grad_norm": 0.14373674988746643,
"kl": 2.193450927734375e-05,
"learning_rate": 3.2e-07,
"loss": 0.0532,
"reward": 0.6621312350034714,
"reward_std": 0.9647989273071289,
"rewards/cosine_scaled_reward": 0.06023227237164974,
"rewards/format_reward": 0.5416666865348816,
"step": 16
},
{
"completion_length": 3174.8333740234375,
"epoch": 0.019428571428571427,
"grad_norm": 0.07878188043832779,
"kl": 3.62396240234375e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0239,
"reward": -0.20133600383996964,
"reward_std": 0.5479727387428284,
"rewards/cosine_scaled_reward": -0.2152513451874256,
"rewards/format_reward": 0.2291666679084301,
"step": 17
},
{
"completion_length": 3214.229248046875,
"epoch": 0.02057142857142857,
"grad_norm": 0.1723223179578781,
"kl": 5.7220458984375e-05,
"learning_rate": 3.6e-07,
"loss": 0.0648,
"reward": -0.21091226488351822,
"reward_std": 0.5157570615410805,
"rewards/cosine_scaled_reward": -0.188789464533329,
"rewards/format_reward": 0.1666666679084301,
"step": 18
},
{
"completion_length": 3238.9584350585938,
"epoch": 0.021714285714285714,
"grad_norm": 0.161203071475029,
"kl": 2.1696090698242188e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0562,
"reward": 0.049652623711153865,
"reward_std": 0.9271627813577652,
"rewards/cosine_scaled_reward": -0.1210070364177227,
"rewards/format_reward": 0.2916666679084301,
"step": 19
},
{
"completion_length": 2502.9584045410156,
"epoch": 0.022857142857142857,
"grad_norm": 0.19064471125602722,
"kl": 3.2901763916015625e-05,
"learning_rate": 4e-07,
"loss": 0.097,
"reward": 0.33966562896966934,
"reward_std": 0.6814321130514145,
"rewards/cosine_scaled_reward": -0.10100051760673523,
"rewards/format_reward": 0.5416666865348816,
"step": 20
},
{
"completion_length": 2544.5833740234375,
"epoch": 0.024,
"grad_norm": 0.08170344680547714,
"kl": 2.512335777282715e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0099,
"reward": 0.26008715480566025,
"reward_std": 0.5456661060452461,
"rewards/cosine_scaled_reward": -0.06787310540676117,
"rewards/format_reward": 0.3958333432674408,
"step": 21
},
{
"completion_length": 3508.8126220703125,
"epoch": 0.025142857142857144,
"grad_norm": 0.14452184736728668,
"kl": 2.7313828468322754e-05,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0283,
"reward": -0.03812084347009659,
"reward_std": 0.7810813337564468,
"rewards/cosine_scaled_reward": -0.10239375196397305,
"rewards/format_reward": 0.16666666977107525,
"step": 22
},
{
"completion_length": 3135.5000610351562,
"epoch": 0.026285714285714287,
"grad_norm": 0.18309734761714935,
"kl": 4.690885543823242e-05,
"learning_rate": 4.6e-07,
"loss": 0.0864,
"reward": -0.03534786030650139,
"reward_std": 0.8103697001934052,
"rewards/cosine_scaled_reward": -0.17392393667250872,
"rewards/format_reward": 0.3125000111758709,
"step": 23
},
{
"completion_length": 2123.3750915527344,
"epoch": 0.027428571428571427,
"grad_norm": 0.07949961721897125,
"kl": 1.4767050743103027e-05,
"learning_rate": 4.8e-07,
"loss": 0.0269,
"reward": 0.6402075001969934,
"reward_std": 0.7203418090939522,
"rewards/cosine_scaled_reward": 0.018020419403910637,
"rewards/format_reward": 0.6041666679084301,
"step": 24
},
{
"completion_length": 2792.7709045410156,
"epoch": 0.02857142857142857,
"grad_norm": 0.09897608309984207,
"kl": 1.7628073692321777e-05,
"learning_rate": 5e-07,
"loss": 0.0263,
"reward": 0.3667532876133919,
"reward_std": 0.5270465165376663,
"rewards/cosine_scaled_reward": -0.03537335619330406,
"rewards/format_reward": 0.43750000558793545,
"step": 25
},
{
"completion_length": 3103.5416870117188,
"epoch": 0.029714285714285714,
"grad_norm": 0.15197034180164337,
"kl": 1.8015503883361816e-05,
"learning_rate": 5.2e-07,
"loss": 0.0517,
"reward": 0.23722141981124878,
"reward_std": 0.826317235827446,
"rewards/cosine_scaled_reward": -0.027222641743719578,
"rewards/format_reward": 0.29166667722165585,
"step": 26
},
{
"completion_length": 3099.729248046875,
"epoch": 0.030857142857142857,
"grad_norm": 0.11937292665243149,
"kl": 2.5153160095214844e-05,
"learning_rate": 5.4e-07,
"loss": 0.0279,
"reward": -0.05506348796188831,
"reward_std": 0.483004167675972,
"rewards/cosine_scaled_reward": -0.14211508259177208,
"rewards/format_reward": 0.2291666716337204,
"step": 27
},
{
"completion_length": 3221.7291870117188,
"epoch": 0.032,
"grad_norm": 0.1231866255402565,
"kl": 2.6211142539978027e-05,
"learning_rate": 5.6e-07,
"loss": -0.0031,
"reward": 0.19264543801546097,
"reward_std": 0.7934563755989075,
"rewards/cosine_scaled_reward": -0.07034394145011902,
"rewards/format_reward": 0.33333334885537624,
"step": 28
},
{
"completion_length": 3130.6459350585938,
"epoch": 0.03314285714285714,
"grad_norm": 0.14249049127101898,
"kl": 2.726912498474121e-06,
"learning_rate": 5.8e-07,
"loss": 0.0394,
"reward": 0.20082764513790607,
"reward_std": 1.0230832546949387,
"rewards/cosine_scaled_reward": -0.06625284859910607,
"rewards/format_reward": 0.33333334513008595,
"step": 29
},
{
"completion_length": 3211.125,
"epoch": 0.03428571428571429,
"grad_norm": 0.11244227737188339,
"kl": 2.047419548034668e-05,
"learning_rate": 6e-07,
"loss": 0.0135,
"reward": 0.11087529244832695,
"reward_std": 0.6219374239444733,
"rewards/cosine_scaled_reward": -0.09039569273591042,
"rewards/format_reward": 0.29166667722165585,
"step": 30
},
{
"completion_length": 2505.687530517578,
"epoch": 0.03542857142857143,
"grad_norm": 0.10730752348899841,
"kl": 2.9802322387695312e-05,
"learning_rate": 6.2e-07,
"loss": 0.0711,
"reward": 0.10028511472046375,
"reward_std": 0.7022345140576363,
"rewards/cosine_scaled_reward": -0.1686074547469616,
"rewards/format_reward": 0.4375000074505806,
"step": 31
},
{
"completion_length": 3546.5,
"epoch": 0.036571428571428574,
"grad_norm": 0.08949411660432816,
"kl": 2.053380012512207e-05,
"learning_rate": 6.4e-07,
"loss": 0.0081,
"reward": -0.4492787718772888,
"reward_std": 0.4731578528881073,
"rewards/cosine_scaled_reward": -0.2454727292060852,
"rewards/format_reward": 0.0416666679084301,
"step": 32
},
{
"completion_length": 3140.4584350585938,
"epoch": 0.037714285714285714,
"grad_norm": 0.15533116459846497,
"kl": 1.6998499631881714e-05,
"learning_rate": 6.6e-07,
"loss": 0.0936,
"reward": 0.14784683287143707,
"reward_std": 0.8761000260710716,
"rewards/cosine_scaled_reward": -0.10315992683172226,
"rewards/format_reward": 0.3541666716337204,
"step": 33
},
{
"completion_length": 3067.5208740234375,
"epoch": 0.038857142857142854,
"grad_norm": 0.05691331624984741,
"kl": 7.178634405136108e-06,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0164,
"reward": -0.4488837197422981,
"reward_std": 0.4332050681114197,
"rewards/cosine_scaled_reward": -0.31819187104701996,
"rewards/format_reward": 0.1875,
"step": 34
},
{
"completion_length": 2977.979248046875,
"epoch": 0.04,
"grad_norm": 0.13275845348834991,
"kl": 2.034008502960205e-05,
"learning_rate": 7e-07,
"loss": 0.0431,
"reward": 0.19347557425498962,
"reward_std": 0.7837567403912544,
"rewards/cosine_scaled_reward": -0.11159555055201054,
"rewards/format_reward": 0.4166666828095913,
"step": 35
},
{
"completion_length": 2511.7500610351562,
"epoch": 0.04114285714285714,
"grad_norm": 0.08902338147163391,
"kl": 7.106363773345947e-05,
"learning_rate": 7.2e-07,
"loss": 0.0143,
"reward": 0.6313629895448685,
"reward_std": 0.4862937852740288,
"rewards/cosine_scaled_reward": 0.06568148266524076,
"rewards/format_reward": 0.5000000055879354,
"step": 36
},
{
"completion_length": 2623.6458435058594,
"epoch": 0.04228571428571429,
"grad_norm": 0.12060169875621796,
"kl": 6.20037317276001e-05,
"learning_rate": 7.4e-07,
"loss": 0.0323,
"reward": 0.4485716000199318,
"reward_std": 0.8753202259540558,
"rewards/cosine_scaled_reward": -0.0361308753490448,
"rewards/format_reward": 0.5208333414047956,
"step": 37
},
{
"completion_length": 3015.5625610351562,
"epoch": 0.04342857142857143,
"grad_norm": 0.10110022872686386,
"kl": 0.00016170740127563477,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0344,
"reward": -0.068646389991045,
"reward_std": 0.6391054093837738,
"rewards/cosine_scaled_reward": -0.22182317543774843,
"rewards/format_reward": 0.37500002048909664,
"step": 38
},
{
"completion_length": 2867.5208740234375,
"epoch": 0.044571428571428574,
"grad_norm": 0.15215592086315155,
"kl": 0.00011932849884033203,
"learning_rate": 7.799999999999999e-07,
"loss": 0.1002,
"reward": 0.14817129005677998,
"reward_std": 0.7805476784706116,
"rewards/cosine_scaled_reward": -0.12383103743195534,
"rewards/format_reward": 0.39583333395421505,
"step": 39
},
{
"completion_length": 3186.5000610351562,
"epoch": 0.045714285714285714,
"grad_norm": 0.11930648982524872,
"kl": 0.00010547041893005371,
"learning_rate": 8e-07,
"loss": 0.0178,
"reward": -0.03248624689877033,
"reward_std": 0.63504558801651,
"rewards/cosine_scaled_reward": -0.16207645926624537,
"rewards/format_reward": 0.29166666977107525,
"step": 40
},
{
"completion_length": 3180.8959350585938,
"epoch": 0.046857142857142854,
"grad_norm": 0.18630492687225342,
"kl": 3.663450479507446e-05,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0553,
"reward": 0.43380990624427795,
"reward_std": 0.8565632924437523,
"rewards/cosine_scaled_reward": 0.018988274037837982,
"rewards/format_reward": 0.3958333432674408,
"step": 41
},
{
"completion_length": 2079.1041870117188,
"epoch": 0.048,
"grad_norm": 0.11225883662700653,
"kl": 0.0004626065492630005,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0429,
"reward": 0.8895847648382187,
"reward_std": 0.764504998922348,
"rewards/cosine_scaled_reward": 0.11145903076976538,
"rewards/format_reward": 0.6666666828095913,
"step": 42
},
{
"completion_length": 3000.166748046875,
"epoch": 0.04914285714285714,
"grad_norm": 0.24759933352470398,
"kl": 0.00012095272541046143,
"learning_rate": 8.599999999999999e-07,
"loss": 0.064,
"reward": 0.32910796254873276,
"reward_std": 1.0378518775105476,
"rewards/cosine_scaled_reward": -0.03336267964914441,
"rewards/format_reward": 0.39583333395421505,
"step": 43
},
{
"completion_length": 2956.9375610351562,
"epoch": 0.05028571428571429,
"grad_norm": 0.28840357065200806,
"kl": 0.0008223056793212891,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0814,
"reward": 0.2166026197373867,
"reward_std": 0.745319314301014,
"rewards/cosine_scaled_reward": -0.10003203712403774,
"rewards/format_reward": 0.4166666828095913,
"step": 44
},
{
"completion_length": 2793.9583740234375,
"epoch": 0.05142857142857143,
"grad_norm": 0.1415959894657135,
"kl": 6.61015510559082e-05,
"learning_rate": 9e-07,
"loss": 0.0791,
"reward": 0.6622170452028513,
"reward_std": 0.8223324418067932,
"rewards/cosine_scaled_reward": 0.08110851421952248,
"rewards/format_reward": 0.5000000074505806,
"step": 45
},
{
"completion_length": 2979.9583740234375,
"epoch": 0.052571428571428575,
"grad_norm": 0.10514923185110092,
"kl": 0.00029647350311279297,
"learning_rate": 9.2e-07,
"loss": 0.0303,
"reward": 0.2565183639526367,
"reward_std": 0.5196356028318405,
"rewards/cosine_scaled_reward": -0.03840749338269234,
"rewards/format_reward": 0.3333333432674408,
"step": 46
},
{
"completion_length": 2650.1458587646484,
"epoch": 0.053714285714285714,
"grad_norm": 0.15202954411506653,
"kl": 0.0002989917993545532,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0978,
"reward": 0.6011475473642349,
"reward_std": 0.908449612557888,
"rewards/cosine_scaled_reward": 0.04015708714723587,
"rewards/format_reward": 0.520833358168602,
"step": 47
},
{
"completion_length": 2887.9584350585938,
"epoch": 0.054857142857142854,
"grad_norm": 0.14365942776203156,
"kl": 0.0003235340118408203,
"learning_rate": 9.6e-07,
"loss": 0.1098,
"reward": 0.3464186545461416,
"reward_std": 0.8909324407577515,
"rewards/cosine_scaled_reward": -0.02470733504742384,
"rewards/format_reward": 0.3958333395421505,
"step": 48
},
{
"completion_length": 2294.9792098999023,
"epoch": 0.056,
"grad_norm": 0.12595273554325104,
"kl": 0.0003814399242401123,
"learning_rate": 9.8e-07,
"loss": 0.0294,
"reward": 0.3887506239116192,
"reward_std": 0.709479071199894,
"rewards/cosine_scaled_reward": -0.08687468431890011,
"rewards/format_reward": 0.5625000074505806,
"step": 49
},
{
"completion_length": 2483.5834350585938,
"epoch": 0.05714285714285714,
"grad_norm": 0.1347932517528534,
"kl": 0.0020999908447265625,
"learning_rate": 1e-06,
"loss": 0.0097,
"reward": 0.48975098691880703,
"reward_std": 0.7372790724039078,
"rewards/cosine_scaled_reward": 0.015708832070231438,
"rewards/format_reward": 0.4583333432674408,
"step": 50
},
{
"completion_length": 3298.0208740234375,
"epoch": 0.05828571428571429,
"grad_norm": 0.12307793647050858,
"kl": 0.0010235309600830078,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0291,
"reward": -0.01613167393952608,
"reward_std": 0.7748741805553436,
"rewards/cosine_scaled_reward": -0.1538991741836071,
"rewards/format_reward": 0.2916666716337204,
"step": 51
},
{
"completion_length": 3464.9375610351562,
"epoch": 0.05942857142857143,
"grad_norm": 0.14545413851737976,
"kl": 0.0018963813781738281,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0448,
"reward": -0.23706040158867836,
"reward_std": 0.7933510839939117,
"rewards/cosine_scaled_reward": -0.18103019893169403,
"rewards/format_reward": 0.12500000186264515,
"step": 52
},
{
"completion_length": 2938.2084350585938,
"epoch": 0.060571428571428575,
"grad_norm": 0.08684064447879791,
"kl": 0.0016429424285888672,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0463,
"reward": -0.056304458528757095,
"reward_std": 0.5842409431934357,
"rewards/cosine_scaled_reward": -0.18440223019570112,
"rewards/format_reward": 0.3125000111758709,
"step": 53
},
{
"completion_length": 2789.2916870117188,
"epoch": 0.061714285714285715,
"grad_norm": 0.060190364718437195,
"kl": 0.0017528533935546875,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0171,
"reward": -0.11311334511265159,
"reward_std": 0.42039141058921814,
"rewards/cosine_scaled_reward": -0.23364001512527466,
"rewards/format_reward": 0.3541666716337204,
"step": 54
},
{
"completion_length": 3271.5625,
"epoch": 0.06285714285714286,
"grad_norm": 0.07129888236522675,
"kl": 0.0009405612945556641,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0159,
"reward": -0.34992948174476624,
"reward_std": 0.4250538572669029,
"rewards/cosine_scaled_reward": -0.24788140505552292,
"rewards/format_reward": 0.14583333395421505,
"step": 55
},
{
"completion_length": 3073.604248046875,
"epoch": 0.064,
"grad_norm": 0.16036204993724823,
"kl": 0.0025844573974609375,
"learning_rate": 9.996052735444862e-07,
"loss": 0.0509,
"reward": 0.015420392155647278,
"reward_std": 0.7796643078327179,
"rewards/cosine_scaled_reward": -0.11728980112820864,
"rewards/format_reward": 0.25000000558793545,
"step": 56
},
{
"completion_length": 3082.9584350585938,
"epoch": 0.06514285714285714,
"grad_norm": 0.14083248376846313,
"kl": 0.010837554931640625,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0506,
"reward": 0.042304279981181026,
"reward_std": 0.7727529257535934,
"rewards/cosine_scaled_reward": -0.13509786408394575,
"rewards/format_reward": 0.3125000111758709,
"step": 57
},
{
"completion_length": 3073.3541870117188,
"epoch": 0.06628571428571428,
"grad_norm": 0.16678181290626526,
"kl": 0.003218412399291992,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0068,
"reward": 0.20121465623378754,
"reward_std": 0.7175656408071518,
"rewards/cosine_scaled_reward": -0.055642676539719105,
"rewards/format_reward": 0.3125000074505806,
"step": 58
},
{
"completion_length": 3008.2709350585938,
"epoch": 0.06742857142857143,
"grad_norm": 0.1475798785686493,
"kl": 0.009433746337890625,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0125,
"reward": 0.4367425888776779,
"reward_std": 0.647830456495285,
"rewards/cosine_scaled_reward": 0.06212127208709717,
"rewards/format_reward": 0.3125000111758709,
"step": 59
},
{
"completion_length": 2855.6666870117188,
"epoch": 0.06857142857142857,
"grad_norm": 0.09679862856864929,
"kl": 0.00621795654296875,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0164,
"reward": 0.47872328013181686,
"reward_std": 0.5911416038870811,
"rewards/cosine_scaled_reward": 0.0622783238068223,
"rewards/format_reward": 0.35416667722165585,
"step": 60
},
{
"completion_length": 2144.3750610351562,
"epoch": 0.06971428571428571,
"grad_norm": 0.05888332054018974,
"kl": 0.0020198822021484375,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0166,
"reward": 1.0081715881824493,
"reward_std": 0.5063923448324203,
"rewards/cosine_scaled_reward": 0.19158576428890228,
"rewards/format_reward": 0.625,
"step": 61
},
{
"completion_length": 3236.3125610351562,
"epoch": 0.07085714285714285,
"grad_norm": 0.16112229228019714,
"kl": 0.0008752346038818359,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0491,
"reward": 0.47718358784914017,
"reward_std": 0.9821799397468567,
"rewards/cosine_scaled_reward": 0.01984177529811859,
"rewards/format_reward": 0.4375000149011612,
"step": 62
},
{
"completion_length": 2407.416748046875,
"epoch": 0.072,
"grad_norm": 0.09190040081739426,
"kl": 0.009485244750976562,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0014,
"reward": 0.6355759827420115,
"reward_std": 0.5608287900686264,
"rewards/cosine_scaled_reward": 0.026121314615011215,
"rewards/format_reward": 0.5833333358168602,
"step": 63
},
{
"completion_length": 3042.2708740234375,
"epoch": 0.07314285714285715,
"grad_norm": 0.11311787366867065,
"kl": 0.0009531974792480469,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0158,
"reward": 0.624295711517334,
"reward_std": 0.6829620823264122,
"rewards/cosine_scaled_reward": 0.1142311654984951,
"rewards/format_reward": 0.3958333358168602,
"step": 64
},
{
"completion_length": 2979.9376220703125,
"epoch": 0.07428571428571429,
"grad_norm": 0.17287743091583252,
"kl": 0.008108139038085938,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0803,
"reward": 0.3465092070400715,
"reward_std": 0.8748672604560852,
"rewards/cosine_scaled_reward": -0.01424538716673851,
"rewards/format_reward": 0.37500000558793545,
"step": 65
},
{
"completion_length": 3180.5833740234375,
"epoch": 0.07542857142857143,
"grad_norm": 0.13114774227142334,
"kl": 0.0013761520385742188,
"learning_rate": 9.971955636222684e-07,
"loss": 0.011,
"reward": 0.197968615218997,
"reward_std": 0.808275930583477,
"rewards/cosine_scaled_reward": -0.07809901610016823,
"rewards/format_reward": 0.3541666679084301,
"step": 66
},
{
"completion_length": 3045.2709350585938,
"epoch": 0.07657142857142857,
"grad_norm": 0.16203086078166962,
"kl": 0.0018739700317382812,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0847,
"reward": 0.6482307966798544,
"reward_std": 1.029038056731224,
"rewards/cosine_scaled_reward": 0.10536541882902384,
"rewards/format_reward": 0.43750001676380634,
"step": 67
},
{
"completion_length": 2498.9166870117188,
"epoch": 0.07771428571428571,
"grad_norm": 0.06138293072581291,
"kl": 0.00598907470703125,
"learning_rate": 9.964516155915151e-07,
"loss": -0.0034,
"reward": 0.12972787162289023,
"reward_std": 0.5004179775714874,
"rewards/cosine_scaled_reward": -0.12263606488704681,
"rewards/format_reward": 0.375,
"step": 68
},
{
"completion_length": 2794.0834350585938,
"epoch": 0.07885714285714286,
"grad_norm": 0.1431104838848114,
"kl": 0.005124092102050781,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0283,
"reward": 0.5150027610361576,
"reward_std": 0.6274815611541271,
"rewards/cosine_scaled_reward": 0.01791803538799286,
"rewards/format_reward": 0.4791666716337204,
"step": 69
},
{
"completion_length": 3173.8125610351562,
"epoch": 0.08,
"grad_norm": 0.146661639213562,
"kl": 0.0033349990844726562,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0731,
"reward": 0.23033593781292439,
"reward_std": 0.7032231390476227,
"rewards/cosine_scaled_reward": -0.04108203295618296,
"rewards/format_reward": 0.31250000186264515,
"step": 70
},
{
"completion_length": 3088.0834350585938,
"epoch": 0.08114285714285714,
"grad_norm": 0.1698896586894989,
"kl": 0.005756378173828125,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0857,
"reward": 0.4810620807111263,
"reward_std": 0.7472349628806114,
"rewards/cosine_scaled_reward": 0.032197702676057816,
"rewards/format_reward": 0.4166666828095913,
"step": 71
},
{
"completion_length": 2835.9583740234375,
"epoch": 0.08228571428571428,
"grad_norm": 0.15748044848442078,
"kl": 0.005644321441650391,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0414,
"reward": 0.39926697919145226,
"reward_std": 0.7735992036759853,
"rewards/cosine_scaled_reward": -0.029533179476857185,
"rewards/format_reward": 0.4583333395421505,
"step": 72
},
{
"completion_length": 2668.854248046875,
"epoch": 0.08342857142857144,
"grad_norm": 0.2273511439561844,
"kl": 0.0141448974609375,
"learning_rate": 9.942113192828444e-07,
"loss": 0.1186,
"reward": 0.6719660833477974,
"reward_std": 0.9455910921096802,
"rewards/cosine_scaled_reward": 0.0859830379486084,
"rewards/format_reward": 0.5000000298023224,
"step": 73
},
{
"completion_length": 2737.291748046875,
"epoch": 0.08457142857142858,
"grad_norm": 0.16039791703224182,
"kl": 0.007320404052734375,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0567,
"reward": 0.31655584648251534,
"reward_std": 0.6061973124742508,
"rewards/cosine_scaled_reward": -0.03963874280452728,
"rewards/format_reward": 0.3958333395421505,
"step": 74
},
{
"completion_length": 2990.854248046875,
"epoch": 0.08571428571428572,
"grad_norm": 0.22528968751430511,
"kl": 0.007213592529296875,
"learning_rate": 9.931634888554935e-07,
"loss": 0.1029,
"reward": 0.07040337100625038,
"reward_std": 0.8260042667388916,
"rewards/cosine_scaled_reward": -0.10021498240530491,
"rewards/format_reward": 0.27083333767950535,
"step": 75
},
{
"completion_length": 2957.6459350585938,
"epoch": 0.08685714285714285,
"grad_norm": 0.12294893711805344,
"kl": 0.0023813247680664062,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0543,
"reward": 0.28933531790971756,
"reward_std": 0.7524442374706268,
"rewards/cosine_scaled_reward": -0.04283232241868973,
"rewards/format_reward": 0.3750000074505806,
"step": 76
},
{
"completion_length": 3163.9583740234375,
"epoch": 0.088,
"grad_norm": 0.09998784214258194,
"kl": 0.003734588623046875,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0151,
"reward": 0.43463192135095596,
"reward_std": 0.6034069135785103,
"rewards/cosine_scaled_reward": 0.0506493030115962,
"rewards/format_reward": 0.3333333432674408,
"step": 77
},
{
"completion_length": 2789.729217529297,
"epoch": 0.08914285714285715,
"grad_norm": 0.1028476133942604,
"kl": 0.0034427642822265625,
"learning_rate": 9.91429819907136e-07,
"loss": 0.009,
"reward": 0.49053217470645905,
"reward_std": 0.671901747584343,
"rewards/cosine_scaled_reward": 0.005682730115950108,
"rewards/format_reward": 0.4791666716337204,
"step": 78
},
{
"completion_length": 3021.9584350585938,
"epoch": 0.09028571428571429,
"grad_norm": 0.14524304866790771,
"kl": 0.002349853515625,
"learning_rate": 9.908088623197048e-07,
"loss": 0.0437,
"reward": 0.18685297295451164,
"reward_std": 0.82758379727602,
"rewards/cosine_scaled_reward": -0.10449018701910973,
"rewards/format_reward": 0.3958333469927311,
"step": 79
},
{
"completion_length": 3422.2916870117188,
"epoch": 0.09142857142857143,
"grad_norm": 0.35899317264556885,
"kl": 0.0026226043701171875,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0522,
"reward": -0.14088810980319977,
"reward_std": 0.6001620069146156,
"rewards/cosine_scaled_reward": -0.14336072688456625,
"rewards/format_reward": 0.1458333358168602,
"step": 80
},
{
"completion_length": 3328.1458740234375,
"epoch": 0.09257142857142857,
"grad_norm": 0.1413203924894333,
"kl": 0.003086090087890625,
"learning_rate": 9.895025252503755e-07,
"loss": -0.0009,
"reward": 0.20990341156721115,
"reward_std": 0.7368708997964859,
"rewards/cosine_scaled_reward": -0.04088162397965789,
"rewards/format_reward": 0.29166668094694614,
"step": 81
},
{
"completion_length": 3138.6041870117188,
"epoch": 0.09371428571428571,
"grad_norm": 0.5630224943161011,
"kl": 0.0060558319091796875,
"learning_rate": 9.888172094375033e-07,
"loss": 0.0801,
"reward": 0.005654335021972656,
"reward_std": 0.7520733773708344,
"rewards/cosine_scaled_reward": -0.14300616830587387,
"rewards/format_reward": 0.2916666716337204,
"step": 82
},
{
"completion_length": 3398.0833740234375,
"epoch": 0.09485714285714286,
"grad_norm": 0.09970960766077042,
"kl": 0.0034198760986328125,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0172,
"reward": -0.2690254710614681,
"reward_std": 0.6017113700509071,
"rewards/cosine_scaled_reward": -0.2490960769355297,
"rewards/format_reward": 0.22916667722165585,
"step": 83
},
{
"completion_length": 2666.2501220703125,
"epoch": 0.096,
"grad_norm": 0.2184879034757614,
"kl": 0.002471923828125,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0858,
"reward": 1.2698333784937859,
"reward_std": 1.1699798554182053,
"rewards/cosine_scaled_reward": 0.3119999971240759,
"rewards/format_reward": 0.6458333656191826,
"step": 84
},
{
"completion_length": 3092.9791870117188,
"epoch": 0.09714285714285714,
"grad_norm": 0.11792045831680298,
"kl": 0.0024585723876953125,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0172,
"reward": 0.3196272477507591,
"reward_std": 0.7417704239487648,
"rewards/cosine_scaled_reward": -0.017269723117351532,
"rewards/format_reward": 0.354166679084301,
"step": 85
},
{
"completion_length": 3099.604248046875,
"epoch": 0.09828571428571428,
"grad_norm": 0.12413817644119263,
"kl": 0.004852294921875,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0517,
"reward": -0.07946242019534111,
"reward_std": 0.5531802475452423,
"rewards/cosine_scaled_reward": -0.1751478873193264,
"rewards/format_reward": 0.2708333432674408,
"step": 86
},
{
"completion_length": 3024.354278564453,
"epoch": 0.09942857142857142,
"grad_norm": 0.12308648228645325,
"kl": 0.006999969482421875,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0426,
"reward": 0.1297205686569214,
"reward_std": 0.7171878144145012,
"rewards/cosine_scaled_reward": -0.12263973196968436,
"rewards/format_reward": 0.37500000186264515,
"step": 87
},
{
"completion_length": 2875.6875610351562,
"epoch": 0.10057142857142858,
"grad_norm": 0.1610432118177414,
"kl": 0.014064788818359375,
"learning_rate": 9.8425742251254e-07,
"loss": 0.0297,
"reward": 0.6831055271031801,
"reward_std": 0.7087237983942032,
"rewards/cosine_scaled_reward": 0.0811360776424408,
"rewards/format_reward": 0.5208333432674408,
"step": 88
},
{
"completion_length": 3291.3959350585938,
"epoch": 0.10171428571428572,
"grad_norm": 0.14732913672924042,
"kl": 0.004520416259765625,
"learning_rate": 9.83423155058946e-07,
"loss": 0.063,
"reward": 0.3873383179306984,
"reward_std": 0.9104212373495102,
"rewards/cosine_scaled_reward": 0.0374191589653492,
"rewards/format_reward": 0.3125000111758709,
"step": 89
},
{
"completion_length": 3100.7500610351562,
"epoch": 0.10285714285714286,
"grad_norm": 0.09902340173721313,
"kl": 0.005191802978515625,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0212,
"reward": 0.2355214934796095,
"reward_std": 0.5521544776856899,
"rewards/cosine_scaled_reward": -0.03848925232887268,
"rewards/format_reward": 0.3125000074505806,
"step": 90
},
{
"completion_length": 3321.3958740234375,
"epoch": 0.104,
"grad_norm": 0.11201111227273941,
"kl": 0.0046215057373046875,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0302,
"reward": 0.06314115412533283,
"reward_std": 0.6101053357124329,
"rewards/cosine_scaled_reward": -0.1246794331818819,
"rewards/format_reward": 0.31250000558793545,
"step": 91
},
{
"completion_length": 2866.9375610351562,
"epoch": 0.10514285714285715,
"grad_norm": 0.08195216953754425,
"kl": 0.00637054443359375,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0374,
"reward": 0.2856922000646591,
"reward_std": 0.6180723085999489,
"rewards/cosine_scaled_reward": -0.09673722740262747,
"rewards/format_reward": 0.4791666716337204,
"step": 92
},
{
"completion_length": 2626.8333740234375,
"epoch": 0.10628571428571429,
"grad_norm": 0.0848076120018959,
"kl": 0.00502777099609375,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0158,
"reward": 0.47025431878864765,
"reward_std": 0.5611053630709648,
"rewards/cosine_scaled_reward": 0.005960509181022644,
"rewards/format_reward": 0.4583333432674408,
"step": 93
},
{
"completion_length": 3384.666748046875,
"epoch": 0.10742857142857143,
"grad_norm": 0.11509731411933899,
"kl": 0.005451202392578125,
"learning_rate": 9.78935800506826e-07,
"loss": 0.0334,
"reward": 0.010346372611820698,
"reward_std": 0.6185438930988312,
"rewards/cosine_scaled_reward": -0.09899348951876163,
"rewards/format_reward": 0.2083333358168602,
"step": 94
},
{
"completion_length": 3308.729248046875,
"epoch": 0.10857142857142857,
"grad_norm": 0.13493004441261292,
"kl": 0.00511932373046875,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0494,
"reward": -0.04175245389342308,
"reward_std": 0.819076806306839,
"rewards/cosine_scaled_reward": -0.14587622694671154,
"rewards/format_reward": 0.2500000111758709,
"step": 95
},
{
"completion_length": 2638.8333740234375,
"epoch": 0.10971428571428571,
"grad_norm": 0.1093597412109375,
"kl": 0.006412506103515625,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0613,
"reward": 0.2132774479687214,
"reward_std": 0.6241517812013626,
"rewards/cosine_scaled_reward": -0.1121112871915102,
"rewards/format_reward": 0.4375000149011612,
"step": 96
},
{
"completion_length": 3025.687530517578,
"epoch": 0.11085714285714286,
"grad_norm": 0.14619475603103638,
"kl": 0.017696380615234375,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0518,
"reward": 0.21731913276016712,
"reward_std": 0.8663276582956314,
"rewards/cosine_scaled_reward": -0.058007098734378815,
"rewards/format_reward": 0.33333334513008595,
"step": 97
},
{
"completion_length": 2998.0833740234375,
"epoch": 0.112,
"grad_norm": 0.08425849676132202,
"kl": 0.011322021484375,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0316,
"reward": -0.0604003369808197,
"reward_std": 0.4831971898674965,
"rewards/cosine_scaled_reward": -0.17603351920843124,
"rewards/format_reward": 0.2916666716337204,
"step": 98
},
{
"completion_length": 2882.8958435058594,
"epoch": 0.11314285714285714,
"grad_norm": 0.10733813792467117,
"kl": 0.0042572021484375,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0469,
"reward": 0.37905219942331314,
"reward_std": 0.6325190886855125,
"rewards/cosine_scaled_reward": 0.012442763894796371,
"rewards/format_reward": 0.35416667722165585,
"step": 99
},
{
"completion_length": 3077.979248046875,
"epoch": 0.11428571428571428,
"grad_norm": 0.22007572650909424,
"kl": 0.00611114501953125,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0684,
"reward": 0.15971739403903484,
"reward_std": 0.8245379701256752,
"rewards/cosine_scaled_reward": -0.0972246453166008,
"rewards/format_reward": 0.3541666716337204,
"step": 100
},
{
"completion_length": 3149.5000610351562,
"epoch": 0.11542857142857142,
"grad_norm": 0.17998212575912476,
"kl": 0.0086212158203125,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0924,
"reward": -0.043516192585229874,
"reward_std": 0.7394061759114265,
"rewards/cosine_scaled_reward": -0.1467580944299698,
"rewards/format_reward": 0.25000001303851604,
"step": 101
},
{
"completion_length": 2830.2500610351562,
"epoch": 0.11657142857142858,
"grad_norm": 0.10636850446462631,
"kl": 0.006778717041015625,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0118,
"reward": 0.20193170942366123,
"reward_std": 0.5816469639539719,
"rewards/cosine_scaled_reward": -0.09695081505924463,
"rewards/format_reward": 0.3958333358168602,
"step": 102
},
{
"completion_length": 3253.354248046875,
"epoch": 0.11771428571428572,
"grad_norm": 0.10601601004600525,
"kl": 0.0059051513671875,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0393,
"reward": 0.16332483664155006,
"reward_std": 0.7165435254573822,
"rewards/cosine_scaled_reward": -0.07458756864070892,
"rewards/format_reward": 0.3125000149011612,
"step": 103
},
{
"completion_length": 2784.0416870117188,
"epoch": 0.11885714285714286,
"grad_norm": 0.14525532722473145,
"kl": 0.00762176513671875,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0257,
"reward": 0.6941813006997108,
"reward_std": 0.731097511947155,
"rewards/cosine_scaled_reward": 0.13875730894505978,
"rewards/format_reward": 0.41666667722165585,
"step": 104
},
{
"completion_length": 3037.291748046875,
"epoch": 0.12,
"grad_norm": 0.10406464338302612,
"kl": 0.0091552734375,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0217,
"reward": -0.03945709019899368,
"reward_std": 0.5527790486812592,
"rewards/cosine_scaled_reward": -0.14472855255007744,
"rewards/format_reward": 0.25000000186264515,
"step": 105
},
{
"completion_length": 3007.6250610351562,
"epoch": 0.12114285714285715,
"grad_norm": 0.1392635703086853,
"kl": 0.00736236572265625,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0638,
"reward": 0.2589884400367737,
"reward_std": 0.8927985578775406,
"rewards/cosine_scaled_reward": -0.05800577998161316,
"rewards/format_reward": 0.3750000111758709,
"step": 106
},
{
"completion_length": 2705.52099609375,
"epoch": 0.12228571428571429,
"grad_norm": 0.19877693057060242,
"kl": 0.00640869140625,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0852,
"reward": 0.42868572287261486,
"reward_std": 0.7907231077551842,
"rewards/cosine_scaled_reward": -0.025240465998649597,
"rewards/format_reward": 0.4791666716337204,
"step": 107
},
{
"completion_length": 2601.9793090820312,
"epoch": 0.12342857142857143,
"grad_norm": 0.1907849907875061,
"kl": 0.010498046875,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0862,
"reward": 1.0781057141721249,
"reward_std": 0.926390677690506,
"rewards/cosine_scaled_reward": 0.2578028216958046,
"rewards/format_reward": 0.5625,
"step": 108
},
{
"completion_length": 2873.500030517578,
"epoch": 0.12457142857142857,
"grad_norm": 0.12728413939476013,
"kl": 0.00748443603515625,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0443,
"reward": 0.2420949712395668,
"reward_std": 0.6641058176755905,
"rewards/cosine_scaled_reward": -0.0872858352959156,
"rewards/format_reward": 0.416666679084301,
"step": 109
},
{
"completion_length": 2759.041748046875,
"epoch": 0.12571428571428572,
"grad_norm": 0.3926822543144226,
"kl": 0.0103759765625,
"learning_rate": 9.610954559391704e-07,
"loss": 0.051,
"reward": 0.7485219649970531,
"reward_std": 1.0151629000902176,
"rewards/cosine_scaled_reward": 0.07217762316577137,
"rewards/format_reward": 0.6041666865348816,
"step": 110
},
{
"completion_length": 1981.291748046875,
"epoch": 0.12685714285714286,
"grad_norm": 0.13100939989089966,
"kl": 0.0102081298828125,
"learning_rate": 9.598076473627796e-07,
"loss": -0.018,
"reward": 0.8073812872171402,
"reward_std": 0.8186813145875931,
"rewards/cosine_scaled_reward": 0.028690634877420962,
"rewards/format_reward": 0.7500000149011612,
"step": 111
},
{
"completion_length": 3106.9583740234375,
"epoch": 0.128,
"grad_norm": 0.18594208359718323,
"kl": 0.01175689697265625,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0913,
"reward": 0.3422376364469528,
"reward_std": 0.8253115490078926,
"rewards/cosine_scaled_reward": -0.016381196677684784,
"rewards/format_reward": 0.3750000223517418,
"step": 112
},
{
"completion_length": 2985.0208740234375,
"epoch": 0.12914285714285714,
"grad_norm": 0.10086725652217865,
"kl": 0.0164794921875,
"learning_rate": 9.571721736097088e-07,
"loss": 0.026,
"reward": 0.6304376311600208,
"reward_std": 0.6578450873494148,
"rewards/cosine_scaled_reward": 0.10688545554876328,
"rewards/format_reward": 0.41666667722165585,
"step": 113
},
{
"completion_length": 2085.0416870117188,
"epoch": 0.13028571428571428,
"grad_norm": 0.1902545839548111,
"kl": 0.0139923095703125,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0715,
"reward": 0.9513098001480103,
"reward_std": 0.9133107215166092,
"rewards/cosine_scaled_reward": 0.13190488796681166,
"rewards/format_reward": 0.6875000149011612,
"step": 114
},
{
"completion_length": 3096.6250610351562,
"epoch": 0.13142857142857142,
"grad_norm": 0.1532527357339859,
"kl": 0.01084136962890625,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0594,
"reward": 0.5540619897656143,
"reward_std": 0.9744190573692322,
"rewards/cosine_scaled_reward": 0.05828099511563778,
"rewards/format_reward": 0.43750000558793545,
"step": 115
},
{
"completion_length": 3017.5833740234375,
"epoch": 0.13257142857142856,
"grad_norm": 0.1256159394979477,
"kl": 0.012542724609375,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0515,
"reward": -0.0034197866916656494,
"reward_std": 0.6141533181071281,
"rewards/cosine_scaled_reward": -0.12670988403260708,
"rewards/format_reward": 0.25,
"step": 116
},
{
"completion_length": 3360.916748046875,
"epoch": 0.1337142857142857,
"grad_norm": 0.13030223548412323,
"kl": 0.010650634765625,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0416,
"reward": -0.01844558771699667,
"reward_std": 0.7771024033427238,
"rewards/cosine_scaled_reward": -0.13422280363738537,
"rewards/format_reward": 0.2500000111758709,
"step": 117
},
{
"completion_length": 2493.2709350585938,
"epoch": 0.13485714285714287,
"grad_norm": 0.10445129871368408,
"kl": 0.01708984375,
"learning_rate": 9.502373679810839e-07,
"loss": 0.045,
"reward": 0.7345311008393764,
"reward_std": 0.6608476266264915,
"rewards/cosine_scaled_reward": 0.054765526205301285,
"rewards/format_reward": 0.6250000055879354,
"step": 118
},
{
"completion_length": 2583.8334045410156,
"epoch": 0.136,
"grad_norm": 0.1751917004585266,
"kl": 0.01385498046875,
"learning_rate": 9.487916106540465e-07,
"loss": 0.1151,
"reward": 0.16756585985422134,
"reward_std": 0.6609668508172035,
"rewards/cosine_scaled_reward": -0.1558004072867334,
"rewards/format_reward": 0.4791666939854622,
"step": 119
},
{
"completion_length": 3435.6458740234375,
"epoch": 0.13714285714285715,
"grad_norm": 0.20698896050453186,
"kl": 0.01154327392578125,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0604,
"reward": -0.05263599753379822,
"reward_std": 1.0508478283882141,
"rewards/cosine_scaled_reward": -0.17215134110301733,
"rewards/format_reward": 0.2916666716337204,
"step": 120
},
{
"completion_length": 3101.875,
"epoch": 0.1382857142857143,
"grad_norm": 0.10516638308763504,
"kl": 0.012359619140625,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0121,
"reward": 0.045499179512262344,
"reward_std": 0.5043403655290604,
"rewards/cosine_scaled_reward": -0.10225043445825577,
"rewards/format_reward": 0.2500000111758709,
"step": 121
},
{
"completion_length": 3044.3541870117188,
"epoch": 0.13942857142857143,
"grad_norm": 0.10074342042207718,
"kl": 0.019744873046875,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0442,
"reward": 0.021381250582635403,
"reward_std": 0.5577950775623322,
"rewards/cosine_scaled_reward": -0.13514270819723606,
"rewards/format_reward": 0.2916666716337204,
"step": 122
},
{
"completion_length": 2913.7083740234375,
"epoch": 0.14057142857142857,
"grad_norm": 0.14308768510818481,
"kl": 0.0152587890625,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0878,
"reward": 0.12965750694274902,
"reward_std": 0.736047625541687,
"rewards/cosine_scaled_reward": -0.09142125025391579,
"rewards/format_reward": 0.31250000558793545,
"step": 123
},
{
"completion_length": 2562.5000610351562,
"epoch": 0.1417142857142857,
"grad_norm": 0.19142040610313416,
"kl": 0.01031494140625,
"learning_rate": 9.412727182773486e-07,
"loss": 0.065,
"reward": 0.8353077471256256,
"reward_std": 1.026055485010147,
"rewards/cosine_scaled_reward": 0.12598720658570528,
"rewards/format_reward": 0.5833333507180214,
"step": 124
},
{
"completion_length": 3017.6251220703125,
"epoch": 0.14285714285714285,
"grad_norm": 0.117274209856987,
"kl": 0.009552001953125,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0076,
"reward": 0.1632972015067935,
"reward_std": 0.5557524636387825,
"rewards/cosine_scaled_reward": -0.10585140064358711,
"rewards/format_reward": 0.37500000558793545,
"step": 125
},
{
"completion_length": 2858.8334350585938,
"epoch": 0.144,
"grad_norm": 0.2655041217803955,
"kl": 0.01821136474609375,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0873,
"reward": 0.30082017183303833,
"reward_std": 0.9569597989320755,
"rewards/cosine_scaled_reward": -0.06833992386236787,
"rewards/format_reward": 0.4375000074505806,
"step": 126
},
{
"completion_length": 2871.2083435058594,
"epoch": 0.14514285714285713,
"grad_norm": 0.0872960090637207,
"kl": 0.0139007568359375,
"learning_rate": 9.36531953618799e-07,
"loss": -0.0046,
"reward": 0.1562192291021347,
"reward_std": 0.58997593075037,
"rewards/cosine_scaled_reward": -0.10939039289951324,
"rewards/format_reward": 0.37500000558793545,
"step": 127
},
{
"completion_length": 3199.2291870117188,
"epoch": 0.1462857142857143,
"grad_norm": 0.21217796206474304,
"kl": 0.020172119140625,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0995,
"reward": 0.07891843095421791,
"reward_std": 0.858635775744915,
"rewards/cosine_scaled_reward": -0.10637411894276738,
"rewards/format_reward": 0.2916666716337204,
"step": 128
},
{
"completion_length": 2658.1458435058594,
"epoch": 0.14742857142857144,
"grad_norm": 0.13081493973731995,
"kl": 0.0191650390625,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0418,
"reward": 0.2748406231403351,
"reward_std": 0.6719504073262215,
"rewards/cosine_scaled_reward": -0.10216302564367652,
"rewards/format_reward": 0.4791666753590107,
"step": 129
},
{
"completion_length": 3460.2291870117188,
"epoch": 0.14857142857142858,
"grad_norm": 0.12681667506694794,
"kl": 0.01409912109375,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0399,
"reward": -0.11496437340974808,
"reward_std": 0.6864899545907974,
"rewards/cosine_scaled_reward": -0.15123217983637005,
"rewards/format_reward": 0.18750000558793545,
"step": 130
},
{
"completion_length": 3362.3750610351562,
"epoch": 0.14971428571428572,
"grad_norm": 0.12439722567796707,
"kl": 0.01568603515625,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0316,
"reward": 0.17270515114068985,
"reward_std": 0.636282742023468,
"rewards/cosine_scaled_reward": -0.01781410351395607,
"rewards/format_reward": 0.20833334513008595,
"step": 131
},
{
"completion_length": 3433.3333740234375,
"epoch": 0.15085714285714286,
"grad_norm": 0.13320712745189667,
"kl": 0.020172119140625,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0099,
"reward": -0.2513204962015152,
"reward_std": 0.6501054912805557,
"rewards/cosine_scaled_reward": -0.2298269160091877,
"rewards/format_reward": 0.2083333432674408,
"step": 132
},
{
"completion_length": 3220.1459350585938,
"epoch": 0.152,
"grad_norm": 0.17302778363227844,
"kl": 0.01995849609375,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0398,
"reward": -0.11768799647688866,
"reward_std": 0.6951716169714928,
"rewards/cosine_scaled_reward": -0.22551067918539047,
"rewards/format_reward": 0.3333333544433117,
"step": 133
},
{
"completion_length": 2314.5416870117188,
"epoch": 0.15314285714285714,
"grad_norm": 0.0858488380908966,
"kl": 0.025665283203125,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0092,
"reward": 0.602238692343235,
"reward_std": 0.563841238617897,
"rewards/cosine_scaled_reward": -0.011380670592188835,
"rewards/format_reward": 0.625,
"step": 134
},
{
"completion_length": 2965.8750610351562,
"epoch": 0.15428571428571428,
"grad_norm": 0.17062057554721832,
"kl": 0.019134521484375,
"learning_rate": 9.230669076497687e-07,
"loss": 0.045,
"reward": 0.17675711959600449,
"reward_std": 0.5801602862775326,
"rewards/cosine_scaled_reward": -0.05745477043092251,
"rewards/format_reward": 0.2916666679084301,
"step": 135
},
{
"completion_length": 2485.8334045410156,
"epoch": 0.15542857142857142,
"grad_norm": 0.13649305701255798,
"kl": 0.021697998046875,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0181,
"reward": 0.6858363393694162,
"reward_std": 0.8353622853755951,
"rewards/cosine_scaled_reward": 0.009584830142557621,
"rewards/format_reward": 0.6666666865348816,
"step": 136
},
{
"completion_length": 3372.6043090820312,
"epoch": 0.15657142857142858,
"grad_norm": 0.12744168937206268,
"kl": 0.0316314697265625,
"learning_rate": 9.195171441101668e-07,
"loss": 0.0593,
"reward": -0.08887681737542152,
"reward_std": 0.6366704031825066,
"rewards/cosine_scaled_reward": -0.1486050896346569,
"rewards/format_reward": 0.2083333358168602,
"step": 137
},
{
"completion_length": 2583.1250915527344,
"epoch": 0.15771428571428572,
"grad_norm": 0.1180926188826561,
"kl": 0.0181732177734375,
"learning_rate": 9.177152042508077e-07,
"loss": 0.035,
"reward": 0.8022582903504372,
"reward_std": 0.7210212647914886,
"rewards/cosine_scaled_reward": 0.11987911909818649,
"rewards/format_reward": 0.5625000111758709,
"step": 138
},
{
"completion_length": 2719.5208435058594,
"epoch": 0.15885714285714286,
"grad_norm": 0.13920994102954865,
"kl": 0.0205841064453125,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0124,
"reward": 0.43635744601488113,
"reward_std": 0.7499766424298286,
"rewards/cosine_scaled_reward": -0.042237947694957256,
"rewards/format_reward": 0.520833345130086,
"step": 139
},
{
"completion_length": 2655.6251220703125,
"epoch": 0.16,
"grad_norm": 0.12660294771194458,
"kl": 0.01995849609375,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0166,
"reward": 0.6822620648890734,
"reward_std": 0.6412546709179878,
"rewards/cosine_scaled_reward": 0.049464356154203415,
"rewards/format_reward": 0.5833333432674408,
"step": 140
},
{
"completion_length": 2931.5208740234375,
"epoch": 0.16114285714285714,
"grad_norm": 0.21838468313217163,
"kl": 0.023284912109375,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0897,
"reward": 0.5501389801502228,
"reward_std": 0.931708961725235,
"rewards/cosine_scaled_reward": 0.05631948262453079,
"rewards/format_reward": 0.4375000074505806,
"step": 141
},
{
"completion_length": 2731.5209350585938,
"epoch": 0.16228571428571428,
"grad_norm": 0.1206783875823021,
"kl": 0.0212860107421875,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0285,
"reward": 0.44770222902297974,
"reward_std": 0.6320216841995716,
"rewards/cosine_scaled_reward": 0.01551777683198452,
"rewards/format_reward": 0.41666667722165585,
"step": 142
},
{
"completion_length": 3003.0833740234375,
"epoch": 0.16342857142857142,
"grad_norm": 0.1385820508003235,
"kl": 0.0207366943359375,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0375,
"reward": 0.36010952293872833,
"reward_std": 0.6810671910643578,
"rewards/cosine_scaled_reward": -0.007445234805345535,
"rewards/format_reward": 0.37500001676380634,
"step": 143
},
{
"completion_length": 3069.2084350585938,
"epoch": 0.16457142857142856,
"grad_norm": 0.24886491894721985,
"kl": 0.023193359375,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0784,
"reward": 0.5813055820763111,
"reward_std": 1.03695610165596,
"rewards/cosine_scaled_reward": 0.030236128717660904,
"rewards/format_reward": 0.520833358168602,
"step": 144
},
{
"completion_length": 3248.541748046875,
"epoch": 0.1657142857142857,
"grad_norm": 0.27944961190223694,
"kl": 0.026947021484375,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0821,
"reward": 0.31095648277550936,
"reward_std": 1.044460952281952,
"rewards/cosine_scaled_reward": -0.0007717590779066086,
"rewards/format_reward": 0.3125000037252903,
"step": 145
},
{
"completion_length": 3391.2918090820312,
"epoch": 0.16685714285714287,
"grad_norm": 0.1663837432861328,
"kl": 0.033966064453125,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0146,
"reward": 0.22623740322887897,
"reward_std": 0.7760383784770966,
"rewards/cosine_scaled_reward": -0.10563132539391518,
"rewards/format_reward": 0.4375000111758709,
"step": 146
},
{
"completion_length": 2764.3750610351562,
"epoch": 0.168,
"grad_norm": 0.15888190269470215,
"kl": 0.0340576171875,
"learning_rate": 9.007020842191634e-07,
"loss": 0.02,
"reward": 0.43453994020819664,
"reward_std": 0.6980537474155426,
"rewards/cosine_scaled_reward": -0.011896707117557526,
"rewards/format_reward": 0.4583333395421505,
"step": 147
},
{
"completion_length": 3387.8125610351562,
"epoch": 0.16914285714285715,
"grad_norm": 0.1351691633462906,
"kl": 0.0419921875,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0369,
"reward": -0.3159765365999192,
"reward_std": 0.5913000628352165,
"rewards/cosine_scaled_reward": -0.2517382688820362,
"rewards/format_reward": 0.18750000558793545,
"step": 148
},
{
"completion_length": 3070.1458740234375,
"epoch": 0.1702857142857143,
"grad_norm": 0.13587744534015656,
"kl": 0.0269927978515625,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0154,
"reward": 0.8058477342128754,
"reward_std": 0.6222796887159348,
"rewards/cosine_scaled_reward": 0.17375719547271729,
"rewards/format_reward": 0.4583333432674408,
"step": 149
},
{
"completion_length": 2798.2084350585938,
"epoch": 0.17142857142857143,
"grad_norm": 0.15157835185527802,
"kl": 0.03900146484375,
"learning_rate": 8.9471999940354e-07,
"loss": 0.038,
"reward": 0.05908125883433968,
"reward_std": 0.7354179471731186,
"rewards/cosine_scaled_reward": -0.21004271879792213,
"rewards/format_reward": 0.4791666679084301,
"step": 150
},
{
"completion_length": 2979.8541870117188,
"epoch": 0.17257142857142857,
"grad_norm": 0.08188746124505997,
"kl": 0.028045654296875,
"learning_rate": 8.926922383915315e-07,
"loss": -0.004,
"reward": -0.011732706800103188,
"reward_std": 0.44251058250665665,
"rewards/cosine_scaled_reward": -0.1621163571253419,
"rewards/format_reward": 0.3125,
"step": 151
},
{
"completion_length": 3252.1458740234375,
"epoch": 0.1737142857142857,
"grad_norm": 0.17107248306274414,
"kl": 0.039520263671875,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0199,
"reward": 0.2528679259121418,
"reward_std": 0.782855249941349,
"rewards/cosine_scaled_reward": 0.0014339573681354523,
"rewards/format_reward": 0.2500000111758709,
"step": 152
},
{
"completion_length": 2536.1458740234375,
"epoch": 0.17485714285714285,
"grad_norm": 0.13167858123779297,
"kl": 0.02642822265625,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0437,
"reward": 0.8370774015784264,
"reward_std": 0.7839193791151047,
"rewards/cosine_scaled_reward": 0.08520536310970783,
"rewards/format_reward": 0.6666666865348816,
"step": 153
},
{
"completion_length": 2881.3125610351562,
"epoch": 0.176,
"grad_norm": 0.20085100829601288,
"kl": 0.0284423828125,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0533,
"reward": 0.6227563321590424,
"reward_std": 0.8027107864618301,
"rewards/cosine_scaled_reward": 0.0717947967350483,
"rewards/format_reward": 0.4791666828095913,
"step": 154
},
{
"completion_length": 3419.541748046875,
"epoch": 0.17714285714285713,
"grad_norm": 0.1839601695537567,
"kl": 0.03076171875,
"learning_rate": 8.844151714648274e-07,
"loss": -0.0021,
"reward": 0.12460730504244566,
"reward_std": 0.942700669169426,
"rewards/cosine_scaled_reward": -0.07311302423477173,
"rewards/format_reward": 0.2708333432674408,
"step": 155
},
{
"completion_length": 3294.9791870117188,
"epoch": 0.1782857142857143,
"grad_norm": 0.15247705578804016,
"kl": 0.03961181640625,
"learning_rate": 8.823049032816478e-07,
"loss": 0.0569,
"reward": -0.32880749367177486,
"reward_std": 0.5324635952711105,
"rewards/cosine_scaled_reward": -0.2685704119503498,
"rewards/format_reward": 0.20833334140479565,
"step": 156
},
{
"completion_length": 2784.7916870117188,
"epoch": 0.17942857142857144,
"grad_norm": 0.29496413469314575,
"kl": 0.0323944091796875,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0628,
"reward": 0.37049394473433495,
"reward_std": 1.1466023474931717,
"rewards/cosine_scaled_reward": -0.07516971230506897,
"rewards/format_reward": 0.5208333432674408,
"step": 157
},
{
"completion_length": 3182.6875610351562,
"epoch": 0.18057142857142858,
"grad_norm": 0.1029396653175354,
"kl": 0.04150390625,
"learning_rate": 8.780358823396352e-07,
"loss": 0.024,
"reward": -0.27919139340519905,
"reward_std": 0.5330808311700821,
"rewards/cosine_scaled_reward": -0.2541790306568146,
"rewards/format_reward": 0.22916666977107525,
"step": 158
},
{
"completion_length": 2943.6250610351562,
"epoch": 0.18171428571428572,
"grad_norm": 0.4807628393173218,
"kl": 0.05224609375,
"learning_rate": 8.758773376468604e-07,
"loss": 0.136,
"reward": 0.4126173257827759,
"reward_std": 0.9520216137170792,
"rewards/cosine_scaled_reward": -0.012441340368241072,
"rewards/format_reward": 0.4375000149011612,
"step": 159
},
{
"completion_length": 2988.7291870117188,
"epoch": 0.18285714285714286,
"grad_norm": 0.1530563086271286,
"kl": 0.041748046875,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0217,
"reward": 0.5735020600259304,
"reward_std": 0.8126933425664902,
"rewards/cosine_scaled_reward": 0.03675099462270737,
"rewards/format_reward": 0.5000000149011612,
"step": 160
},
{
"completion_length": 3521.916748046875,
"epoch": 0.184,
"grad_norm": 0.18781894445419312,
"kl": 0.04571533203125,
"learning_rate": 8.715127058347614e-07,
"loss": 0.0335,
"reward": -0.04885682836174965,
"reward_std": 0.8325313180685043,
"rewards/cosine_scaled_reward": -0.12859507277607918,
"rewards/format_reward": 0.2083333395421505,
"step": 161
},
{
"completion_length": 3144.6875,
"epoch": 0.18514285714285714,
"grad_norm": 0.15695880353450775,
"kl": 0.0509033203125,
"learning_rate": 8.693068314414344e-07,
"loss": 0.033,
"reward": 0.5102378875017166,
"reward_std": 0.7466369420289993,
"rewards/cosine_scaled_reward": -0.02613106439821422,
"rewards/format_reward": 0.5625000111758709,
"step": 162
},
{
"completion_length": 2595.2083435058594,
"epoch": 0.18628571428571428,
"grad_norm": 0.25721773505210876,
"kl": 0.056854248046875,
"learning_rate": 8.670853944836176e-07,
"loss": -0.002,
"reward": 0.3704167567193508,
"reward_std": 0.6248408891260624,
"rewards/cosine_scaled_reward": -0.06479163467884064,
"rewards/format_reward": 0.5000000055879354,
"step": 163
},
{
"completion_length": 3132.6458740234375,
"epoch": 0.18742857142857142,
"grad_norm": 0.3121108412742615,
"kl": 0.066162109375,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0436,
"reward": -0.15848805382847786,
"reward_std": 0.6006623804569244,
"rewards/cosine_scaled_reward": -0.21466069296002388,
"rewards/format_reward": 0.27083334140479565,
"step": 164
},
{
"completion_length": 3106.7918090820312,
"epoch": 0.18857142857142858,
"grad_norm": 0.36176592111587524,
"kl": 0.059814453125,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0527,
"reward": 0.523316752165556,
"reward_std": 1.1928484439849854,
"rewards/cosine_scaled_reward": 0.022075051441788673,
"rewards/format_reward": 0.4791666865348816,
"step": 165
},
{
"completion_length": 2909.6458435058594,
"epoch": 0.18971428571428572,
"grad_norm": 0.09995611011981964,
"kl": 0.0648193359375,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0269,
"reward": 0.30894866585731506,
"reward_std": 0.543118342757225,
"rewards/cosine_scaled_reward": -0.022608992643654346,
"rewards/format_reward": 0.35416666977107525,
"step": 166
},
{
"completion_length": 3123.3750610351562,
"epoch": 0.19085714285714286,
"grad_norm": 0.19064339995384216,
"kl": 0.065673828125,
"learning_rate": 8.580461976679099e-07,
"loss": 0.0316,
"reward": -0.04249940067529678,
"reward_std": 0.6464731246232986,
"rewards/cosine_scaled_reward": -0.2087497040629387,
"rewards/format_reward": 0.3750000111758709,
"step": 167
},
{
"completion_length": 2451.8958740234375,
"epoch": 0.192,
"grad_norm": 0.2923497259616852,
"kl": 0.06597900390625,
"learning_rate": 8.557485869176825e-07,
"loss": 0.0505,
"reward": 0.5589314834214747,
"reward_std": 0.7117247879505157,
"rewards/cosine_scaled_reward": -0.03303426876664162,
"rewards/format_reward": 0.6250000149011612,
"step": 168
},
{
"completion_length": 2313.9375610351562,
"epoch": 0.19314285714285714,
"grad_norm": 0.22615736722946167,
"kl": 0.0550537109375,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0106,
"reward": 0.9692112673074007,
"reward_std": 0.9811852872371674,
"rewards/cosine_scaled_reward": 0.1616889564320445,
"rewards/format_reward": 0.6458333507180214,
"step": 169
},
{
"completion_length": 2736.4375610351562,
"epoch": 0.19428571428571428,
"grad_norm": 0.17036058008670807,
"kl": 0.08740234375,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0282,
"reward": 0.17224126309156418,
"reward_std": 0.5562086030840874,
"rewards/cosine_scaled_reward": -0.09096270857844502,
"rewards/format_reward": 0.35416666977107525,
"step": 170
},
{
"completion_length": 2596.1458740234375,
"epoch": 0.19542857142857142,
"grad_norm": 0.2933753728866577,
"kl": 0.1033935546875,
"learning_rate": 8.487667956935087e-07,
"loss": -0.0277,
"reward": 0.3066958854906261,
"reward_std": 1.1108788549900055,
"rewards/cosine_scaled_reward": 0.0283479536883533,
"rewards/format_reward": 0.2500000074505806,
"step": 171
},
{
"completion_length": 2705.1251220703125,
"epoch": 0.19657142857142856,
"grad_norm": 0.18700729310512543,
"kl": 0.1024169921875,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0173,
"reward": 0.4773051217198372,
"reward_std": 0.8035851642489433,
"rewards/cosine_scaled_reward": -0.032180776819586754,
"rewards/format_reward": 0.5416666828095913,
"step": 172
},
{
"completion_length": 2499.479248046875,
"epoch": 0.1977142857142857,
"grad_norm": 0.5721752643585205,
"kl": 0.1807861328125,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0732,
"reward": 0.582635186612606,
"reward_std": 0.9862835854291916,
"rewards/cosine_scaled_reward": -0.010765749961137772,
"rewards/format_reward": 0.6041666865348816,
"step": 173
},
{
"completion_length": 3085.5000610351562,
"epoch": 0.19885714285714284,
"grad_norm": 0.20046721398830414,
"kl": 0.103759765625,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0112,
"reward": 0.35188272781670094,
"reward_std": 0.5055751278996468,
"rewards/cosine_scaled_reward": -0.032391976565122604,
"rewards/format_reward": 0.41666667722165585,
"step": 174
},
{
"completion_length": 2600.9584350585938,
"epoch": 0.2,
"grad_norm": 0.27946504950523376,
"kl": 0.1097412109375,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0436,
"reward": 0.622465105727315,
"reward_std": 0.4762147720903158,
"rewards/cosine_scaled_reward": 0.0716492049396038,
"rewards/format_reward": 0.4791666679084301,
"step": 175
},
{
"completion_length": 2800.7084350585938,
"epoch": 0.20114285714285715,
"grad_norm": 0.27955377101898193,
"kl": 0.1202392578125,
"learning_rate": 8.368407953869103e-07,
"loss": 0.0389,
"reward": 0.4877171404659748,
"reward_std": 0.9056157171726227,
"rewards/cosine_scaled_reward": -0.006141431163996458,
"rewards/format_reward": 0.5000000149011612,
"step": 176
},
{
"completion_length": 2680.6459350585938,
"epoch": 0.2022857142857143,
"grad_norm": 0.2823414206504822,
"kl": 0.1099853515625,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0552,
"reward": 0.033542659133672714,
"reward_std": 0.5713647753000259,
"rewards/cosine_scaled_reward": -0.1603120118379593,
"rewards/format_reward": 0.3541666679084301,
"step": 177
},
{
"completion_length": 2046.5625305175781,
"epoch": 0.20342857142857143,
"grad_norm": 0.20538190007209778,
"kl": 0.1141357421875,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0464,
"reward": 0.728565389290452,
"reward_std": 0.6446417346596718,
"rewards/cosine_scaled_reward": 0.07261601462960243,
"rewards/format_reward": 0.583333358168602,
"step": 178
},
{
"completion_length": 2761.5000610351562,
"epoch": 0.20457142857142857,
"grad_norm": 0.40644508600234985,
"kl": 0.1458740234375,
"learning_rate": 8.295165011252396e-07,
"loss": 0.0513,
"reward": 0.404015829320997,
"reward_std": 0.853428527712822,
"rewards/cosine_scaled_reward": -0.03757544606924057,
"rewards/format_reward": 0.4791666828095913,
"step": 179
},
{
"completion_length": 2943.8958740234375,
"epoch": 0.2057142857142857,
"grad_norm": 0.5542572736740112,
"kl": 0.16357421875,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0718,
"reward": 0.4506250247359276,
"reward_std": 0.7890695706009865,
"rewards/cosine_scaled_reward": -0.045520816929638386,
"rewards/format_reward": 0.5416666716337204,
"step": 180
},
{
"completion_length": 3004.7709045410156,
"epoch": 0.20685714285714285,
"grad_norm": 0.2843971252441406,
"kl": 0.154541015625,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0082,
"reward": 0.4879231466911733,
"reward_std": 0.9720990136265755,
"rewards/cosine_scaled_reward": 0.014794901013374329,
"rewards/format_reward": 0.4583333358168602,
"step": 181
},
{
"completion_length": 2924.9375610351562,
"epoch": 0.208,
"grad_norm": 0.5410143136978149,
"kl": 0.20361328125,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0912,
"reward": 0.005998063832521439,
"reward_std": 0.7128682732582092,
"rewards/cosine_scaled_reward": -0.18450098019093275,
"rewards/format_reward": 0.3750000111758709,
"step": 182
},
{
"completion_length": 2788.166748046875,
"epoch": 0.20914285714285713,
"grad_norm": 0.33847859501838684,
"kl": 0.1676025390625,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0478,
"reward": -0.0011163651943206787,
"reward_std": 0.5493139624595642,
"rewards/cosine_scaled_reward": -0.24014152213931084,
"rewards/format_reward": 0.479166679084301,
"step": 183
},
{
"completion_length": 2438.3541870117188,
"epoch": 0.2102857142857143,
"grad_norm": 0.5205087065696716,
"kl": 0.181884765625,
"learning_rate": 8.170384989716657e-07,
"loss": -0.002,
"reward": 0.896189346909523,
"reward_std": 1.161486804485321,
"rewards/cosine_scaled_reward": 0.1460113013163209,
"rewards/format_reward": 0.6041666865348816,
"step": 184
},
{
"completion_length": 2972.6250610351562,
"epoch": 0.21142857142857144,
"grad_norm": 0.5775122046470642,
"kl": 0.25244140625,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0701,
"reward": 0.1199110560119152,
"reward_std": 0.8271754533052444,
"rewards/cosine_scaled_reward": -0.11712781526148319,
"rewards/format_reward": 0.3541666679084301,
"step": 185
},
{
"completion_length": 2840.1875610351562,
"epoch": 0.21257142857142858,
"grad_norm": 0.3676423728466034,
"kl": 0.2158203125,
"learning_rate": 8.119553365707802e-07,
"loss": 0.059,
"reward": 0.5942272543907166,
"reward_std": 0.7698107957839966,
"rewards/cosine_scaled_reward": 0.057530272752046585,
"rewards/format_reward": 0.4791666716337204,
"step": 186
},
{
"completion_length": 2704.8333740234375,
"epoch": 0.21371428571428572,
"grad_norm": 0.3322462737560272,
"kl": 0.2138671875,
"learning_rate": 8.093945422764069e-07,
"loss": 0.0419,
"reward": 0.4956296235322952,
"reward_std": 0.7072524651885033,
"rewards/cosine_scaled_reward": 0.10198147594928741,
"rewards/format_reward": 0.29166668094694614,
"step": 187
},
{
"completion_length": 2780.729248046875,
"epoch": 0.21485714285714286,
"grad_norm": 0.3984168469905853,
"kl": 0.288330078125,
"learning_rate": 8.068211054579943e-07,
"loss": 0.0474,
"reward": 0.5863161403685808,
"reward_std": 0.9082886129617691,
"rewards/cosine_scaled_reward": -0.008925255388021469,
"rewards/format_reward": 0.6041666865348816,
"step": 188
},
{
"completion_length": 2758.1459350585938,
"epoch": 0.216,
"grad_norm": 0.3293847143650055,
"kl": 0.314453125,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0399,
"reward": 0.07589801587164402,
"reward_std": 0.6460907310247421,
"rewards/cosine_scaled_reward": -0.15996766556054354,
"rewards/format_reward": 0.3958333358168602,
"step": 189
},
{
"completion_length": 3006.666748046875,
"epoch": 0.21714285714285714,
"grad_norm": 0.3320949971675873,
"kl": 0.3447265625,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0357,
"reward": -0.11713236942887306,
"reward_std": 0.6270528212189674,
"rewards/cosine_scaled_reward": -0.20439952798187733,
"rewards/format_reward": 0.29166667349636555,
"step": 190
},
{
"completion_length": 2770.479248046875,
"epoch": 0.21828571428571428,
"grad_norm": 0.7219541668891907,
"kl": 0.30859375,
"learning_rate": 7.990261971595048e-07,
"loss": 0.084,
"reward": 0.37447334453463554,
"reward_std": 0.9116730242967606,
"rewards/cosine_scaled_reward": -0.0002633389085531235,
"rewards/format_reward": 0.3750000149011612,
"step": 191
},
{
"completion_length": 2974.6875610351562,
"epoch": 0.21942857142857142,
"grad_norm": 0.44086411595344543,
"kl": 0.38525390625,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0333,
"reward": 0.2543896287679672,
"reward_std": 0.9647316783666611,
"rewards/cosine_scaled_reward": -0.06030518375337124,
"rewards/format_reward": 0.37500001303851604,
"step": 192
},
{
"completion_length": 2625.5209350585938,
"epoch": 0.22057142857142858,
"grad_norm": 0.439861536026001,
"kl": 0.33935546875,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0154,
"reward": -0.05579917132854462,
"reward_std": 0.552303358912468,
"rewards/cosine_scaled_reward": -0.19456627347972244,
"rewards/format_reward": 0.33333334140479565,
"step": 193
},
{
"completion_length": 1735.7917175292969,
"epoch": 0.22171428571428572,
"grad_norm": 0.3492659032344818,
"kl": 0.223876953125,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0307,
"reward": 1.0081698819994926,
"reward_std": 1.0613654553890228,
"rewards/cosine_scaled_reward": 0.10825158283114433,
"rewards/format_reward": 0.7916667014360428,
"step": 194
},
{
"completion_length": 2374.9375610351562,
"epoch": 0.22285714285714286,
"grad_norm": 0.39783236384391785,
"kl": 0.357666015625,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0483,
"reward": 0.519692053552717,
"reward_std": 0.8805719166994095,
"rewards/cosine_scaled_reward": -0.04223730321973562,
"rewards/format_reward": 0.6041666716337204,
"step": 195
},
{
"completion_length": 2985.0208740234375,
"epoch": 0.224,
"grad_norm": 0.5228659510612488,
"kl": 0.39453125,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0626,
"reward": 0.31096921616699547,
"reward_std": 0.9736936837434769,
"rewards/cosine_scaled_reward": -0.07368208467960358,
"rewards/format_reward": 0.45833334140479565,
"step": 196
},
{
"completion_length": 3063.1459045410156,
"epoch": 0.22514285714285714,
"grad_norm": 0.4522063732147217,
"kl": 0.39892578125,
"learning_rate": 7.831121542179086e-07,
"loss": 0.047,
"reward": -0.027099967002868652,
"reward_std": 0.7299272418022156,
"rewards/cosine_scaled_reward": -0.14896666258573532,
"rewards/format_reward": 0.27083334140479565,
"step": 197
},
{
"completion_length": 3130.291748046875,
"epoch": 0.22628571428571428,
"grad_norm": 0.7724531888961792,
"kl": 0.40771484375,
"learning_rate": 7.804192891917571e-07,
"loss": 0.0823,
"reward": 0.07385630160570145,
"reward_std": 0.7986228317022324,
"rewards/cosine_scaled_reward": -0.18182185851037502,
"rewards/format_reward": 0.4375000074505806,
"step": 198
},
{
"completion_length": 2927.9375610351562,
"epoch": 0.22742857142857142,
"grad_norm": 0.4081217050552368,
"kl": 0.40234375,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0405,
"reward": 0.9421972185373306,
"reward_std": 0.8113018572330475,
"rewards/cosine_scaled_reward": 0.18984858132898808,
"rewards/format_reward": 0.5625000149011612,
"step": 199
},
{
"completion_length": 2594.6875610351562,
"epoch": 0.22857142857142856,
"grad_norm": 1.1233628988265991,
"kl": 0.4052734375,
"learning_rate": 7.75e-07,
"loss": 0.1087,
"reward": 0.4042139081284404,
"reward_std": 0.9797720313072205,
"rewards/cosine_scaled_reward": -0.06872639432549477,
"rewards/format_reward": 0.5416666772216558,
"step": 200
},
{
"completion_length": 2749.729217529297,
"epoch": 0.2297142857142857,
"grad_norm": 0.4544771611690521,
"kl": 0.463134765625,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0332,
"reward": 0.049620624631643295,
"reward_std": 0.6019374430179596,
"rewards/cosine_scaled_reward": -0.18352303700521588,
"rewards/format_reward": 0.41666667722165585,
"step": 201
},
{
"completion_length": 2291.3334350585938,
"epoch": 0.23085714285714284,
"grad_norm": 0.4469071328639984,
"kl": 0.4375,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0543,
"reward": 0.33724231645464897,
"reward_std": 0.6383469551801682,
"rewards/cosine_scaled_reward": -0.10221217246726155,
"rewards/format_reward": 0.5416666716337204,
"step": 202
},
{
"completion_length": 2692.416717529297,
"epoch": 0.232,
"grad_norm": 0.9959556460380554,
"kl": 0.60302734375,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0229,
"reward": 0.5023867785930634,
"reward_std": 0.8520723432302475,
"rewards/cosine_scaled_reward": -0.009223278611898422,
"rewards/format_reward": 0.520833358168602,
"step": 203
},
{
"completion_length": 2610.291748046875,
"epoch": 0.23314285714285715,
"grad_norm": 0.5574892163276672,
"kl": 0.54150390625,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0779,
"reward": 0.668186828494072,
"reward_std": 0.7796131670475006,
"rewards/cosine_scaled_reward": 0.04242673283442855,
"rewards/format_reward": 0.5833333488553762,
"step": 204
},
{
"completion_length": 3158.0625,
"epoch": 0.2342857142857143,
"grad_norm": 0.9340919256210327,
"kl": 0.658203125,
"learning_rate": 7.612622032536507e-07,
"loss": 0.091,
"reward": 0.36712072789669037,
"reward_std": 1.0377983078360558,
"rewards/cosine_scaled_reward": -0.014356307685375214,
"rewards/format_reward": 0.3958333507180214,
"step": 205
},
{
"completion_length": 3241.7500610351562,
"epoch": 0.23542857142857143,
"grad_norm": 0.6677758097648621,
"kl": 0.7197265625,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0745,
"reward": -0.032032303512096405,
"reward_std": 0.7234849855303764,
"rewards/cosine_scaled_reward": -0.15143282152712345,
"rewards/format_reward": 0.27083334885537624,
"step": 206
},
{
"completion_length": 2452.1876220703125,
"epoch": 0.23657142857142857,
"grad_norm": 0.9905790090560913,
"kl": 0.4033203125,
"learning_rate": 7.556940671764124e-07,
"loss": 0.0568,
"reward": 0.8914177902042866,
"reward_std": 0.8338152915239334,
"rewards/cosine_scaled_reward": 0.0811255220323801,
"rewards/format_reward": 0.7291666865348816,
"step": 207
},
{
"completion_length": 2867.4583740234375,
"epoch": 0.2377142857142857,
"grad_norm": 1.0818088054656982,
"kl": 0.6904296875,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0301,
"reward": 0.22067961934953928,
"reward_std": 0.46099015325307846,
"rewards/cosine_scaled_reward": -0.17091020289808512,
"rewards/format_reward": 0.5625000149011612,
"step": 208
},
{
"completion_length": 2549.354217529297,
"epoch": 0.23885714285714285,
"grad_norm": 0.5277766585350037,
"kl": 0.5927734375,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0733,
"reward": 0.142703301506117,
"reward_std": 0.7169675379991531,
"rewards/cosine_scaled_reward": -0.26198170334100723,
"rewards/format_reward": 0.6666666716337204,
"step": 209
},
{
"completion_length": 2013.7500610351562,
"epoch": 0.24,
"grad_norm": 0.610791027545929,
"kl": 0.40966796875,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0513,
"reward": 0.6129203364253044,
"reward_std": 0.8901711851358414,
"rewards/cosine_scaled_reward": -0.026873177848756313,
"rewards/format_reward": 0.6666666772216558,
"step": 210
},
{
"completion_length": 3397.9376220703125,
"epoch": 0.24114285714285713,
"grad_norm": 0.8708758354187012,
"kl": 0.751953125,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0628,
"reward": -0.10053645074367523,
"reward_std": 0.5338989198207855,
"rewards/cosine_scaled_reward": -0.14401823794469237,
"rewards/format_reward": 0.1875000074505806,
"step": 211
},
{
"completion_length": 2320.8334045410156,
"epoch": 0.2422857142857143,
"grad_norm": 0.8576116561889648,
"kl": 0.481201171875,
"learning_rate": 7.416006812042827e-07,
"loss": 0.0179,
"reward": 0.7511888779699802,
"reward_std": 0.8285558968782425,
"rewards/cosine_scaled_reward": 0.021427758038043976,
"rewards/format_reward": 0.7083333432674408,
"step": 212
},
{
"completion_length": 3072.2084350585938,
"epoch": 0.24342857142857144,
"grad_norm": 0.7516844272613525,
"kl": 0.6279296875,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0663,
"reward": 0.14471609145402908,
"reward_std": 0.5673011243343353,
"rewards/cosine_scaled_reward": -0.09430863708257675,
"rewards/format_reward": 0.3333333395421505,
"step": 213
},
{
"completion_length": 3004.666748046875,
"epoch": 0.24457142857142858,
"grad_norm": 0.650104820728302,
"kl": 0.49853515625,
"learning_rate": 7.358969934210438e-07,
"loss": 0.048,
"reward": 0.38014761358499527,
"reward_std": 0.6449386551976204,
"rewards/cosine_scaled_reward": -0.05992620065808296,
"rewards/format_reward": 0.5000000055879354,
"step": 214
},
{
"completion_length": 2997.1251220703125,
"epoch": 0.24571428571428572,
"grad_norm": 0.8768295049667358,
"kl": 0.55859375,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0617,
"reward": 0.14181075803935528,
"reward_std": 0.7453153133392334,
"rewards/cosine_scaled_reward": -0.21034463122487068,
"rewards/format_reward": 0.5625000149011612,
"step": 215
},
{
"completion_length": 3182.6250610351562,
"epoch": 0.24685714285714286,
"grad_norm": 0.5447856187820435,
"kl": 0.52685546875,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0435,
"reward": -0.2610638588666916,
"reward_std": 0.5414926931262016,
"rewards/cosine_scaled_reward": -0.2451152689754963,
"rewards/format_reward": 0.2291666679084301,
"step": 216
},
{
"completion_length": 2864.8334350585938,
"epoch": 0.248,
"grad_norm": 0.5242255330085754,
"kl": 0.46875,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0353,
"reward": 0.28853584825992584,
"reward_std": 0.5657162964344025,
"rewards/cosine_scaled_reward": -0.11614875216037035,
"rewards/format_reward": 0.520833358168602,
"step": 217
},
{
"completion_length": 2654.9583740234375,
"epoch": 0.24914285714285714,
"grad_norm": 0.9366975426673889,
"kl": 0.392578125,
"learning_rate": 7.243820139034464e-07,
"loss": 0.0515,
"reward": 0.3301328122615814,
"reward_std": 0.7091851830482483,
"rewards/cosine_scaled_reward": -0.04326693775783497,
"rewards/format_reward": 0.4166666828095913,
"step": 218
},
{
"completion_length": 2303.854217529297,
"epoch": 0.2502857142857143,
"grad_norm": 1.7971564531326294,
"kl": 0.3369140625,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0794,
"reward": 0.6591267697513103,
"reward_std": 0.9642367362976074,
"rewards/cosine_scaled_reward": 0.03789670951664448,
"rewards/format_reward": 0.583333358168602,
"step": 219
},
{
"completion_length": 2634.2501220703125,
"epoch": 0.25142857142857145,
"grad_norm": 1.3504126071929932,
"kl": 0.4423828125,
"learning_rate": 7.185729670371604e-07,
"loss": -0.0076,
"reward": 0.41383227705955505,
"reward_std": 0.64960727840662,
"rewards/cosine_scaled_reward": -0.043083855882287025,
"rewards/format_reward": 0.5000000204890966,
"step": 220
},
{
"completion_length": 2984.7709350585938,
"epoch": 0.25257142857142856,
"grad_norm": 0.9762473106384277,
"kl": 0.4384765625,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0725,
"reward": 0.25423768046312034,
"reward_std": 0.8094103336334229,
"rewards/cosine_scaled_reward": -0.11246450617909431,
"rewards/format_reward": 0.4791666716337204,
"step": 221
},
{
"completion_length": 3275.7500610351562,
"epoch": 0.2537142857142857,
"grad_norm": 0.37796396017074585,
"kl": 0.533203125,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0546,
"reward": 0.08308765979018062,
"reward_std": 0.6242133527994156,
"rewards/cosine_scaled_reward": -0.18762284144759178,
"rewards/format_reward": 0.4583333395421505,
"step": 222
},
{
"completion_length": 2415.2916870117188,
"epoch": 0.25485714285714284,
"grad_norm": 66.52708435058594,
"kl": 19.72021484375,
"learning_rate": 7.097981330836616e-07,
"loss": 0.1598,
"reward": 0.32358624786138535,
"reward_std": 0.8794360756874084,
"rewards/cosine_scaled_reward": -0.12987355142831802,
"rewards/format_reward": 0.5833333507180214,
"step": 223
},
{
"completion_length": 2914.2084350585938,
"epoch": 0.256,
"grad_norm": 0.39709535241127014,
"kl": 0.42919921875,
"learning_rate": 7.068574212948169e-07,
"loss": 0.026,
"reward": 0.4726352207362652,
"reward_std": 0.5715819150209427,
"rewards/cosine_scaled_reward": -0.02409905381500721,
"rewards/format_reward": 0.5208333432674408,
"step": 224
},
{
"completion_length": 2191.4584045410156,
"epoch": 0.2571428571428571,
"grad_norm": 1.4947963953018188,
"kl": 0.361572265625,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0904,
"reward": 0.8724448978900909,
"reward_std": 0.8835494965314865,
"rewards/cosine_scaled_reward": 0.16538911685347557,
"rewards/format_reward": 0.541666679084301,
"step": 225
},
{
"completion_length": 2944.8959350585938,
"epoch": 0.2582857142857143,
"grad_norm": 0.8030902147293091,
"kl": 0.5966796875,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0354,
"reward": 0.16449306067079306,
"reward_std": 0.7553341090679169,
"rewards/cosine_scaled_reward": -0.14692013710737228,
"rewards/format_reward": 0.4583333432674408,
"step": 226
},
{
"completion_length": 2399.1251220703125,
"epoch": 0.25942857142857145,
"grad_norm": 0.6294677257537842,
"kl": 0.40478515625,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0385,
"reward": 0.6515897959470749,
"reward_std": 0.7883607298135757,
"rewards/cosine_scaled_reward": -0.01795511320233345,
"rewards/format_reward": 0.6875000149011612,
"step": 227
},
{
"completion_length": 2944.2916870117188,
"epoch": 0.26057142857142856,
"grad_norm": 0.7098054885864258,
"kl": 0.5126953125,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0537,
"reward": 0.2890019528567791,
"reward_std": 0.8232990577816963,
"rewards/cosine_scaled_reward": -0.1367490328848362,
"rewards/format_reward": 0.5625,
"step": 228
},
{
"completion_length": 2747.541748046875,
"epoch": 0.26171428571428573,
"grad_norm": 0.3639421761035919,
"kl": 0.53759765625,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0462,
"reward": 0.1284541985951364,
"reward_std": 0.6105376034975052,
"rewards/cosine_scaled_reward": -0.21702291443943977,
"rewards/format_reward": 0.5625000149011612,
"step": 229
},
{
"completion_length": 2547.916748046875,
"epoch": 0.26285714285714284,
"grad_norm": 0.7889376878738403,
"kl": 0.4453125,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0666,
"reward": 0.46958625549450517,
"reward_std": 0.8848246484994888,
"rewards/cosine_scaled_reward": 0.03687644610181451,
"rewards/format_reward": 0.3958333395421505,
"step": 230
},
{
"completion_length": 2979.3125610351562,
"epoch": 0.264,
"grad_norm": 0.49910208582878113,
"kl": 0.56689453125,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0658,
"reward": 0.34871126525104046,
"reward_std": 0.7629459947347641,
"rewards/cosine_scaled_reward": -0.0756443589925766,
"rewards/format_reward": 0.5000000149011612,
"step": 231
},
{
"completion_length": 2503.5625610351562,
"epoch": 0.2651428571428571,
"grad_norm": 0.8284872174263,
"kl": 0.412109375,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0204,
"reward": 0.6350362692028284,
"reward_std": 1.1135509312152863,
"rewards/cosine_scaled_reward": -0.02623187005519867,
"rewards/format_reward": 0.6875000149011612,
"step": 232
},
{
"completion_length": 2727.8751220703125,
"epoch": 0.2662857142857143,
"grad_norm": 0.5221201181411743,
"kl": 0.4931640625,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0615,
"reward": 0.4846220053732395,
"reward_std": 0.7716068103909492,
"rewards/cosine_scaled_reward": -0.049355676397681236,
"rewards/format_reward": 0.5833333432674408,
"step": 233
},
{
"completion_length": 2544.7084350585938,
"epoch": 0.2674285714285714,
"grad_norm": 1.3812953233718872,
"kl": 0.498046875,
"learning_rate": 6.770536555792944e-07,
"loss": -0.0119,
"reward": 0.4157133437693119,
"reward_std": 0.7185128927230835,
"rewards/cosine_scaled_reward": -0.13589332532137632,
"rewards/format_reward": 0.6875000149011612,
"step": 234
},
{
"completion_length": 2495.375045776367,
"epoch": 0.26857142857142857,
"grad_norm": 0.6437314748764038,
"kl": 0.59716796875,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0412,
"reward": 0.5019040778279305,
"reward_std": 0.6978631764650345,
"rewards/cosine_scaled_reward": -0.019881299696862698,
"rewards/format_reward": 0.5416666753590107,
"step": 235
},
{
"completion_length": 2483.3959350585938,
"epoch": 0.26971428571428574,
"grad_norm": 0.3919011950492859,
"kl": 0.4892578125,
"learning_rate": 6.710139192768694e-07,
"loss": 0.0482,
"reward": 0.2438975148834288,
"reward_std": 0.648132249712944,
"rewards/cosine_scaled_reward": -0.2009679153561592,
"rewards/format_reward": 0.6458333656191826,
"step": 236
},
{
"completion_length": 2204.4584350585938,
"epoch": 0.27085714285714285,
"grad_norm": 0.8478395342826843,
"kl": 0.39111328125,
"learning_rate": 6.679851303883891e-07,
"loss": 0.0545,
"reward": 0.42290161666460335,
"reward_std": 0.648314818739891,
"rewards/cosine_scaled_reward": -0.06979918852448463,
"rewards/format_reward": 0.5625000149011612,
"step": 237
},
{
"completion_length": 2635.062530517578,
"epoch": 0.272,
"grad_norm": 1.0054919719696045,
"kl": 0.572265625,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0721,
"reward": 0.5835281796753407,
"reward_std": 0.7386454343795776,
"rewards/cosine_scaled_reward": -0.09365258179605007,
"rewards/format_reward": 0.770833358168602,
"step": 238
},
{
"completion_length": 3080.6875610351562,
"epoch": 0.27314285714285713,
"grad_norm": 0.8045799136161804,
"kl": 0.7119140625,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0514,
"reward": 0.16217913012951612,
"reward_std": 0.8966347873210907,
"rewards/cosine_scaled_reward": -0.1376604586839676,
"rewards/format_reward": 0.43750000558793545,
"step": 239
},
{
"completion_length": 2219.5000915527344,
"epoch": 0.2742857142857143,
"grad_norm": 1.3121085166931152,
"kl": 0.403564453125,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0747,
"reward": 1.15125173330307,
"reward_std": 0.957096055150032,
"rewards/cosine_scaled_reward": 0.200625860132277,
"rewards/format_reward": 0.7500000149011612,
"step": 240
},
{
"completion_length": 2043.0625610351562,
"epoch": 0.2754285714285714,
"grad_norm": 0.6292615532875061,
"kl": 0.319580078125,
"learning_rate": 6.558139508961654e-07,
"loss": 0.0002,
"reward": 0.9169554859399796,
"reward_std": 0.5727524533867836,
"rewards/cosine_scaled_reward": 0.07306107506155968,
"rewards/format_reward": 0.770833358168602,
"step": 241
},
{
"completion_length": 2313.4583740234375,
"epoch": 0.2765714285714286,
"grad_norm": 0.6727687120437622,
"kl": 0.4599609375,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0547,
"reward": 0.634780153632164,
"reward_std": 0.7665407210588455,
"rewards/cosine_scaled_reward": -0.0992765948176384,
"rewards/format_reward": 0.8333333432674408,
"step": 242
},
{
"completion_length": 2128.354248046875,
"epoch": 0.2777142857142857,
"grad_norm": 2.353132963180542,
"kl": 0.46875,
"learning_rate": 6.496968239287603e-07,
"loss": -0.0288,
"reward": 0.7288870755583048,
"reward_std": 0.7078111618757248,
"rewards/cosine_scaled_reward": -0.020973138511180878,
"rewards/format_reward": 0.770833358168602,
"step": 243
},
{
"completion_length": 2385.729217529297,
"epoch": 0.27885714285714286,
"grad_norm": 0.797772228717804,
"kl": 0.435546875,
"learning_rate": 6.466308972251785e-07,
"loss": 0.0694,
"reward": 0.9379732981324196,
"reward_std": 0.76512710750103,
"rewards/cosine_scaled_reward": 0.08356995694339275,
"rewards/format_reward": 0.770833358168602,
"step": 244
},
{
"completion_length": 2175.229217529297,
"epoch": 0.28,
"grad_norm": 0.4513607621192932,
"kl": 0.4609375,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0361,
"reward": 0.7639665333554149,
"reward_std": 0.5898980349302292,
"rewards/cosine_scaled_reward": -0.0034334324300289154,
"rewards/format_reward": 0.770833358168602,
"step": 245
},
{
"completion_length": 2385.3959045410156,
"epoch": 0.28114285714285714,
"grad_norm": 1.354136347770691,
"kl": 0.4619140625,
"learning_rate": 6.404850645156841e-07,
"loss": -0.0114,
"reward": 0.5757800415158272,
"reward_std": 0.4861333817243576,
"rewards/cosine_scaled_reward": -0.09752664715051651,
"rewards/format_reward": 0.7708333432674408,
"step": 246
},
{
"completion_length": 1984.0834045410156,
"epoch": 0.2822857142857143,
"grad_norm": 0.7202406525611877,
"kl": 0.39306640625,
"learning_rate": 6.374054580489873e-07,
"loss": -0.0064,
"reward": 0.7016956266015768,
"reward_std": 0.6964651569724083,
"rewards/cosine_scaled_reward": -0.03456886112689972,
"rewards/format_reward": 0.770833358168602,
"step": 247
},
{
"completion_length": 2431.6250610351562,
"epoch": 0.2834285714285714,
"grad_norm": 1.12034273147583,
"kl": 0.375,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0399,
"reward": 0.47921356186270714,
"reward_std": 0.7437918186187744,
"rewards/cosine_scaled_reward": -0.10414323909208179,
"rewards/format_reward": 0.6875000149011612,
"step": 248
},
{
"completion_length": 2641.5000610351562,
"epoch": 0.2845714285714286,
"grad_norm": 1.147722601890564,
"kl": 0.466796875,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0084,
"reward": 0.4995560571551323,
"reward_std": 0.7342625856399536,
"rewards/cosine_scaled_reward": -0.04188864305615425,
"rewards/format_reward": 0.583333358168602,
"step": 249
},
{
"completion_length": 2112.0209045410156,
"epoch": 0.2857142857142857,
"grad_norm": 0.6532469987869263,
"kl": 0.302490234375,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0292,
"reward": 0.6722276238724589,
"reward_std": 1.072887122631073,
"rewards/cosine_scaled_reward": 0.03403047751635313,
"rewards/format_reward": 0.6041666865348816,
"step": 250
},
{
"completion_length": 2693.416748046875,
"epoch": 0.28685714285714287,
"grad_norm": 0.9663844108581543,
"kl": 0.419921875,
"learning_rate": 6.25045936022246e-07,
"loss": 0.0569,
"reward": 0.9957753866910934,
"reward_std": 0.9329462796449661,
"rewards/cosine_scaled_reward": 0.16455435939133167,
"rewards/format_reward": 0.6666666865348816,
"step": 251
},
{
"completion_length": 2677.7709350585938,
"epoch": 0.288,
"grad_norm": 0.720365583896637,
"kl": 0.42138671875,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0584,
"reward": 0.791405975818634,
"reward_std": 0.8207461088895798,
"rewards/cosine_scaled_reward": -0.010547026991844177,
"rewards/format_reward": 0.8125000298023224,
"step": 252
},
{
"completion_length": 2163.2709350585938,
"epoch": 0.28914285714285715,
"grad_norm": 0.9754706025123596,
"kl": 0.333984375,
"learning_rate": 6.188436263278172e-07,
"loss": -0.0284,
"reward": 0.4755242392420769,
"reward_std": 0.9357906579971313,
"rewards/cosine_scaled_reward": -0.05390455946326256,
"rewards/format_reward": 0.583333358168602,
"step": 253
},
{
"completion_length": 2022.0834045410156,
"epoch": 0.29028571428571426,
"grad_norm": 0.7189564108848572,
"kl": 0.29931640625,
"learning_rate": 6.157373628530852e-07,
"loss": -0.0134,
"reward": 1.0547878816723824,
"reward_std": 0.6990637332201004,
"rewards/cosine_scaled_reward": 0.162810567766428,
"rewards/format_reward": 0.7291666865348816,
"step": 254
},
{
"completion_length": 2299.6041870117188,
"epoch": 0.2914285714285714,
"grad_norm": 0.6565377712249756,
"kl": 0.3150634765625,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0082,
"reward": 0.9156973995268345,
"reward_std": 0.7535882145166397,
"rewards/cosine_scaled_reward": 0.08284871588693932,
"rewards/format_reward": 0.7500000149011612,
"step": 255
},
{
"completion_length": 1579.4583892822266,
"epoch": 0.2925714285714286,
"grad_norm": 0.25218111276626587,
"kl": 0.13134765625,
"learning_rate": 6.095153756157051e-07,
"loss": -0.0037,
"reward": 0.6594964060932398,
"reward_std": 0.7463338524103165,
"rewards/cosine_scaled_reward": -0.04525182023644447,
"rewards/format_reward": 0.7500000149011612,
"step": 256
},
{
"completion_length": 2658.6876220703125,
"epoch": 0.2937142857142857,
"grad_norm": 0.475395530462265,
"kl": 0.332275390625,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0439,
"reward": 0.4807323142886162,
"reward_std": 0.7335182875394821,
"rewards/cosine_scaled_reward": -0.1450505219399929,
"rewards/format_reward": 0.7708333432674408,
"step": 257
},
{
"completion_length": 2290.5000610351562,
"epoch": 0.2948571428571429,
"grad_norm": 0.5613760948181152,
"kl": 0.2305908203125,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0305,
"reward": 0.5192163055762649,
"reward_std": 0.7799556702375412,
"rewards/cosine_scaled_reward": -0.021641843486577272,
"rewards/format_reward": 0.5625000149011612,
"step": 258
},
{
"completion_length": 2217.8959350585938,
"epoch": 0.296,
"grad_norm": 1.199144959449768,
"kl": 0.24127197265625,
"learning_rate": 6.001610194928464e-07,
"loss": 0.049,
"reward": 0.5793692320585251,
"reward_std": 0.7019505053758621,
"rewards/cosine_scaled_reward": -0.09573205607011914,
"rewards/format_reward": 0.7708333432674408,
"step": 259
},
{
"completion_length": 2786.041748046875,
"epoch": 0.29714285714285715,
"grad_norm": 0.7002319693565369,
"kl": 0.292236328125,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0086,
"reward": 0.5236682705581188,
"reward_std": 0.5017373934388161,
"rewards/cosine_scaled_reward": -0.10274921730160713,
"rewards/format_reward": 0.7291666865348816,
"step": 260
},
{
"completion_length": 2164.291717529297,
"epoch": 0.29828571428571427,
"grad_norm": 0.2812724709510803,
"kl": 0.186279296875,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0171,
"reward": 0.6918911039829254,
"reward_std": 0.4820164740085602,
"rewards/cosine_scaled_reward": -0.07072112709283829,
"rewards/format_reward": 0.8333333432674408,
"step": 261
},
{
"completion_length": 2519.5625610351562,
"epoch": 0.29942857142857143,
"grad_norm": 0.4466201663017273,
"kl": 0.25927734375,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0037,
"reward": 0.45665838569402695,
"reward_std": 0.7808536291122437,
"rewards/cosine_scaled_reward": -0.14667082950472832,
"rewards/format_reward": 0.7500000149011612,
"step": 262
},
{
"completion_length": 2283.0208435058594,
"epoch": 0.30057142857142854,
"grad_norm": 0.9734614491462708,
"kl": 0.24072265625,
"learning_rate": 5.87655029499542e-07,
"loss": -0.0445,
"reward": 0.6200529932975769,
"reward_std": 0.9734015464782715,
"rewards/cosine_scaled_reward": -0.05455685779452324,
"rewards/format_reward": 0.7291666716337204,
"step": 263
},
{
"completion_length": 2269.729248046875,
"epoch": 0.3017142857142857,
"grad_norm": 0.93758225440979,
"kl": 0.242919921875,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0552,
"reward": 0.5712921991944313,
"reward_std": 0.6152775660157204,
"rewards/cosine_scaled_reward": -0.0789372380822897,
"rewards/format_reward": 0.7291666865348816,
"step": 264
},
{
"completion_length": 2714.729248046875,
"epoch": 0.3028571428571429,
"grad_norm": 0.4690639078617096,
"kl": 0.28564453125,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0054,
"reward": 0.33216356858611107,
"reward_std": 0.5296753197908401,
"rewards/cosine_scaled_reward": -0.11516822502017021,
"rewards/format_reward": 0.5625000223517418,
"step": 265
},
{
"completion_length": 2834.8750610351562,
"epoch": 0.304,
"grad_norm": 0.6644603610038757,
"kl": 0.278076171875,
"learning_rate": 5.78255733788191e-07,
"loss": 0.0086,
"reward": 0.7553704380989075,
"reward_std": 0.6663154512643814,
"rewards/cosine_scaled_reward": -0.059814791195094585,
"rewards/format_reward": 0.8750000149011612,
"step": 266
},
{
"completion_length": 2623.7291870117188,
"epoch": 0.30514285714285716,
"grad_norm": 0.4014008343219757,
"kl": 0.30078125,
"learning_rate": 5.751196772469237e-07,
"loss": 0.0276,
"reward": 0.574170459061861,
"reward_std": 0.6768613308668137,
"rewards/cosine_scaled_reward": -0.046248115599155426,
"rewards/format_reward": 0.6666666865348816,
"step": 267
},
{
"completion_length": 2934.2916870117188,
"epoch": 0.3062857142857143,
"grad_norm": 0.32006382942199707,
"kl": 0.24169921875,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0186,
"reward": 0.5890230983495712,
"reward_std": 0.6336611211299896,
"rewards/cosine_scaled_reward": -0.0388217861764133,
"rewards/format_reward": 0.666666679084301,
"step": 268
},
{
"completion_length": 2591.6459350585938,
"epoch": 0.30742857142857144,
"grad_norm": 0.2750188410282135,
"kl": 0.2086181640625,
"learning_rate": 5.688440441781398e-07,
"loss": 0.0096,
"reward": 0.4631531648337841,
"reward_std": 0.5730658769607544,
"rewards/cosine_scaled_reward": -0.15384008269757032,
"rewards/format_reward": 0.7708333432674408,
"step": 269
},
{
"completion_length": 1949.6250305175781,
"epoch": 0.30857142857142855,
"grad_norm": 0.3348838686943054,
"kl": 0.14434814453125,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0019,
"reward": 1.0058863386511803,
"reward_std": 0.6113419234752655,
"rewards/cosine_scaled_reward": 0.05502649489790201,
"rewards/format_reward": 0.895833358168602,
"step": 270
},
{
"completion_length": 2595.8333740234375,
"epoch": 0.3097142857142857,
"grad_norm": 0.3792303502559662,
"kl": 0.18743896484375,
"learning_rate": 5.625647374256061e-07,
"loss": 0.0156,
"reward": 1.184450313448906,
"reward_std": 0.6347895562648773,
"rewards/cosine_scaled_reward": 0.18597513809800148,
"rewards/format_reward": 0.8125000149011612,
"step": 271
},
{
"completion_length": 3300.4583740234375,
"epoch": 0.31085714285714283,
"grad_norm": 0.4754711091518402,
"kl": 0.2998046875,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0323,
"reward": 0.33772575482726097,
"reward_std": 0.7981042563915253,
"rewards/cosine_scaled_reward": -0.12280379980802536,
"rewards/format_reward": 0.583333358168602,
"step": 272
},
{
"completion_length": 2806.5208740234375,
"epoch": 0.312,
"grad_norm": 0.2589206397533417,
"kl": 0.203857421875,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0018,
"reward": 0.4326868951320648,
"reward_std": 0.6429417282342911,
"rewards/cosine_scaled_reward": -0.1378232277929783,
"rewards/format_reward": 0.7083333432674408,
"step": 273
},
{
"completion_length": 2775.6043090820312,
"epoch": 0.31314285714285717,
"grad_norm": 0.392734557390213,
"kl": 0.182861328125,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0352,
"reward": 0.39707405120134354,
"reward_std": 0.748130202293396,
"rewards/cosine_scaled_reward": -0.11396298557519913,
"rewards/format_reward": 0.6250000223517418,
"step": 274
},
{
"completion_length": 2929.979278564453,
"epoch": 0.3142857142857143,
"grad_norm": 0.700515627861023,
"kl": 0.240478515625,
"learning_rate": 5.5e-07,
"loss": 0.0581,
"reward": 0.3950451835989952,
"reward_std": 0.9513901323080063,
"rewards/cosine_scaled_reward": -0.06289407718577422,
"rewards/format_reward": 0.5208333432674408,
"step": 275
},
{
"completion_length": 2392.2291870117188,
"epoch": 0.31542857142857145,
"grad_norm": 0.6831299066543579,
"kl": 0.146484375,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0305,
"reward": 0.8106965273618698,
"reward_std": 0.8061726838350296,
"rewards/cosine_scaled_reward": -0.011318429373204708,
"rewards/format_reward": 0.8333333432674408,
"step": 276
},
{
"completion_length": 2669.7709350585938,
"epoch": 0.31657142857142856,
"grad_norm": 1.2274115085601807,
"kl": 0.221435546875,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0847,
"reward": 0.44736091792583466,
"reward_std": 0.8726006895303726,
"rewards/cosine_scaled_reward": -0.09923620894551277,
"rewards/format_reward": 0.6458333432674408,
"step": 277
},
{
"completion_length": 2377.7500915527344,
"epoch": 0.3177142857142857,
"grad_norm": 0.6143187284469604,
"kl": 0.225341796875,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0193,
"reward": 0.5976903513073921,
"reward_std": 0.974912166595459,
"rewards/cosine_scaled_reward": -0.0032381737837567925,
"rewards/format_reward": 0.6041666865348816,
"step": 278
},
{
"completion_length": 2511.2083740234375,
"epoch": 0.31885714285714284,
"grad_norm": 0.7699910998344421,
"kl": 0.2982177734375,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0299,
"reward": 0.3957599774003029,
"reward_std": 0.8634193539619446,
"rewards/cosine_scaled_reward": -0.10420336201786995,
"rewards/format_reward": 0.604166679084301,
"step": 279
},
{
"completion_length": 2782.8750610351562,
"epoch": 0.32,
"grad_norm": 0.9926307201385498,
"kl": 0.310791015625,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0676,
"reward": 0.6104128423612565,
"reward_std": 0.8384141325950623,
"rewards/cosine_scaled_reward": -0.028126917779445648,
"rewards/format_reward": 0.6666666865348816,
"step": 280
},
{
"completion_length": 2380.4584350585938,
"epoch": 0.3211428571428571,
"grad_norm": 0.883975088596344,
"kl": 0.2364501953125,
"learning_rate": 5.311559558218603e-07,
"loss": -0.0298,
"reward": 0.6390588581562042,
"reward_std": 0.7505539357662201,
"rewards/cosine_scaled_reward": -0.055470582097768784,
"rewards/format_reward": 0.7500000074505806,
"step": 281
},
{
"completion_length": 2751.1666870117188,
"epoch": 0.3222857142857143,
"grad_norm": 0.6628551483154297,
"kl": 0.340087890625,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0545,
"reward": 0.8024181574583054,
"reward_std": 0.9694567918777466,
"rewards/cosine_scaled_reward": 0.026209060102701187,
"rewards/format_reward": 0.7500000298023224,
"step": 282
},
{
"completion_length": 2520.1458740234375,
"epoch": 0.32342857142857145,
"grad_norm": 0.5402534604072571,
"kl": 0.352783203125,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0129,
"reward": 0.4531768709421158,
"reward_std": 0.6381779089570045,
"rewards/cosine_scaled_reward": -0.1588282436132431,
"rewards/format_reward": 0.770833358168602,
"step": 283
},
{
"completion_length": 2361.5625,
"epoch": 0.32457142857142857,
"grad_norm": 0.7840125560760498,
"kl": 0.43798828125,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0189,
"reward": 0.3853081315755844,
"reward_std": 0.7855608388781548,
"rewards/cosine_scaled_reward": -0.07817927654832602,
"rewards/format_reward": 0.5416666716337204,
"step": 284
},
{
"completion_length": 2909.3751220703125,
"epoch": 0.32571428571428573,
"grad_norm": 0.543645441532135,
"kl": 0.51806640625,
"learning_rate": 5.186095868151436e-07,
"loss": 0.059,
"reward": 0.0715614715591073,
"reward_std": 0.6991735994815826,
"rewards/cosine_scaled_reward": -0.22463593445718288,
"rewards/format_reward": 0.5208333358168602,
"step": 285
},
{
"completion_length": 2623.5000610351562,
"epoch": 0.32685714285714285,
"grad_norm": 1.0876595973968506,
"kl": 0.3642578125,
"learning_rate": 5.154764373429315e-07,
"loss": 0.0895,
"reward": 0.7619921118021011,
"reward_std": 1.0285737365484238,
"rewards/cosine_scaled_reward": 0.047662717290222645,
"rewards/format_reward": 0.6666666865348816,
"step": 286
},
{
"completion_length": 2762.666748046875,
"epoch": 0.328,
"grad_norm": 0.7187138795852661,
"kl": 0.50048828125,
"learning_rate": 5.123449705004581e-07,
"loss": 0.043,
"reward": 0.5433498155325651,
"reward_std": 0.6913661956787109,
"rewards/cosine_scaled_reward": -0.061658430844545364,
"rewards/format_reward": 0.6666666865348816,
"step": 287
},
{
"completion_length": 2270.8333740234375,
"epoch": 0.3291428571428571,
"grad_norm": 0.34955894947052,
"kl": 0.260986328125,
"learning_rate": 5.09215338910999e-07,
"loss": 0.019,
"reward": 0.9035947173833847,
"reward_std": 0.8012775778770447,
"rewards/cosine_scaled_reward": -0.006535984575748444,
"rewards/format_reward": 0.9166666865348816,
"step": 288
},
{
"completion_length": 2480.8541870117188,
"epoch": 0.3302857142857143,
"grad_norm": 1.0728695392608643,
"kl": 0.474609375,
"learning_rate": 5.060876951083828e-07,
"loss": 0.0877,
"reward": 0.5563938245177269,
"reward_std": 0.8119515627622604,
"rewards/cosine_scaled_reward": -0.06555308337556198,
"rewards/format_reward": 0.6875000223517418,
"step": 289
},
{
"completion_length": 2005.3542175292969,
"epoch": 0.3314285714285714,
"grad_norm": 2.5518229007720947,
"kl": 0.4202880859375,
"learning_rate": 5.02962191529556e-07,
"loss": 0.1377,
"reward": 1.0121518671512604,
"reward_std": 1.0199929028749466,
"rewards/cosine_scaled_reward": 0.14149258099496365,
"rewards/format_reward": 0.7291666865348816,
"step": 290
},
{
"completion_length": 1837.25,
"epoch": 0.3325714285714286,
"grad_norm": 0.5082411766052246,
"kl": 0.318115234375,
"learning_rate": 4.998389805071536e-07,
"loss": -0.0025,
"reward": 0.5244562700390816,
"reward_std": 0.8083207458257675,
"rewards/cosine_scaled_reward": -0.09193855058401823,
"rewards/format_reward": 0.7083333432674408,
"step": 291
},
{
"completion_length": 2516.000030517578,
"epoch": 0.33371428571428574,
"grad_norm": 0.6963807344436646,
"kl": 0.496826171875,
"learning_rate": 4.967182142620745e-07,
"loss": 0.0554,
"reward": 0.6148294545710087,
"reward_std": 0.7742474526166916,
"rewards/cosine_scaled_reward": -0.025918614119291306,
"rewards/format_reward": 0.666666679084301,
"step": 292
},
{
"completion_length": 2563.354248046875,
"epoch": 0.33485714285714285,
"grad_norm": 0.4553970992565155,
"kl": 0.64111328125,
"learning_rate": 4.93600044896063e-07,
"loss": 0.08,
"reward": 0.4226888967677951,
"reward_std": 0.8445644974708557,
"rewards/cosine_scaled_reward": -0.12198889185674489,
"rewards/format_reward": 0.666666679084301,
"step": 293
},
{
"completion_length": 2474.6459350585938,
"epoch": 0.336,
"grad_norm": 0.5785382390022278,
"kl": 0.543212890625,
"learning_rate": 4.904846243842949e-07,
"loss": 0.0498,
"reward": 0.7478385232388973,
"reward_std": 0.7380570024251938,
"rewards/cosine_scaled_reward": 0.08225257322192192,
"rewards/format_reward": 0.5833333432674408,
"step": 294
},
{
"completion_length": 2818.1043090820312,
"epoch": 0.33714285714285713,
"grad_norm": 1.9920473098754883,
"kl": 1.005859375,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0599,
"reward": 0.38695642724633217,
"reward_std": 0.8360127806663513,
"rewards/cosine_scaled_reward": -0.0461051338352263,
"rewards/format_reward": 0.4791666716337204,
"step": 295
},
{
"completion_length": 2180.6875610351562,
"epoch": 0.3382857142857143,
"grad_norm": 1.0185471773147583,
"kl": 0.60888671875,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0929,
"reward": 0.9686335474252701,
"reward_std": 0.9049602597951889,
"rewards/cosine_scaled_reward": 0.1405667569488287,
"rewards/format_reward": 0.6875000223517418,
"step": 296
},
{
"completion_length": 2705.2709350585938,
"epoch": 0.3394285714285714,
"grad_norm": 1.4574670791625977,
"kl": 0.7529296875,
"learning_rate": 4.811563736721829e-07,
"loss": 0.0525,
"reward": 0.3473209235817194,
"reward_std": 0.7314907014369965,
"rewards/cosine_scaled_reward": -0.12842286378145218,
"rewards/format_reward": 0.6041666865348816,
"step": 297
},
{
"completion_length": 2661.5416870117188,
"epoch": 0.3405714285714286,
"grad_norm": 1.0324411392211914,
"kl": 0.779296875,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0626,
"reward": 0.44023372419178486,
"reward_std": 0.7127360999584198,
"rewards/cosine_scaled_reward": -0.04029981233179569,
"rewards/format_reward": 0.5208333507180214,
"step": 298
},
{
"completion_length": 2836.0626220703125,
"epoch": 0.3417142857142857,
"grad_norm": 1.2534230947494507,
"kl": 0.66015625,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0559,
"reward": 0.4187684841454029,
"reward_std": 0.7654632180929184,
"rewards/cosine_scaled_reward": -0.1031157523393631,
"rewards/format_reward": 0.6250000260770321,
"step": 299
},
{
"completion_length": 2193.541717529297,
"epoch": 0.34285714285714286,
"grad_norm": 1.023747444152832,
"kl": 0.4393310546875,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0057,
"reward": 0.7049860581755638,
"reward_std": 0.8015492558479309,
"rewards/cosine_scaled_reward": 0.07124301791191101,
"rewards/format_reward": 0.5625000223517418,
"step": 300
},
{
"completion_length": 2034.166748046875,
"epoch": 0.344,
"grad_norm": 1.0728156566619873,
"kl": 0.6123046875,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0087,
"reward": 0.5370926359901205,
"reward_std": 0.8870838582515717,
"rewards/cosine_scaled_reward": -0.05437035672366619,
"rewards/format_reward": 0.645833358168602,
"step": 301
},
{
"completion_length": 1496.7708587646484,
"epoch": 0.34514285714285714,
"grad_norm": 0.36257851123809814,
"kl": 0.46044921875,
"learning_rate": 4.656784084364238e-07,
"loss": -0.0228,
"reward": 0.484084477648139,
"reward_std": 0.7823295146226883,
"rewards/cosine_scaled_reward": -0.01837443746626377,
"rewards/format_reward": 0.5208333358168602,
"step": 302
},
{
"completion_length": 1376.9167175292969,
"epoch": 0.3462857142857143,
"grad_norm": 0.30551737546920776,
"kl": 0.42236328125,
"learning_rate": 4.6259454195101267e-07,
"loss": -0.0461,
"reward": 0.9217020869255066,
"reward_std": 0.7940811067819595,
"rewards/cosine_scaled_reward": 0.07543436251580715,
"rewards/format_reward": 0.7708333432674408,
"step": 303
},
{
"completion_length": 1413.708396911621,
"epoch": 0.3474285714285714,
"grad_norm": 0.9130037426948547,
"kl": 0.74609375,
"learning_rate": 4.59514935484316e-07,
"loss": -0.0368,
"reward": 0.7251628190279007,
"reward_std": 1.0211279392242432,
"rewards/cosine_scaled_reward": 0.05008140648715198,
"rewards/format_reward": 0.6250000074505806,
"step": 304
},
{
"completion_length": 1933.5208740234375,
"epoch": 0.3485714285714286,
"grad_norm": 0.6181937456130981,
"kl": 0.59716796875,
"learning_rate": 4.5643973913200837e-07,
"loss": -0.0665,
"reward": 0.6453933482989669,
"reward_std": 0.8129071295261383,
"rewards/cosine_scaled_reward": 0.03103000298142433,
"rewards/format_reward": 0.5833333432674408,
"step": 305
},
{
"completion_length": 1331.7917098999023,
"epoch": 0.3497142857142857,
"grad_norm": 0.2622654139995575,
"kl": 0.6375732421875,
"learning_rate": 4.5336910277482155e-07,
"loss": -0.0564,
"reward": 0.4545041471719742,
"reward_std": 0.6556018441915512,
"rewards/cosine_scaled_reward": -0.08524793572723866,
"rewards/format_reward": 0.6250000149011612,
"step": 306
},
{
"completion_length": 1522.2916870117188,
"epoch": 0.35085714285714287,
"grad_norm": 0.3843940198421478,
"kl": 0.647705078125,
"learning_rate": 4.503031760712397e-07,
"loss": -0.0408,
"reward": 0.9578620158135891,
"reward_std": 0.9549144953489304,
"rewards/cosine_scaled_reward": 0.15601433627307415,
"rewards/format_reward": 0.645833358168602,
"step": 307
},
{
"completion_length": 2036.0834045410156,
"epoch": 0.352,
"grad_norm": 0.8481309413909912,
"kl": 0.606689453125,
"learning_rate": 4.4724210845020494e-07,
"loss": -0.0199,
"reward": 0.631169930100441,
"reward_std": 0.7533179372549057,
"rewards/cosine_scaled_reward": -0.028165025636553764,
"rewards/format_reward": 0.6875000149011612,
"step": 308
},
{
"completion_length": 1487.1666870117188,
"epoch": 0.35314285714285715,
"grad_norm": 1.9852585792541504,
"kl": 0.5830078125,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0105,
"reward": 0.7891280353069305,
"reward_std": 0.8583121746778488,
"rewards/cosine_scaled_reward": 0.07164734601974487,
"rewards/format_reward": 0.645833358168602,
"step": 309
},
{
"completion_length": 1955.791748046875,
"epoch": 0.35428571428571426,
"grad_norm": 0.31575194001197815,
"kl": 0.184326171875,
"learning_rate": 4.4113514698014953e-07,
"loss": -0.0014,
"reward": 0.8256345121189952,
"reward_std": 0.7062153369188309,
"rewards/cosine_scaled_reward": 0.048233918845653534,
"rewards/format_reward": 0.7291666716337204,
"step": 310
},
{
"completion_length": 1666.0833740234375,
"epoch": 0.3554285714285714,
"grad_norm": 2.016129970550537,
"kl": 0.47119140625,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.072,
"reward": 0.8503673672676086,
"reward_std": 0.8861262649297714,
"rewards/cosine_scaled_reward": 0.08143368689343333,
"rewards/format_reward": 0.6875000298023224,
"step": 311
},
{
"completion_length": 1778.1041870117188,
"epoch": 0.3565714285714286,
"grad_norm": 2.5336270332336426,
"kl": 0.513916015625,
"learning_rate": 4.350494089288943e-07,
"loss": 0.0693,
"reward": 0.5695639494806528,
"reward_std": 0.7498121336102486,
"rewards/cosine_scaled_reward": -0.038134701550006866,
"rewards/format_reward": 0.645833358168602,
"step": 312
},
{
"completion_length": 2122.4791870117188,
"epoch": 0.3577142857142857,
"grad_norm": 0.3355765640735626,
"kl": 0.609619140625,
"learning_rate": 4.3201486961161093e-07,
"loss": -0.0237,
"reward": 0.7382938861846924,
"reward_std": 0.8554851859807968,
"rewards/cosine_scaled_reward": -0.005853069946169853,
"rewards/format_reward": 0.7500000298023224,
"step": 313
},
{
"completion_length": 2387.2083435058594,
"epoch": 0.3588571428571429,
"grad_norm": 3.036442756652832,
"kl": 0.231201171875,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.1037,
"reward": 0.8101449112291448,
"reward_std": 0.963694229722023,
"rewards/cosine_scaled_reward": 0.01965576596558094,
"rewards/format_reward": 0.770833358168602,
"step": 314
},
{
"completion_length": 2170.729217529297,
"epoch": 0.36,
"grad_norm": 1.4392133951187134,
"kl": 0.21209716796875,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.045,
"reward": 0.6554913818836212,
"reward_std": 1.1266003251075745,
"rewards/cosine_scaled_reward": -0.01600432489067316,
"rewards/format_reward": 0.6875000149011612,
"step": 315
},
{
"completion_length": 2317.1459350585938,
"epoch": 0.36114285714285715,
"grad_norm": 0.4884386658668518,
"kl": 0.36376953125,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0282,
"reward": 0.29845087230205536,
"reward_std": 0.6840033531188965,
"rewards/cosine_scaled_reward": -0.15285790944471955,
"rewards/format_reward": 0.6041666865348816,
"step": 316
},
{
"completion_length": 3088.2709350585938,
"epoch": 0.36228571428571427,
"grad_norm": 0.8027182817459106,
"kl": 0.3505859375,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0242,
"reward": 0.9088336080312729,
"reward_std": 1.000715285539627,
"rewards/cosine_scaled_reward": 0.1002501342445612,
"rewards/format_reward": 0.7083333432674408,
"step": 317
},
{
"completion_length": 2317.3750610351562,
"epoch": 0.36342857142857143,
"grad_norm": 0.327318012714386,
"kl": 0.3134765625,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.0385,
"reward": 0.6265020594000816,
"reward_std": 0.7293453440070152,
"rewards/cosine_scaled_reward": -0.040915639605373144,
"rewards/format_reward": 0.7083333432674408,
"step": 318
},
{
"completion_length": 2849.3333740234375,
"epoch": 0.36457142857142855,
"grad_norm": 1.7290736436843872,
"kl": 0.443359375,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.098,
"reward": 0.46177836135029793,
"reward_std": 0.9352491050958633,
"rewards/cosine_scaled_reward": -0.07119414396584034,
"rewards/format_reward": 0.604166679084301,
"step": 319
},
{
"completion_length": 2402.8750610351562,
"epoch": 0.3657142857142857,
"grad_norm": 1.1702836751937866,
"kl": 0.34814453125,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0587,
"reward": 0.5764410048723221,
"reward_std": 0.7314303368330002,
"rewards/cosine_scaled_reward": -0.055529496632516384,
"rewards/format_reward": 0.6875000149011612,
"step": 320
},
{
"completion_length": 2828.791748046875,
"epoch": 0.3668571428571429,
"grad_norm": 0.797664999961853,
"kl": 0.52001953125,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0412,
"reward": 0.4816475547850132,
"reward_std": 0.8193319886922836,
"rewards/cosine_scaled_reward": -0.050842900411225855,
"rewards/format_reward": 0.5833333432674408,
"step": 321
},
{
"completion_length": 2521.479248046875,
"epoch": 0.368,
"grad_norm": 1.1600196361541748,
"kl": 0.3974609375,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.024,
"reward": 0.8539287596940994,
"reward_std": 0.9238015562295914,
"rewards/cosine_scaled_reward": 0.020714368554763496,
"rewards/format_reward": 0.8125000149011612,
"step": 322
},
{
"completion_length": 2526.354278564453,
"epoch": 0.36914285714285716,
"grad_norm": 0.7439947128295898,
"kl": 0.40966796875,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0387,
"reward": 0.9395965822041035,
"reward_std": 0.7121690958738327,
"rewards/cosine_scaled_reward": 0.0947982706129551,
"rewards/format_reward": 0.7500000298023224,
"step": 323
},
{
"completion_length": 2963.6666870117188,
"epoch": 0.3702857142857143,
"grad_norm": 0.7919374108314514,
"kl": 0.53271484375,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0574,
"reward": 0.3954196572303772,
"reward_std": 0.7907533347606659,
"rewards/cosine_scaled_reward": -0.0939568355679512,
"rewards/format_reward": 0.583333358168602,
"step": 324
},
{
"completion_length": 2059.416748046875,
"epoch": 0.37142857142857144,
"grad_norm": 0.7337906956672668,
"kl": 0.30908203125,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0437,
"reward": 0.6482492443174124,
"reward_std": 0.976516529917717,
"rewards/cosine_scaled_reward": -0.050875378074124455,
"rewards/format_reward": 0.7500000149011612,
"step": 325
},
{
"completion_length": 2717.5001220703125,
"epoch": 0.37257142857142855,
"grad_norm": 0.7754512429237366,
"kl": 0.4609375,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0804,
"reward": 0.5230683460831642,
"reward_std": 0.7168317809700966,
"rewards/cosine_scaled_reward": -0.09263250115327537,
"rewards/format_reward": 0.708333358168602,
"step": 326
},
{
"completion_length": 2365.666732788086,
"epoch": 0.3737142857142857,
"grad_norm": 0.9611565470695496,
"kl": 0.370513916015625,
"learning_rate": 3.902018669163384e-07,
"loss": 0.013,
"reward": 0.8529483936727047,
"reward_std": 0.787610650062561,
"rewards/cosine_scaled_reward": 0.05147417262196541,
"rewards/format_reward": 0.7500000149011612,
"step": 327
},
{
"completion_length": 2920.5000610351562,
"epoch": 0.37485714285714283,
"grad_norm": 1.1496500968933105,
"kl": 0.568359375,
"learning_rate": 3.872689434630585e-07,
"loss": 0.1313,
"reward": 0.5756548047065735,
"reward_std": 1.1168714761734009,
"rewards/cosine_scaled_reward": -0.04550594184547663,
"rewards/format_reward": 0.6666666865348816,
"step": 328
},
{
"completion_length": 2694.229248046875,
"epoch": 0.376,
"grad_norm": 1.6449869871139526,
"kl": 0.4189453125,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0905,
"reward": 0.607914388179779,
"reward_std": 0.9643268138170242,
"rewards/cosine_scaled_reward": -0.0918761616339907,
"rewards/format_reward": 0.7916666865348816,
"step": 329
},
{
"completion_length": 2766.041748046875,
"epoch": 0.37714285714285717,
"grad_norm": 0.8693978190422058,
"kl": 0.56396484375,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0817,
"reward": 0.42995208874344826,
"reward_std": 0.9052233844995499,
"rewards/cosine_scaled_reward": -0.07669062539935112,
"rewards/format_reward": 0.5833333432674408,
"step": 330
},
{
"completion_length": 2704.2084350585938,
"epoch": 0.3782857142857143,
"grad_norm": 0.6593329906463623,
"kl": 0.43994140625,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0481,
"reward": 0.5416111797094345,
"reward_std": 0.7576990574598312,
"rewards/cosine_scaled_reward": -0.07294442504644394,
"rewards/format_reward": 0.6875000149011612,
"step": 331
},
{
"completion_length": 2430.1458740234375,
"epoch": 0.37942857142857145,
"grad_norm": 1.1451934576034546,
"kl": 0.4638671875,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0949,
"reward": 0.9672386646270752,
"reward_std": 0.9684969633817673,
"rewards/cosine_scaled_reward": 0.13986931554973125,
"rewards/format_reward": 0.6875000298023224,
"step": 332
},
{
"completion_length": 2586.3959045410156,
"epoch": 0.38057142857142856,
"grad_norm": 1.2027528285980225,
"kl": 0.5546875,
"learning_rate": 3.72726140684072e-07,
"loss": 0.0376,
"reward": 0.24384124111384153,
"reward_std": 0.6339670419692993,
"rewards/cosine_scaled_reward": -0.2218293957412243,
"rewards/format_reward": 0.6875000149011612,
"step": 333
},
{
"completion_length": 2716.5208740234375,
"epoch": 0.38171428571428573,
"grad_norm": 0.5679751634597778,
"kl": 0.46875,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0595,
"reward": 0.49158087372779846,
"reward_std": 0.6254527196288109,
"rewards/cosine_scaled_reward": -0.07712622173130512,
"rewards/format_reward": 0.645833358168602,
"step": 334
},
{
"completion_length": 2559.4583740234375,
"epoch": 0.38285714285714284,
"grad_norm": 0.4788146913051605,
"kl": 0.447509765625,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0589,
"reward": 0.47583791986107826,
"reward_std": 0.6539599671959877,
"rewards/cosine_scaled_reward": -0.08499772474169731,
"rewards/format_reward": 0.6458333432674408,
"step": 335
},
{
"completion_length": 2945.4584350585938,
"epoch": 0.384,
"grad_norm": 0.6187959313392639,
"kl": 0.59814453125,
"learning_rate": 3.641030065789562e-07,
"loss": 0.1016,
"reward": 0.08771202201023698,
"reward_std": 0.7820224016904831,
"rewards/cosine_scaled_reward": -0.23739399760961533,
"rewards/format_reward": 0.5625000298023224,
"step": 336
},
{
"completion_length": 2430.8958740234375,
"epoch": 0.3851428571428571,
"grad_norm": 0.7578234672546387,
"kl": 0.46826171875,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0748,
"reward": 0.5553858801722527,
"reward_std": 0.7994070649147034,
"rewards/cosine_scaled_reward": -0.06605706363916397,
"rewards/format_reward": 0.6875000149011612,
"step": 337
},
{
"completion_length": 2227.916717529297,
"epoch": 0.3862857142857143,
"grad_norm": 0.8869759440422058,
"kl": 0.354248046875,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0831,
"reward": 0.7496502324938774,
"reward_std": 0.8079821169376373,
"rewards/cosine_scaled_reward": -0.0001748921349644661,
"rewards/format_reward": 0.7500000223517418,
"step": 338
},
{
"completion_length": 2985.3334350585938,
"epoch": 0.38742857142857146,
"grad_norm": 1.4707542657852173,
"kl": 0.666015625,
"learning_rate": 3.555614130391079e-07,
"loss": 0.1233,
"reward": 0.36759741231799126,
"reward_std": 0.8881158977746964,
"rewards/cosine_scaled_reward": -0.06620129197835922,
"rewards/format_reward": 0.5000000149011612,
"step": 339
},
{
"completion_length": 2439.7501220703125,
"epoch": 0.38857142857142857,
"grad_norm": 2.691328287124634,
"kl": 0.453125,
"learning_rate": 3.5273298394491515e-07,
"loss": -0.0493,
"reward": 1.0150221139192581,
"reward_std": 0.9879051297903061,
"rewards/cosine_scaled_reward": 0.11167772859334946,
"rewards/format_reward": 0.7916666865348816,
"step": 340
},
{
"completion_length": 2257.937530517578,
"epoch": 0.38971428571428574,
"grad_norm": 0.7236793637275696,
"kl": 0.3848876953125,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0485,
"reward": 1.5515939444303513,
"reward_std": 0.958163395524025,
"rewards/cosine_scaled_reward": 0.35913030058145523,
"rewards/format_reward": 0.8333333432674408,
"step": 341
},
{
"completion_length": 2541.0834045410156,
"epoch": 0.39085714285714285,
"grad_norm": 0.982089102268219,
"kl": 0.48095703125,
"learning_rate": 3.471051066897562e-07,
"loss": 0.0531,
"reward": 0.5335123301483691,
"reward_std": 0.8991846293210983,
"rewards/cosine_scaled_reward": -0.09782716228437494,
"rewards/format_reward": 0.7291666865348816,
"step": 342
},
{
"completion_length": 2201.8125915527344,
"epoch": 0.392,
"grad_norm": 3.367811918258667,
"kl": 0.84130859375,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.0659,
"reward": 1.025502122938633,
"reward_std": 0.8074321299791336,
"rewards/cosine_scaled_reward": 0.11691772192716599,
"rewards/format_reward": 0.7916666865348816,
"step": 343
},
{
"completion_length": 2793.7501220703125,
"epoch": 0.3931428571428571,
"grad_norm": 0.6109259724617004,
"kl": 0.50537109375,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.0712,
"reward": 0.599671695381403,
"reward_std": 0.8611319363117218,
"rewards/cosine_scaled_reward": -0.04391413927078247,
"rewards/format_reward": 0.6875000149011612,
"step": 344
},
{
"completion_length": 2314.166778564453,
"epoch": 0.3942857142857143,
"grad_norm": 0.6686170697212219,
"kl": 0.5712890625,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0395,
"reward": 0.634972408413887,
"reward_std": 0.6707823574542999,
"rewards/cosine_scaled_reward": -0.05751381441950798,
"rewards/format_reward": 0.7500000149011612,
"step": 345
},
{
"completion_length": 2902.9584350585938,
"epoch": 0.3954285714285714,
"grad_norm": 0.6067929863929749,
"kl": 0.57958984375,
"learning_rate": 3.359691059183761e-07,
"loss": 0.1087,
"reward": 0.4132253248244524,
"reward_std": 0.8897982537746429,
"rewards/cosine_scaled_reward": -0.05380401201546192,
"rewards/format_reward": 0.5208333507180214,
"step": 346
},
{
"completion_length": 1998.2083740234375,
"epoch": 0.3965714285714286,
"grad_norm": 0.9779978394508362,
"kl": 0.2724609375,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0409,
"reward": 1.1842745244503021,
"reward_std": 1.0255057215690613,
"rewards/cosine_scaled_reward": 0.17547059804201126,
"rewards/format_reward": 0.8333333432674408,
"step": 347
},
{
"completion_length": 2504.416748046875,
"epoch": 0.3977142857142857,
"grad_norm": 0.7763749957084656,
"kl": 0.45556640625,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0586,
"reward": 0.6250789314508438,
"reward_std": 0.745910570025444,
"rewards/cosine_scaled_reward": -0.020793883129954338,
"rewards/format_reward": 0.6666666865348816,
"step": 348
},
{
"completion_length": 1897.9375915527344,
"epoch": 0.39885714285714285,
"grad_norm": 0.759898841381073,
"kl": 0.2515869140625,
"learning_rate": 3.2772616003709616e-07,
"loss": -0.0009,
"reward": 1.2401193976402283,
"reward_std": 0.7767119854688644,
"rewards/cosine_scaled_reward": 0.18255970953032374,
"rewards/format_reward": 0.8750000149011612,
"step": 349
},
{
"completion_length": 1845.9792175292969,
"epoch": 0.4,
"grad_norm": 0.5678505301475525,
"kl": 0.2552642822265625,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0331,
"reward": 1.1045997142791748,
"reward_std": 0.6993750482797623,
"rewards/cosine_scaled_reward": 0.13563317246735096,
"rewards/format_reward": 0.8333333432674408,
"step": 350
},
{
"completion_length": 2162.1250610351562,
"epoch": 0.40114285714285713,
"grad_norm": 0.8248549699783325,
"kl": 0.34246826171875,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0701,
"reward": 0.6730905398726463,
"reward_std": 1.0314117968082428,
"rewards/cosine_scaled_reward": -0.03845473984256387,
"rewards/format_reward": 0.75,
"step": 351
},
{
"completion_length": 2398.8750915527344,
"epoch": 0.4022857142857143,
"grad_norm": 0.7086507678031921,
"kl": 0.337158203125,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0512,
"reward": 0.5578571353107691,
"reward_std": 0.8292429894208908,
"rewards/cosine_scaled_reward": -0.10648808628320694,
"rewards/format_reward": 0.770833358168602,
"step": 352
},
{
"completion_length": 2411.541778564453,
"epoch": 0.4034285714285714,
"grad_norm": 0.43448832631111145,
"kl": 0.3551025390625,
"learning_rate": 3.168878457820915e-07,
"loss": 0.032,
"reward": 0.7701159529387951,
"reward_std": 0.8441641330718994,
"rewards/cosine_scaled_reward": 0.010057959705591202,
"rewards/format_reward": 0.75,
"step": 353
},
{
"completion_length": 2516.8750915527344,
"epoch": 0.4045714285714286,
"grad_norm": 0.47943782806396484,
"kl": 0.382568359375,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0606,
"reward": 0.435189101845026,
"reward_std": 0.6631861850619316,
"rewards/cosine_scaled_reward": -0.13657212257385254,
"rewards/format_reward": 0.708333358168602,
"step": 354
},
{
"completion_length": 1538.3125610351562,
"epoch": 0.4057142857142857,
"grad_norm": 0.3774828314781189,
"kl": 0.3017578125,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0368,
"reward": 0.8316129595041275,
"reward_std": 0.5808935090899467,
"rewards/cosine_scaled_reward": -0.021693539805710316,
"rewards/format_reward": 0.8750000149011612,
"step": 355
},
{
"completion_length": 2109.8333435058594,
"epoch": 0.40685714285714286,
"grad_norm": 0.3181619346141815,
"kl": 0.30126953125,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.0337,
"reward": 0.5732035748660564,
"reward_std": 0.6602266579866409,
"rewards/cosine_scaled_reward": -0.057148221880197525,
"rewards/format_reward": 0.6875000298023224,
"step": 356
},
{
"completion_length": 2442.1458740234375,
"epoch": 0.408,
"grad_norm": 0.8465009927749634,
"kl": 0.5537109375,
"learning_rate": 3.062313053727671e-07,
"loss": 0.0438,
"reward": 0.5404957421123981,
"reward_std": 0.6692793369293213,
"rewards/cosine_scaled_reward": -0.08391880989074707,
"rewards/format_reward": 0.708333358168602,
"step": 357
},
{
"completion_length": 2172.5001220703125,
"epoch": 0.40914285714285714,
"grad_norm": 0.5915915966033936,
"kl": 0.2880859375,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.04,
"reward": 0.9776165038347244,
"reward_std": 0.8002345710992813,
"rewards/cosine_scaled_reward": 0.07214158028364182,
"rewards/format_reward": 0.833333358168602,
"step": 358
},
{
"completion_length": 1994.7709350585938,
"epoch": 0.4102857142857143,
"grad_norm": 0.5695796608924866,
"kl": 0.33642578125,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0421,
"reward": 0.5635941876098514,
"reward_std": 0.682354062795639,
"rewards/cosine_scaled_reward": -0.08278624271042645,
"rewards/format_reward": 0.7291666716337204,
"step": 359
},
{
"completion_length": 1582.5625305175781,
"epoch": 0.4114285714285714,
"grad_norm": 0.6911218166351318,
"kl": 0.187103271484375,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.038,
"reward": 0.9810230135917664,
"reward_std": 0.6732440888881683,
"rewards/cosine_scaled_reward": 0.03217813931405544,
"rewards/format_reward": 0.9166666865348816,
"step": 360
},
{
"completion_length": 1716.8541870117188,
"epoch": 0.4125714285714286,
"grad_norm": 0.755465567111969,
"kl": 0.2716064453125,
"learning_rate": 2.9576484845877793e-07,
"loss": -0.0037,
"reward": 0.4921398665755987,
"reward_std": 0.7469517663121223,
"rewards/cosine_scaled_reward": -0.10809672623872757,
"rewards/format_reward": 0.7083333432674408,
"step": 361
},
{
"completion_length": 2381.7708435058594,
"epoch": 0.4137142857142857,
"grad_norm": 0.4649311900138855,
"kl": 0.435546875,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0655,
"reward": 0.3485546410083771,
"reward_std": 0.8100304752588272,
"rewards/cosine_scaled_reward": -0.13822269346565008,
"rewards/format_reward": 0.6250000149011612,
"step": 362
},
{
"completion_length": 2278.6876220703125,
"epoch": 0.41485714285714287,
"grad_norm": 0.38487836718559265,
"kl": 0.3544921875,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.0483,
"reward": 0.6228149347007275,
"reward_std": 0.7660052478313446,
"rewards/cosine_scaled_reward": -0.05317586287856102,
"rewards/format_reward": 0.7291666865348816,
"step": 363
},
{
"completion_length": 1783.0834045410156,
"epoch": 0.416,
"grad_norm": 0.6700667142868042,
"kl": 0.27978515625,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.006,
"reward": 0.5264641232788563,
"reward_std": 0.7023270279169083,
"rewards/cosine_scaled_reward": -0.12218462734017521,
"rewards/format_reward": 0.7708333432674408,
"step": 364
},
{
"completion_length": 1910.2500305175781,
"epoch": 0.41714285714285715,
"grad_norm": 0.7392496466636658,
"kl": 0.290771484375,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0333,
"reward": 0.8516478016972542,
"reward_std": 0.938531182706356,
"rewards/cosine_scaled_reward": 0.009157223626971245,
"rewards/format_reward": 0.8333333432674408,
"step": 365
},
{
"completion_length": 2063.8958740234375,
"epoch": 0.41828571428571426,
"grad_norm": 1.9315472841262817,
"kl": 0.2879638671875,
"learning_rate": 2.829615010283344e-07,
"loss": 0.068,
"reward": 0.9369229730218649,
"reward_std": 0.8918980956077576,
"rewards/cosine_scaled_reward": 0.09346149861812592,
"rewards/format_reward": 0.7500000298023224,
"step": 366
},
{
"completion_length": 1400.2917175292969,
"epoch": 0.41942857142857143,
"grad_norm": 0.2165093868970871,
"kl": 0.1763916015625,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0193,
"reward": 0.9957811124622822,
"reward_std": 0.45480820536613464,
"rewards/cosine_scaled_reward": 0.04997388273477554,
"rewards/format_reward": 0.8958333432674408,
"step": 367
},
{
"completion_length": 1647.7916717529297,
"epoch": 0.4205714285714286,
"grad_norm": 0.7413077354431152,
"kl": 0.174774169921875,
"learning_rate": 2.7793039831193133e-07,
"loss": -0.0034,
"reward": 0.8528083562850952,
"reward_std": 0.8265992403030396,
"rewards/cosine_scaled_reward": 0.009737495332956314,
"rewards/format_reward": 0.833333358168602,
"step": 368
},
{
"completion_length": 1487.3958435058594,
"epoch": 0.4217142857142857,
"grad_norm": 0.6509503722190857,
"kl": 0.12530517578125,
"learning_rate": 2.7543467624442956e-07,
"loss": -0.0257,
"reward": 0.9031364023685455,
"reward_std": 0.9219841361045837,
"rewards/cosine_scaled_reward": 0.03490149416029453,
"rewards/format_reward": 0.833333358168602,
"step": 369
},
{
"completion_length": 2323.229248046875,
"epoch": 0.4228571428571429,
"grad_norm": 1.1870368719100952,
"kl": 0.2625732421875,
"learning_rate": 2.729523361034538e-07,
"loss": -0.0417,
"reward": 0.7300510033965111,
"reward_std": 0.8341569006443024,
"rewards/cosine_scaled_reward": -0.051641182973980904,
"rewards/format_reward": 0.833333358168602,
"step": 370
},
{
"completion_length": 1885.0834045410156,
"epoch": 0.424,
"grad_norm": 0.3413795232772827,
"kl": 0.232666015625,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0358,
"reward": 0.33694031462073326,
"reward_std": 0.7036072686314583,
"rewards/cosine_scaled_reward": -0.21694651246070862,
"rewards/format_reward": 0.7708333432674408,
"step": 371
},
{
"completion_length": 2071.812530517578,
"epoch": 0.42514285714285716,
"grad_norm": 0.9272376894950867,
"kl": 0.242919921875,
"learning_rate": 2.6802828488599294e-07,
"loss": -0.0016,
"reward": 0.9880311861634254,
"reward_std": 0.629561685025692,
"rewards/cosine_scaled_reward": 0.025265559554100037,
"rewards/format_reward": 0.9375000149011612,
"step": 372
},
{
"completion_length": 2372.0834045410156,
"epoch": 0.42628571428571427,
"grad_norm": 0.8849138617515564,
"kl": 0.249755859375,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0062,
"reward": 0.7052676677703857,
"reward_std": 0.6477234065532684,
"rewards/cosine_scaled_reward": -0.07444952987134457,
"rewards/format_reward": 0.8541667014360428,
"step": 373
},
{
"completion_length": 2331.8125610351562,
"epoch": 0.42742857142857144,
"grad_norm": 0.5580031275749207,
"kl": 0.309814453125,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0456,
"reward": 0.6995935346931219,
"reward_std": 0.7008600682020187,
"rewards/cosine_scaled_reward": 0.00604674918577075,
"rewards/format_reward": 0.6875000223517418,
"step": 374
},
{
"completion_length": 1906.2083740234375,
"epoch": 0.42857142857142855,
"grad_norm": 0.5966392755508423,
"kl": 0.3814697265625,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0463,
"reward": 0.7689145356416702,
"reward_std": 0.7337282001972198,
"rewards/cosine_scaled_reward": -0.02179272472858429,
"rewards/format_reward": 0.8125000298023224,
"step": 375
},
{
"completion_length": 1971.3125915527344,
"epoch": 0.4297142857142857,
"grad_norm": 1.3154016733169556,
"kl": 0.175048828125,
"learning_rate": 2.583460445215911e-07,
"loss": 0.0574,
"reward": 0.968916192650795,
"reward_std": 0.9032018631696701,
"rewards/cosine_scaled_reward": 0.0677914135158062,
"rewards/format_reward": 0.8333333432674408,
"step": 376
},
{
"completion_length": 2224.666748046875,
"epoch": 0.4308571428571429,
"grad_norm": 0.892139196395874,
"kl": 0.1807861328125,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0031,
"reward": 1.009105697274208,
"reward_std": 0.9417294263839722,
"rewards/cosine_scaled_reward": 0.09830283187329769,
"rewards/format_reward": 0.8125000298023224,
"step": 377
},
{
"completion_length": 2115.2709045410156,
"epoch": 0.432,
"grad_norm": 0.9765793085098267,
"kl": 0.26611328125,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.0684,
"reward": 0.5737282857298851,
"reward_std": 0.6101915389299393,
"rewards/cosine_scaled_reward": -0.12980252876877785,
"rewards/format_reward": 0.8333333432674408,
"step": 378
},
{
"completion_length": 1658.2916870117188,
"epoch": 0.43314285714285716,
"grad_norm": 0.20954985916614532,
"kl": 0.232666015625,
"learning_rate": 2.512332043064913e-07,
"loss": 0.0026,
"reward": 0.6455265134572983,
"reward_std": 0.5983955562114716,
"rewards/cosine_scaled_reward": -0.08348675072193146,
"rewards/format_reward": 0.8125000298023224,
"step": 379
},
{
"completion_length": 2212.4375610351562,
"epoch": 0.4342857142857143,
"grad_norm": 1.3722639083862305,
"kl": 0.3023681640625,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0993,
"reward": 0.5304721817374229,
"reward_std": 0.7781679779291153,
"rewards/cosine_scaled_reward": -0.10976393148303032,
"rewards/format_reward": 0.7500000149011612,
"step": 380
},
{
"completion_length": 1918.8750305175781,
"epoch": 0.43542857142857144,
"grad_norm": 0.7221528887748718,
"kl": 0.295806884765625,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0029,
"reward": 0.9501378051936626,
"reward_std": 0.6066517308354378,
"rewards/cosine_scaled_reward": 0.047985561192035675,
"rewards/format_reward": 0.8541666865348816,
"step": 381
},
{
"completion_length": 1793.6459045410156,
"epoch": 0.43657142857142855,
"grad_norm": 0.25511884689331055,
"kl": 0.2493896484375,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.0226,
"reward": 0.9860572461038828,
"reward_std": 0.6644920855760574,
"rewards/cosine_scaled_reward": 0.08677859604358673,
"rewards/format_reward": 0.8125000298023224,
"step": 382
},
{
"completion_length": 1990.2500915527344,
"epoch": 0.4377142857142857,
"grad_norm": 0.4499902129173279,
"kl": 0.25341796875,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.053,
"reward": 0.7591063939034939,
"reward_std": 0.5849988833069801,
"rewards/cosine_scaled_reward": -0.04753013700246811,
"rewards/format_reward": 0.8541667014360428,
"step": 383
},
{
"completion_length": 1889.8750610351562,
"epoch": 0.43885714285714283,
"grad_norm": 0.34465470910072327,
"kl": 0.23828125,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0019,
"reward": 0.626850601285696,
"reward_std": 0.5293265283107758,
"rewards/cosine_scaled_reward": -0.12407470063772053,
"rewards/format_reward": 0.8750000149011612,
"step": 384
},
{
"completion_length": 1649.5208587646484,
"epoch": 0.44,
"grad_norm": 1.0988309383392334,
"kl": 0.2174072265625,
"learning_rate": 2.374037332934512e-07,
"loss": 0.046,
"reward": 0.8215210735797882,
"reward_std": 0.7156432569026947,
"rewards/cosine_scaled_reward": -0.005906133679673076,
"rewards/format_reward": 0.833333358168602,
"step": 385
},
{
"completion_length": 1869.666748046875,
"epoch": 0.44114285714285717,
"grad_norm": 0.31057262420654297,
"kl": 0.213623046875,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0312,
"reward": 0.7544382140040398,
"reward_std": 0.5287479311227798,
"rewards/cosine_scaled_reward": -0.09153091069310904,
"rewards/format_reward": 0.9375000149011612,
"step": 386
},
{
"completion_length": 2174.000030517578,
"epoch": 0.4422857142857143,
"grad_norm": 0.7334949374198914,
"kl": 0.2723388671875,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.0412,
"reward": 0.7021404728293419,
"reward_std": 0.8102448135614395,
"rewards/cosine_scaled_reward": -0.07601310685276985,
"rewards/format_reward": 0.8541666865348816,
"step": 387
},
{
"completion_length": 1764.0625305175781,
"epoch": 0.44342857142857145,
"grad_norm": 0.8506814241409302,
"kl": 0.211181640625,
"learning_rate": 2.306931685585657e-07,
"loss": 0.0326,
"reward": 0.9473480954766273,
"reward_std": 0.7040945738554001,
"rewards/cosine_scaled_reward": 0.025757367722690105,
"rewards/format_reward": 0.895833358168602,
"step": 388
},
{
"completion_length": 1558.6875915527344,
"epoch": 0.44457142857142856,
"grad_norm": 1.0051478147506714,
"kl": 0.10626220703125,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0233,
"reward": 1.2105353027582169,
"reward_std": 0.7370782792568207,
"rewards/cosine_scaled_reward": 0.14693431742489338,
"rewards/format_reward": 0.9166666865348816,
"step": 389
},
{
"completion_length": 1673.0417175292969,
"epoch": 0.44571428571428573,
"grad_norm": 1.0045956373214722,
"kl": 0.324462890625,
"learning_rate": 2.2629708984760706e-07,
"loss": -0.0122,
"reward": 0.682011567056179,
"reward_std": 0.668542355298996,
"rewards/cosine_scaled_reward": -0.08607756206765771,
"rewards/format_reward": 0.8541666865348816,
"step": 390
},
{
"completion_length": 1759.604248046875,
"epoch": 0.44685714285714284,
"grad_norm": 0.8641379475593567,
"kl": 0.30419921875,
"learning_rate": 2.2412266235313973e-07,
"loss": -0.0151,
"reward": 0.40198634564876556,
"reward_std": 0.4891185835003853,
"rewards/cosine_scaled_reward": -0.23650683648884296,
"rewards/format_reward": 0.8750000149011612,
"step": 391
},
{
"completion_length": 1997.0208740234375,
"epoch": 0.448,
"grad_norm": 0.601497232913971,
"kl": 0.3251953125,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0246,
"reward": 1.31626558303833,
"reward_std": 0.8470017611980438,
"rewards/cosine_scaled_reward": 0.2206327999010682,
"rewards/format_reward": 0.8750000149011612,
"step": 392
},
{
"completion_length": 1767.3958740234375,
"epoch": 0.4491428571428571,
"grad_norm": 0.9790117740631104,
"kl": 0.20623779296875,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.0716,
"reward": 1.0628649685531855,
"reward_std": 0.7842252627015114,
"rewards/cosine_scaled_reward": 0.09393247216939926,
"rewards/format_reward": 0.8750000149011612,
"step": 393
},
{
"completion_length": 2281.5625610351562,
"epoch": 0.4502857142857143,
"grad_norm": 0.9092360138893127,
"kl": 0.2666015625,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0071,
"reward": 0.7091562300920486,
"reward_std": 0.6370756179094315,
"rewards/cosine_scaled_reward": -0.09333855286240578,
"rewards/format_reward": 0.8958333432674408,
"step": 394
},
{
"completion_length": 2072.7083740234375,
"epoch": 0.4514285714285714,
"grad_norm": 0.6948179006576538,
"kl": 0.335205078125,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0399,
"reward": 0.6186719592660666,
"reward_std": 0.8180225193500519,
"rewards/cosine_scaled_reward": -0.06566403433680534,
"rewards/format_reward": 0.7500000298023224,
"step": 395
},
{
"completion_length": 1713.0625305175781,
"epoch": 0.45257142857142857,
"grad_norm": 1.03392493724823,
"kl": 0.2850341796875,
"learning_rate": 2.134908592756607e-07,
"loss": 0.0576,
"reward": 0.6681124269962311,
"reward_std": 0.72493577003479,
"rewards/cosine_scaled_reward": -0.07219376973807812,
"rewards/format_reward": 0.8125000149011612,
"step": 396
},
{
"completion_length": 2008.166748046875,
"epoch": 0.45371428571428574,
"grad_norm": 1.2174099683761597,
"kl": 0.3359375,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.0821,
"reward": 1.3461299315094948,
"reward_std": 0.8196755945682526,
"rewards/cosine_scaled_reward": 0.2668149508535862,
"rewards/format_reward": 0.8125000298023224,
"step": 397
},
{
"completion_length": 1758.4167175292969,
"epoch": 0.45485714285714285,
"grad_norm": 0.7967256307601929,
"kl": 0.3011474609375,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0175,
"reward": 1.0533079504966736,
"reward_std": 0.9479693919420242,
"rewards/cosine_scaled_reward": 0.057903981767594814,
"rewards/format_reward": 0.9375000149011612,
"step": 398
},
{
"completion_length": 2110.0000610351562,
"epoch": 0.456,
"grad_norm": 0.6236258149147034,
"kl": 0.3653564453125,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0552,
"reward": 0.8325799964368343,
"reward_std": 0.6572683453559875,
"rewards/cosine_scaled_reward": -0.00037669437006115913,
"rewards/format_reward": 0.8333333432674408,
"step": 399
},
{
"completion_length": 1693.8333587646484,
"epoch": 0.45714285714285713,
"grad_norm": 0.5594977736473083,
"kl": 0.239166259765625,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0092,
"reward": 0.5645224675536156,
"reward_std": 0.47261467576026917,
"rewards/cosine_scaled_reward": -0.18648880254477262,
"rewards/format_reward": 0.9375000149011612,
"step": 400
},
{
"completion_length": 2298.4375610351562,
"epoch": 0.4582857142857143,
"grad_norm": 0.46592381596565247,
"kl": 0.5498046875,
"learning_rate": 2.032690407508949e-07,
"loss": 0.0651,
"reward": 0.7146447077393532,
"reward_std": 0.9194528758525848,
"rewards/cosine_scaled_reward": -0.05934431403875351,
"rewards/format_reward": 0.833333358168602,
"step": 401
},
{
"completion_length": 2858.3959350585938,
"epoch": 0.4594285714285714,
"grad_norm": 1.3920950889587402,
"kl": 0.701171875,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.0681,
"reward": 0.38334885984659195,
"reward_std": 0.6373907253146172,
"rewards/cosine_scaled_reward": -0.20415889844298363,
"rewards/format_reward": 0.7916666865348816,
"step": 402
},
{
"completion_length": 2565.5626220703125,
"epoch": 0.4605714285714286,
"grad_norm": 1.1024017333984375,
"kl": 0.625,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.029,
"reward": 0.801287055015564,
"reward_std": 0.897977739572525,
"rewards/cosine_scaled_reward": -0.03685649996623397,
"rewards/format_reward": 0.8750000298023224,
"step": 403
},
{
"completion_length": 2505.916748046875,
"epoch": 0.4617142857142857,
"grad_norm": 1.409442663192749,
"kl": 0.65576171875,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.0304,
"reward": 1.2360095381736755,
"reward_std": 0.7143290638923645,
"rewards/cosine_scaled_reward": 0.18050476163625717,
"rewards/format_reward": 0.8750000149011612,
"step": 404
},
{
"completion_length": 2441.0000610351562,
"epoch": 0.46285714285714286,
"grad_norm": 0.8860685229301453,
"kl": 0.64306640625,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0652,
"reward": 1.0503446012735367,
"reward_std": 0.8782050907611847,
"rewards/cosine_scaled_reward": 0.10850561456754804,
"rewards/format_reward": 0.833333358168602,
"step": 405
},
{
"completion_length": 2316.562530517578,
"epoch": 0.464,
"grad_norm": 0.9385198354721069,
"kl": 0.6611328125,
"learning_rate": 1.934696604901642e-07,
"loss": 0.039,
"reward": 0.8388771619647741,
"reward_std": 0.5718994289636612,
"rewards/cosine_scaled_reward": -0.007644776254892349,
"rewards/format_reward": 0.8541666865348816,
"step": 406
},
{
"completion_length": 2314.6459045410156,
"epoch": 0.46514285714285714,
"grad_norm": 1.216766357421875,
"kl": 0.55029296875,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0239,
"reward": 0.8419212326407433,
"reward_std": 0.65188068151474,
"rewards/cosine_scaled_reward": -0.037372760474681854,
"rewards/format_reward": 0.9166666865348816,
"step": 407
},
{
"completion_length": 2388.791717529297,
"epoch": 0.4662857142857143,
"grad_norm": 0.6723232865333557,
"kl": 0.4609375,
"learning_rate": 1.8967088307307e-07,
"loss": 0.048,
"reward": 1.100903958082199,
"reward_std": 0.7514118552207947,
"rewards/cosine_scaled_reward": 0.10253530507907271,
"rewards/format_reward": 0.8958333432674408,
"step": 408
},
{
"completion_length": 2204.958465576172,
"epoch": 0.4674285714285714,
"grad_norm": 0.9829697012901306,
"kl": 0.53759765625,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.0237,
"reward": 1.0897281467914581,
"reward_std": 0.4026891812682152,
"rewards/cosine_scaled_reward": 0.10736404359340668,
"rewards/format_reward": 0.8750000149011612,
"step": 409
},
{
"completion_length": 2508.729248046875,
"epoch": 0.4685714285714286,
"grad_norm": 1.1136001348495483,
"kl": 0.58837890625,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0388,
"reward": 0.4441644148901105,
"reward_std": 0.8706175982952118,
"rewards/cosine_scaled_reward": -0.1425011307001114,
"rewards/format_reward": 0.7291666865348816,
"step": 410
},
{
"completion_length": 2758.3543090820312,
"epoch": 0.4697142857142857,
"grad_norm": 1.1172066926956177,
"kl": 0.52685546875,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0214,
"reward": 0.38340113312005997,
"reward_std": 0.6312393695116043,
"rewards/cosine_scaled_reward": -0.235382791608572,
"rewards/format_reward": 0.8541667014360428,
"step": 411
},
{
"completion_length": 2535.416717529297,
"epoch": 0.47085714285714286,
"grad_norm": 1.1818182468414307,
"kl": 0.579833984375,
"learning_rate": 1.822847957491922e-07,
"loss": 0.028,
"reward": 0.8752952516078949,
"reward_std": 0.5417208820581436,
"rewards/cosine_scaled_reward": 0.02098093181848526,
"rewards/format_reward": 0.8333333432674408,
"step": 412
},
{
"completion_length": 2696.1250610351562,
"epoch": 0.472,
"grad_norm": 0.5541598796844482,
"kl": 0.5654296875,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0783,
"reward": 0.497568441554904,
"reward_std": 0.7255310416221619,
"rewards/cosine_scaled_reward": -0.10538244433701038,
"rewards/format_reward": 0.708333358168602,
"step": 413
},
{
"completion_length": 2673.8751220703125,
"epoch": 0.47314285714285714,
"grad_norm": 0.9568617343902588,
"kl": 0.53369140625,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0999,
"reward": 0.9199014604091644,
"reward_std": 0.8385901600122452,
"rewards/cosine_scaled_reward": 0.053700722055509686,
"rewards/format_reward": 0.8125000298023224,
"step": 414
},
{
"completion_length": 2954.5418090820312,
"epoch": 0.4742857142857143,
"grad_norm": 1.3337595462799072,
"kl": 0.607421875,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0298,
"reward": 0.843063585460186,
"reward_std": 0.9124226570129395,
"rewards/cosine_scaled_reward": 0.0048651136457920074,
"rewards/format_reward": 0.833333358168602,
"step": 415
},
{
"completion_length": 2910.4375610351562,
"epoch": 0.4754285714285714,
"grad_norm": 0.6592503786087036,
"kl": 0.61865234375,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0733,
"reward": 0.46045139618217945,
"reward_std": 0.8773138746619225,
"rewards/cosine_scaled_reward": -0.10310766100883484,
"rewards/format_reward": 0.6666666865348816,
"step": 416
},
{
"completion_length": 2641.2918090820312,
"epoch": 0.4765714285714286,
"grad_norm": 0.829136073589325,
"kl": 0.49462890625,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.057,
"reward": 0.9838578663766384,
"reward_std": 0.7910896837711334,
"rewards/cosine_scaled_reward": 0.054428933188319206,
"rewards/format_reward": 0.8750000298023224,
"step": 417
},
{
"completion_length": 2964.3541870117188,
"epoch": 0.4777142857142857,
"grad_norm": 0.9262496829032898,
"kl": 0.5478515625,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.0357,
"reward": 0.6633618324995041,
"reward_std": 0.6466763466596603,
"rewards/cosine_scaled_reward": -0.10581910982728004,
"rewards/format_reward": 0.8750000149011612,
"step": 418
},
{
"completion_length": 2773.5626220703125,
"epoch": 0.47885714285714287,
"grad_norm": 0.8558900952339172,
"kl": 0.49072265625,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0542,
"reward": 0.6305762082338333,
"reward_std": 0.7357209548354149,
"rewards/cosine_scaled_reward": -0.080545240547508,
"rewards/format_reward": 0.7916666716337204,
"step": 419
},
{
"completion_length": 2345.8959045410156,
"epoch": 0.48,
"grad_norm": 0.6529119610786438,
"kl": 0.3431396484375,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0226,
"reward": 1.2573866918683052,
"reward_std": 0.9116456806659698,
"rewards/cosine_scaled_reward": 0.1911933235824108,
"rewards/format_reward": 0.8750000149011612,
"step": 420
},
{
"completion_length": 2318.1875915527344,
"epoch": 0.48114285714285715,
"grad_norm": 0.6412160396575928,
"kl": 0.35498046875,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0124,
"reward": 1.0443747788667679,
"reward_std": 0.7097911983728409,
"rewards/cosine_scaled_reward": 0.09510404244065285,
"rewards/format_reward": 0.8541666865348816,
"step": 421
},
{
"completion_length": 2448.5833740234375,
"epoch": 0.48228571428571426,
"grad_norm": 0.6165621280670166,
"kl": 0.421875,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0385,
"reward": 0.7055833786725998,
"reward_std": 0.7713779509067535,
"rewards/cosine_scaled_reward": -0.053458321839571,
"rewards/format_reward": 0.8125000298023224,
"step": 422
},
{
"completion_length": 2370.479278564453,
"epoch": 0.48342857142857143,
"grad_norm": 1.0260326862335205,
"kl": 0.325927734375,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0657,
"reward": 0.8030254691839218,
"reward_std": 0.8349241316318512,
"rewards/cosine_scaled_reward": -0.015153962187469006,
"rewards/format_reward": 0.8333333432674408,
"step": 423
},
{
"completion_length": 2863.0833740234375,
"epoch": 0.4845714285714286,
"grad_norm": 0.8439249396324158,
"kl": 0.43115234375,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0713,
"reward": 0.4908841624855995,
"reward_std": 0.8119627386331558,
"rewards/cosine_scaled_reward": -0.15039126574993134,
"rewards/format_reward": 0.7916666865348816,
"step": 424
},
{
"completion_length": 2920.604248046875,
"epoch": 0.4857142857142857,
"grad_norm": 0.7168906927108765,
"kl": 0.455078125,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0577,
"reward": 0.8773088157176971,
"reward_std": 0.8730379045009613,
"rewards/cosine_scaled_reward": 0.032404396682977676,
"rewards/format_reward": 0.8125000149011612,
"step": 425
},
{
"completion_length": 2877.354248046875,
"epoch": 0.4868571428571429,
"grad_norm": 0.7351894974708557,
"kl": 0.3916015625,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.0252,
"reward": 1.0884526520967484,
"reward_std": 0.8330738395452499,
"rewards/cosine_scaled_reward": 0.10672629997134209,
"rewards/format_reward": 0.8750000298023224,
"step": 426
},
{
"completion_length": 2745.041748046875,
"epoch": 0.488,
"grad_norm": 0.4892515242099762,
"kl": 0.33447265625,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0358,
"reward": 1.0718627832829952,
"reward_std": 0.7832525819540024,
"rewards/cosine_scaled_reward": 0.11926471255719662,
"rewards/format_reward": 0.833333358168602,
"step": 427
},
{
"completion_length": 2932.1458740234375,
"epoch": 0.48914285714285716,
"grad_norm": 1.292845606803894,
"kl": 0.52685546875,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0264,
"reward": 0.31675857678055763,
"reward_std": 0.5401652418076992,
"rewards/cosine_scaled_reward": -0.21662072464823723,
"rewards/format_reward": 0.7500000074505806,
"step": 428
},
{
"completion_length": 2592.3334045410156,
"epoch": 0.49028571428571427,
"grad_norm": 0.6887741088867188,
"kl": 0.39111328125,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0134,
"reward": 0.8551270663738251,
"reward_std": 0.883497804403305,
"rewards/cosine_scaled_reward": -0.02035313844680786,
"rewards/format_reward": 0.8958333432674408,
"step": 429
},
{
"completion_length": 2997.7709350585938,
"epoch": 0.49142857142857144,
"grad_norm": 0.9550595283508301,
"kl": 0.4248046875,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0977,
"reward": 0.7853763314778917,
"reward_std": 0.862298920750618,
"rewards/cosine_scaled_reward": -0.013561863452196121,
"rewards/format_reward": 0.8125000298023224,
"step": 430
},
{
"completion_length": 2814.6459350585938,
"epoch": 0.49257142857142855,
"grad_norm": 0.35693833231925964,
"kl": 0.42578125,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.0465,
"reward": 0.7550955265760422,
"reward_std": 0.797643780708313,
"rewards/cosine_scaled_reward": -0.01828559674322605,
"rewards/format_reward": 0.7916666865348816,
"step": 431
},
{
"completion_length": 2680.0833740234375,
"epoch": 0.4937142857142857,
"grad_norm": 0.3660014867782593,
"kl": 0.42138671875,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0453,
"reward": 0.5384078100323677,
"reward_std": 0.6302113831043243,
"rewards/cosine_scaled_reward": -0.11621277220547199,
"rewards/format_reward": 0.7708333432674408,
"step": 432
},
{
"completion_length": 3197.0625610351562,
"epoch": 0.4948571428571429,
"grad_norm": 0.834852397441864,
"kl": 0.45703125,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0415,
"reward": 0.677655503153801,
"reward_std": 0.997919499874115,
"rewards/cosine_scaled_reward": -0.015338926576077938,
"rewards/format_reward": 0.7083333432674408,
"step": 433
},
{
"completion_length": 2018.541748046875,
"epoch": 0.496,
"grad_norm": 0.3951985836029053,
"kl": 0.17779541015625,
"learning_rate": 1.469297078922642e-07,
"loss": -0.0128,
"reward": 1.5104268491268158,
"reward_std": 0.6382196992635727,
"rewards/cosine_scaled_reward": 0.2760467454791069,
"rewards/format_reward": 0.9583333432674408,
"step": 434
},
{
"completion_length": 2781.45849609375,
"epoch": 0.49714285714285716,
"grad_norm": 0.8080605268478394,
"kl": 0.41552734375,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.072,
"reward": 0.5199687406420708,
"reward_std": 0.697292298078537,
"rewards/cosine_scaled_reward": -0.11501563712954521,
"rewards/format_reward": 0.7500000298023224,
"step": 435
},
{
"completion_length": 2910.9168090820312,
"epoch": 0.4982857142857143,
"grad_norm": 1.0082898139953613,
"kl": 0.31591796875,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.0441,
"reward": 1.0633302181959152,
"reward_std": 0.8466629385948181,
"rewards/cosine_scaled_reward": 0.06291508674621582,
"rewards/format_reward": 0.9375000149011612,
"step": 436
},
{
"completion_length": 2581.2500610351562,
"epoch": 0.49942857142857144,
"grad_norm": 0.5378354787826538,
"kl": 0.2705078125,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.003,
"reward": 1.0827649384737015,
"reward_std": 0.822308674454689,
"rewards/cosine_scaled_reward": 0.10388245154172182,
"rewards/format_reward": 0.8750000149011612,
"step": 437
},
{
"completion_length": 2723.1251220703125,
"epoch": 0.5005714285714286,
"grad_norm": 0.6586508750915527,
"kl": 0.340087890625,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.046,
"reward": 1.0363626778125763,
"reward_std": 0.9988095015287399,
"rewards/cosine_scaled_reward": 0.10151464305818081,
"rewards/format_reward": 0.8333333432674408,
"step": 438
},
{
"completion_length": 2458.041778564453,
"epoch": 0.5017142857142857,
"grad_norm": 0.6118423342704773,
"kl": 0.3319091796875,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.059,
"reward": 0.7599635235965252,
"reward_std": 0.6979039385914803,
"rewards/cosine_scaled_reward": -0.05751825252082199,
"rewards/format_reward": 0.8750000149011612,
"step": 439
},
{
"completion_length": 2812.7916870117188,
"epoch": 0.5028571428571429,
"grad_norm": 0.6263717412948608,
"kl": 0.346435546875,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0402,
"reward": 0.7473399192094803,
"reward_std": 0.7950000017881393,
"rewards/cosine_scaled_reward": -0.04299671063199639,
"rewards/format_reward": 0.833333358168602,
"step": 440
},
{
"completion_length": 2658.854217529297,
"epoch": 0.504,
"grad_norm": 0.48751676082611084,
"kl": 0.3270263671875,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0419,
"reward": 0.7070795819163322,
"reward_std": 0.773023784160614,
"rewards/cosine_scaled_reward": -0.04229356348514557,
"rewards/format_reward": 0.7916666716337204,
"step": 441
},
{
"completion_length": 2351.6250610351562,
"epoch": 0.5051428571428571,
"grad_norm": 0.5668932199478149,
"kl": 0.252685546875,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0475,
"reward": 0.8659966886043549,
"reward_std": 0.5813730582594872,
"rewards/cosine_scaled_reward": 0.016331655904650688,
"rewards/format_reward": 0.833333358168602,
"step": 442
},
{
"completion_length": 2956.729248046875,
"epoch": 0.5062857142857143,
"grad_norm": 0.3870391249656677,
"kl": 0.30859375,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0416,
"reward": 1.151278093457222,
"reward_std": 0.8103004992008209,
"rewards/cosine_scaled_reward": 0.11730570159852505,
"rewards/format_reward": 0.9166666865348816,
"step": 443
},
{
"completion_length": 2814.6876220703125,
"epoch": 0.5074285714285715,
"grad_norm": 0.5548789501190186,
"kl": 0.369140625,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0343,
"reward": 0.9690770208835602,
"reward_std": 0.9044716209173203,
"rewards/cosine_scaled_reward": 0.09912180341780186,
"rewards/format_reward": 0.770833358168602,
"step": 444
},
{
"completion_length": 2858.6875610351562,
"epoch": 0.5085714285714286,
"grad_norm": 0.7488447427749634,
"kl": 0.3701171875,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0152,
"reward": 0.6591560812667012,
"reward_std": 0.6855928599834442,
"rewards/cosine_scaled_reward": -0.035005307756364346,
"rewards/format_reward": 0.7291666865348816,
"step": 445
},
{
"completion_length": 2472.7500610351562,
"epoch": 0.5097142857142857,
"grad_norm": 0.5907102227210999,
"kl": 0.208251953125,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0325,
"reward": 1.3291829228401184,
"reward_std": 0.7747218981385231,
"rewards/cosine_scaled_reward": 0.206258125603199,
"rewards/format_reward": 0.9166666716337204,
"step": 446
},
{
"completion_length": 2428.1459350585938,
"epoch": 0.5108571428571429,
"grad_norm": 0.5603023171424866,
"kl": 0.2802734375,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0368,
"reward": 0.9525867849588394,
"reward_std": 0.712784081697464,
"rewards/cosine_scaled_reward": 0.038793399930000305,
"rewards/format_reward": 0.875,
"step": 447
},
{
"completion_length": 2589.3958740234375,
"epoch": 0.512,
"grad_norm": 0.9914929866790771,
"kl": 0.297607421875,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.0567,
"reward": 1.3133542239665985,
"reward_std": 1.0432665199041367,
"rewards/cosine_scaled_reward": 0.27126041799783707,
"rewards/format_reward": 0.770833358168602,
"step": 448
},
{
"completion_length": 2799.166748046875,
"epoch": 0.5131428571428571,
"grad_norm": 1.0846092700958252,
"kl": 0.4091796875,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0606,
"reward": 0.9647302851080894,
"reward_std": 0.7462186589837074,
"rewards/cosine_scaled_reward": 0.10736512392759323,
"rewards/format_reward": 0.7500000298023224,
"step": 449
},
{
"completion_length": 2719.916748046875,
"epoch": 0.5142857142857142,
"grad_norm": 0.5918545126914978,
"kl": 0.3916015625,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.014,
"reward": 1.0999898612499237,
"reward_std": 0.8317281156778336,
"rewards/cosine_scaled_reward": 0.15416158083826303,
"rewards/format_reward": 0.7916666865348816,
"step": 450
},
{
"completion_length": 2748.4584350585938,
"epoch": 0.5154285714285715,
"grad_norm": 1.2674349546432495,
"kl": 0.348876953125,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0753,
"reward": 0.851899653673172,
"reward_std": 0.9279103875160217,
"rewards/cosine_scaled_reward": 0.019699793308973312,
"rewards/format_reward": 0.8125000298023224,
"step": 451
},
{
"completion_length": 2946.291748046875,
"epoch": 0.5165714285714286,
"grad_norm": 0.9848341941833496,
"kl": 0.4384765625,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0244,
"reward": 0.7191433683037758,
"reward_std": 0.8444506227970123,
"rewards/cosine_scaled_reward": -0.0154283307492733,
"rewards/format_reward": 0.7500000149011612,
"step": 452
},
{
"completion_length": 2824.5000610351562,
"epoch": 0.5177142857142857,
"grad_norm": 1.562027931213379,
"kl": 0.450439453125,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.0678,
"reward": 0.9857252687215805,
"reward_std": 0.8770118951797485,
"rewards/cosine_scaled_reward": 0.1074459683150053,
"rewards/format_reward": 0.7708333432674408,
"step": 453
},
{
"completion_length": 2845.291748046875,
"epoch": 0.5188571428571429,
"grad_norm": 1.0593106746673584,
"kl": 0.399658203125,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0191,
"reward": 0.5798447616398335,
"reward_std": 0.7729413360357285,
"rewards/cosine_scaled_reward": -0.11632763035595417,
"rewards/format_reward": 0.8125,
"step": 454
},
{
"completion_length": 2406.979248046875,
"epoch": 0.52,
"grad_norm": 0.4025033712387085,
"kl": 0.32861328125,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0397,
"reward": 1.0016262233257294,
"reward_std": 0.6507641598582268,
"rewards/cosine_scaled_reward": 0.104979757219553,
"rewards/format_reward": 0.7916666865348816,
"step": 455
},
{
"completion_length": 2493.3125915527344,
"epoch": 0.5211428571428571,
"grad_norm": 0.6641373038291931,
"kl": 0.3935546875,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0496,
"reward": 0.8123725727200508,
"reward_std": 0.6888710185885429,
"rewards/cosine_scaled_reward": 0.04160293936729431,
"rewards/format_reward": 0.7291666865348816,
"step": 456
},
{
"completion_length": 2579.354248046875,
"epoch": 0.5222857142857142,
"grad_norm": 0.3551529347896576,
"kl": 0.35302734375,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0275,
"reward": 0.670621931552887,
"reward_std": 0.6615720614790916,
"rewards/cosine_scaled_reward": -0.06052236817777157,
"rewards/format_reward": 0.7916666865348816,
"step": 457
},
{
"completion_length": 2468.854217529297,
"epoch": 0.5234285714285715,
"grad_norm": 0.5066484212875366,
"kl": 0.423828125,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.0466,
"reward": 1.0128154456615448,
"reward_std": 0.9961100518703461,
"rewards/cosine_scaled_reward": 0.08974102255888283,
"rewards/format_reward": 0.8333333432674408,
"step": 458
},
{
"completion_length": 2763.354248046875,
"epoch": 0.5245714285714286,
"grad_norm": 0.7024835348129272,
"kl": 0.363037109375,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0672,
"reward": 0.604728564620018,
"reward_std": 0.7839554250240326,
"rewards/cosine_scaled_reward": -0.11430239118635654,
"rewards/format_reward": 0.833333358168602,
"step": 459
},
{
"completion_length": 2871.4584350585938,
"epoch": 0.5257142857142857,
"grad_norm": 0.6273028254508972,
"kl": 0.372314453125,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0527,
"reward": 1.0004199892282486,
"reward_std": 0.8981437683105469,
"rewards/cosine_scaled_reward": 0.1147933267056942,
"rewards/format_reward": 0.770833358168602,
"step": 460
},
{
"completion_length": 2718.2709350585938,
"epoch": 0.5268571428571428,
"grad_norm": 0.46946173906326294,
"kl": 0.447021484375,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0404,
"reward": 1.022796869277954,
"reward_std": 0.7989484220743179,
"rewards/cosine_scaled_reward": 0.12598175182938576,
"rewards/format_reward": 0.7708333432674408,
"step": 461
},
{
"completion_length": 2926.0416870117188,
"epoch": 0.528,
"grad_norm": 1.261118769645691,
"kl": 0.525390625,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0831,
"reward": 0.7424125671386719,
"reward_std": 0.9555595070123672,
"rewards/cosine_scaled_reward": -0.0037937182933092117,
"rewards/format_reward": 0.7500000149011612,
"step": 462
},
{
"completion_length": 2262.4376220703125,
"epoch": 0.5291428571428571,
"grad_norm": 0.5456348657608032,
"kl": 0.3070068359375,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.0543,
"reward": 1.0686239376664162,
"reward_std": 0.6754159927368164,
"rewards/cosine_scaled_reward": 0.1488952711224556,
"rewards/format_reward": 0.7708333432674408,
"step": 463
},
{
"completion_length": 3016.8958740234375,
"epoch": 0.5302857142857142,
"grad_norm": 1.5390175580978394,
"kl": 0.45947265625,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.1119,
"reward": 0.8216940313577652,
"reward_std": 1.1384240239858627,
"rewards/cosine_scaled_reward": 0.03584700915962458,
"rewards/format_reward": 0.7500000149011612,
"step": 464
},
{
"completion_length": 2775.0208740234375,
"epoch": 0.5314285714285715,
"grad_norm": 1.5516222715377808,
"kl": 0.47607421875,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0147,
"reward": 0.7128820940852165,
"reward_std": 0.8897013664245605,
"rewards/cosine_scaled_reward": -0.018558980314992368,
"rewards/format_reward": 0.7500000298023224,
"step": 465
},
{
"completion_length": 2875.3333740234375,
"epoch": 0.5325714285714286,
"grad_norm": 0.6315276622772217,
"kl": 0.55029296875,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0399,
"reward": 0.6401756927371025,
"reward_std": 0.7611015811562538,
"rewards/cosine_scaled_reward": -0.054912167601287365,
"rewards/format_reward": 0.7500000149011612,
"step": 466
},
{
"completion_length": 2514.8750610351562,
"epoch": 0.5337142857142857,
"grad_norm": 0.43570035696029663,
"kl": 0.39990234375,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0301,
"reward": 0.6949951946735382,
"reward_std": 0.7680038511753082,
"rewards/cosine_scaled_reward": -0.06916908174753189,
"rewards/format_reward": 0.8333333730697632,
"step": 467
},
{
"completion_length": 2586.1458740234375,
"epoch": 0.5348571428571428,
"grad_norm": 0.6298258304595947,
"kl": 0.396484375,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0409,
"reward": 1.2849786281585693,
"reward_std": 0.9066727161407471,
"rewards/cosine_scaled_reward": 0.2570726328995079,
"rewards/format_reward": 0.770833358168602,
"step": 468
},
{
"completion_length": 2279.604248046875,
"epoch": 0.536,
"grad_norm": 0.42815151810646057,
"kl": 0.2633056640625,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.015,
"reward": 0.784978911280632,
"reward_std": 0.6496678665280342,
"rewards/cosine_scaled_reward": -0.04501055763103068,
"rewards/format_reward": 0.8750000149011612,
"step": 469
},
{
"completion_length": 2119.416748046875,
"epoch": 0.5371428571428571,
"grad_norm": 1.2341870069503784,
"kl": 0.427001953125,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.054,
"reward": 0.6538757495582104,
"reward_std": 0.8121753484010696,
"rewards/cosine_scaled_reward": -0.037645455449819565,
"rewards/format_reward": 0.7291666865348816,
"step": 470
},
{
"completion_length": 2927.5834350585938,
"epoch": 0.5382857142857143,
"grad_norm": 0.632990300655365,
"kl": 0.61474609375,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0463,
"reward": 0.5753965899348259,
"reward_std": 0.8329771310091019,
"rewards/cosine_scaled_reward": -0.1081350538879633,
"rewards/format_reward": 0.7916666865348816,
"step": 471
},
{
"completion_length": 2382.7500610351562,
"epoch": 0.5394285714285715,
"grad_norm": 0.4871074855327606,
"kl": 0.42333984375,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0518,
"reward": 1.0515232384204865,
"reward_std": 0.8982365727424622,
"rewards/cosine_scaled_reward": 0.16117826476693153,
"rewards/format_reward": 0.7291666865348816,
"step": 472
},
{
"completion_length": 2928.9584350585938,
"epoch": 0.5405714285714286,
"grad_norm": 1.3636996746063232,
"kl": 0.498779296875,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.03,
"reward": 0.32807744294404984,
"reward_std": 0.5049104988574982,
"rewards/cosine_scaled_reward": -0.21096128597855568,
"rewards/format_reward": 0.7500000149011612,
"step": 473
},
{
"completion_length": 2577.0625610351562,
"epoch": 0.5417142857142857,
"grad_norm": 1.3398447036743164,
"kl": 0.351806640625,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0931,
"reward": 0.9431183338165283,
"reward_std": 0.893795982003212,
"rewards/cosine_scaled_reward": 0.08614248159574345,
"rewards/format_reward": 0.7708333432674408,
"step": 474
},
{
"completion_length": 2737.3751220703125,
"epoch": 0.5428571428571428,
"grad_norm": 1.3732081651687622,
"kl": 0.3955078125,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0893,
"reward": 0.8586708009243011,
"reward_std": 0.8809327185153961,
"rewards/cosine_scaled_reward": 0.06475206837058067,
"rewards/format_reward": 0.7291666865348816,
"step": 475
},
{
"completion_length": 2780.3125610351562,
"epoch": 0.544,
"grad_norm": 1.55986750125885,
"kl": 0.4127197265625,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0047,
"reward": 0.8244488090276718,
"reward_std": 0.7860056459903717,
"rewards/cosine_scaled_reward": 0.05805772356688976,
"rewards/format_reward": 0.7083333432674408,
"step": 476
},
{
"completion_length": 2252.229248046875,
"epoch": 0.5451428571428572,
"grad_norm": 0.784569263458252,
"kl": 0.378082275390625,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0354,
"reward": 1.200981080532074,
"reward_std": 0.7509779334068298,
"rewards/cosine_scaled_reward": 0.1734071932733059,
"rewards/format_reward": 0.8541666865348816,
"step": 477
},
{
"completion_length": 2425.9583740234375,
"epoch": 0.5462857142857143,
"grad_norm": 0.4835829436779022,
"kl": 0.4466552734375,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0496,
"reward": 0.7616169229149818,
"reward_std": 0.6851886659860611,
"rewards/cosine_scaled_reward": -0.035858187824487686,
"rewards/format_reward": 0.8333333432674408,
"step": 478
},
{
"completion_length": 2255.3750610351562,
"epoch": 0.5474285714285714,
"grad_norm": 0.9519103765487671,
"kl": 0.386962890625,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0216,
"reward": 0.9349322374910116,
"reward_std": 0.613688588142395,
"rewards/cosine_scaled_reward": 0.08204942103475332,
"rewards/format_reward": 0.770833358168602,
"step": 479
},
{
"completion_length": 2591.8959350585938,
"epoch": 0.5485714285714286,
"grad_norm": 0.619563102722168,
"kl": 0.52685546875,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.042,
"reward": 0.7943236902356148,
"reward_std": 1.037893146276474,
"rewards/cosine_scaled_reward": 0.06382851302623749,
"rewards/format_reward": 0.6666666865348816,
"step": 480
},
{
"completion_length": 2677.4791870117188,
"epoch": 0.5497142857142857,
"grad_norm": 0.45002222061157227,
"kl": 0.56689453125,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0405,
"reward": 0.5671083256602287,
"reward_std": 0.708008423447609,
"rewards/cosine_scaled_reward": -0.10186250880360603,
"rewards/format_reward": 0.7708333432674408,
"step": 481
},
{
"completion_length": 2174.2291870117188,
"epoch": 0.5508571428571428,
"grad_norm": 0.3016662299633026,
"kl": 0.340087890625,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0176,
"reward": 0.9943665787577629,
"reward_std": 0.5935569703578949,
"rewards/cosine_scaled_reward": 0.04926658235490322,
"rewards/format_reward": 0.8958333432674408,
"step": 482
},
{
"completion_length": 2582.6459350585938,
"epoch": 0.552,
"grad_norm": 0.7917870879173279,
"kl": 0.466064453125,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0279,
"reward": 0.6264216639101505,
"reward_std": 0.9700927287340164,
"rewards/cosine_scaled_reward": -0.04095582733862102,
"rewards/format_reward": 0.708333358168602,
"step": 483
},
{
"completion_length": 2702.2708740234375,
"epoch": 0.5531428571428572,
"grad_norm": 0.5935311317443848,
"kl": 0.388427734375,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.0319,
"reward": 0.9062394499778748,
"reward_std": 0.7218269556760788,
"rewards/cosine_scaled_reward": 0.06770304590463638,
"rewards/format_reward": 0.770833358168602,
"step": 484
},
{
"completion_length": 2429.1458435058594,
"epoch": 0.5542857142857143,
"grad_norm": 0.7909466028213501,
"kl": 0.4248046875,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0119,
"reward": 0.7019704282283783,
"reward_std": 0.6897935420274734,
"rewards/cosine_scaled_reward": -0.0448481235653162,
"rewards/format_reward": 0.7916666865348816,
"step": 485
},
{
"completion_length": 2677.291748046875,
"epoch": 0.5554285714285714,
"grad_norm": 1.1475855112075806,
"kl": 0.327880859375,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0051,
"reward": 1.1072902642190456,
"reward_std": 0.7692115753889084,
"rewards/cosine_scaled_reward": 0.14739511162042618,
"rewards/format_reward": 0.8125000149011612,
"step": 486
},
{
"completion_length": 2795.8750610351562,
"epoch": 0.5565714285714286,
"grad_norm": 0.5653597116470337,
"kl": 0.3798828125,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.0451,
"reward": 0.7866236716508865,
"reward_std": 0.6821945160627365,
"rewards/cosine_scaled_reward": -0.02335483953356743,
"rewards/format_reward": 0.833333358168602,
"step": 487
},
{
"completion_length": 2895.3751220703125,
"epoch": 0.5577142857142857,
"grad_norm": 0.4974069893360138,
"kl": 0.4326171875,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.062,
"reward": 0.5221007950603962,
"reward_std": 0.8605436235666275,
"rewards/cosine_scaled_reward": -0.1035329382866621,
"rewards/format_reward": 0.7291667014360428,
"step": 488
},
{
"completion_length": 2012.541748046875,
"epoch": 0.5588571428571428,
"grad_norm": 0.5164794921875,
"kl": 0.254058837890625,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0022,
"reward": 1.416559837758541,
"reward_std": 0.6288183927536011,
"rewards/cosine_scaled_reward": 0.2707799021154642,
"rewards/format_reward": 0.8750000149011612,
"step": 489
},
{
"completion_length": 2575.6668090820312,
"epoch": 0.56,
"grad_norm": 0.8971602916717529,
"kl": 0.3701171875,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0111,
"reward": 0.5933700278401375,
"reward_std": 0.6079118028283119,
"rewards/cosine_scaled_reward": -0.1408149916678667,
"rewards/format_reward": 0.8750000149011612,
"step": 490
},
{
"completion_length": 2626.854217529297,
"epoch": 0.5611428571428572,
"grad_norm": 0.7071827054023743,
"kl": 0.3095703125,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.043,
"reward": 0.9613501131534576,
"reward_std": 0.8130423650145531,
"rewards/cosine_scaled_reward": 0.07442504540085793,
"rewards/format_reward": 0.8125000149011612,
"step": 491
},
{
"completion_length": 2436.7291870117188,
"epoch": 0.5622857142857143,
"grad_norm": 0.44464409351348877,
"kl": 0.28173828125,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.0244,
"reward": 0.7668804228305817,
"reward_std": 0.6314697042107582,
"rewards/cosine_scaled_reward": -0.07489313930273056,
"rewards/format_reward": 0.9166666865348816,
"step": 492
},
{
"completion_length": 2737.0209350585938,
"epoch": 0.5634285714285714,
"grad_norm": 0.5461977124214172,
"kl": 0.404296875,
"learning_rate": 1.005372381963547e-07,
"loss": 0.038,
"reward": 0.5373080670833588,
"reward_std": 0.7348825931549072,
"rewards/cosine_scaled_reward": -0.11676262941909954,
"rewards/format_reward": 0.770833358168602,
"step": 493
},
{
"completion_length": 2169.5416870117188,
"epoch": 0.5645714285714286,
"grad_norm": 0.2975417971611023,
"kl": 0.2210693359375,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0104,
"reward": 0.6467055715620518,
"reward_std": 0.6691789701581001,
"rewards/cosine_scaled_reward": -0.10373054444789886,
"rewards/format_reward": 0.8541666716337204,
"step": 494
},
{
"completion_length": 2759.9793090820312,
"epoch": 0.5657142857142857,
"grad_norm": 0.7536102533340454,
"kl": 0.2822265625,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0527,
"reward": 1.0850744023919106,
"reward_std": 0.9734541922807693,
"rewards/cosine_scaled_reward": 0.13628720492124557,
"rewards/format_reward": 0.8125000149011612,
"step": 495
},
{
"completion_length": 2828.6459350585938,
"epoch": 0.5668571428571428,
"grad_norm": 0.7388039231300354,
"kl": 0.3896484375,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0107,
"reward": 0.9930586367845535,
"reward_std": 0.9435475766658783,
"rewards/cosine_scaled_reward": 0.1215293172863312,
"rewards/format_reward": 0.7500000298023224,
"step": 496
},
{
"completion_length": 2202.437530517578,
"epoch": 0.568,
"grad_norm": 0.4381030201911926,
"kl": 0.25762939453125,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0376,
"reward": 1.1173406671732664,
"reward_std": 0.5638850405812263,
"rewards/cosine_scaled_reward": 0.1836703196167946,
"rewards/format_reward": 0.7500000149011612,
"step": 497
},
{
"completion_length": 2922.4376220703125,
"epoch": 0.5691428571428572,
"grad_norm": 0.3199293315410614,
"kl": 0.4296875,
"learning_rate": 1.000438641958131e-07,
"loss": 0.0655,
"reward": 0.23180836997926235,
"reward_std": 0.6018998995423317,
"rewards/cosine_scaled_reward": -0.23826248571276665,
"rewards/format_reward": 0.708333358168602,
"step": 498
},
{
"completion_length": 2946.0626220703125,
"epoch": 0.5702857142857143,
"grad_norm": 0.9604411125183105,
"kl": 0.41943359375,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.068,
"reward": 0.4334963224828243,
"reward_std": 0.9516143649816513,
"rewards/cosine_scaled_reward": -0.1270018396899104,
"rewards/format_reward": 0.6875000149011612,
"step": 499
},
{
"completion_length": 2515.8958740234375,
"epoch": 0.5714285714285714,
"grad_norm": 1.0595104694366455,
"kl": 0.287109375,
"learning_rate": 1e-07,
"loss": 0.0511,
"reward": 0.935544490814209,
"reward_std": 1.0099718570709229,
"rewards/cosine_scaled_reward": 0.07193891797214746,
"rewards/format_reward": 0.7916666865348816,
"step": 500
},
{
"epoch": 0.5714285714285714,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.008432806108146906,
"train_runtime": 8817.9865,
"train_samples_per_second": 2.722,
"train_steps_per_second": 0.057
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}