kadid_train / trainer_state.json
Wu23333's picture
Upload folder using huggingface_hub
7320445 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.5625,
"eval_steps": 500,
"global_step": 450,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 108.36198425292969,
"epoch": 0.020833333333333332,
"grad_norm": 1.639632839083605,
"kl": 0.0,
"learning_rate": 9.979166666666667e-07,
"loss": 0.0,
"reward": 1.7404149770736694,
"reward_std": 0.0779535174369812,
"rewards/accuracy_reward": 0.7482273578643799,
"rewards/format_reward": 0.9921875,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 111.453125,
"epoch": 0.041666666666666664,
"grad_norm": 1.6143054455733772,
"kl": 0.000759124755859375,
"learning_rate": 9.958333333333333e-07,
"loss": 0.0001,
"reward": 1.7971906661987305,
"reward_std": 0.07512722909450531,
"rewards/accuracy_reward": 0.8076074123382568,
"rewards/format_reward": 0.9895833730697632,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 111.90885925292969,
"epoch": 0.0625,
"grad_norm": 2.4515981685409955,
"kl": 0.000972747802734375,
"learning_rate": 9.9375e-07,
"loss": 0.0001,
"reward": 1.7853158712387085,
"reward_std": 0.056390173733234406,
"rewards/accuracy_reward": 0.7866179347038269,
"rewards/format_reward": 0.9986979365348816,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 111.05989837646484,
"epoch": 0.08333333333333333,
"grad_norm": 1.6201269048159341,
"kl": 0.00115203857421875,
"learning_rate": 9.916666666666666e-07,
"loss": 0.0001,
"reward": 1.7735958099365234,
"reward_std": 0.0706130862236023,
"rewards/accuracy_reward": 0.7761998176574707,
"rewards/format_reward": 0.9973958730697632,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 111.74089050292969,
"epoch": 0.10416666666666667,
"grad_norm": 1.5772248618980282,
"kl": 0.0014801025390625,
"learning_rate": 9.895833333333333e-07,
"loss": 0.0001,
"reward": 1.775779128074646,
"reward_std": 0.0632912740111351,
"rewards/accuracy_reward": 0.7783832550048828,
"rewards/format_reward": 0.9973958730697632,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 113.27214050292969,
"epoch": 0.125,
"grad_norm": 1.6430473395229306,
"kl": 0.0023345947265625,
"learning_rate": 9.875e-07,
"loss": 0.0001,
"reward": 1.7499568462371826,
"reward_std": 0.06891956180334091,
"rewards/accuracy_reward": 0.7525607943534851,
"rewards/format_reward": 0.9973958730697632,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 113.0390625,
"epoch": 0.14583333333333334,
"grad_norm": 2.1082448590555587,
"kl": 0.0029754638671875,
"learning_rate": 9.854166666666666e-07,
"loss": 0.0002,
"reward": 1.7655704021453857,
"reward_std": 0.06707193702459335,
"rewards/accuracy_reward": 0.7707786560058594,
"rewards/format_reward": 0.9947916865348816,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 111.79948425292969,
"epoch": 0.16666666666666666,
"grad_norm": 1.2774316678996838,
"kl": 0.003570556640625,
"learning_rate": 9.833333333333332e-07,
"loss": 0.0002,
"reward": 1.7789148092269897,
"reward_std": 0.06289087980985641,
"rewards/accuracy_reward": 0.7828210592269897,
"rewards/format_reward": 0.99609375,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 111.25390625,
"epoch": 0.1875,
"grad_norm": 2.9892643389862297,
"kl": 0.00396728515625,
"learning_rate": 9.8125e-07,
"loss": 0.0002,
"reward": 1.7818154096603394,
"reward_std": 0.06365714222192764,
"rewards/accuracy_reward": 0.7844195365905762,
"rewards/format_reward": 0.9973958730697632,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 111.72005462646484,
"epoch": 0.20833333333333334,
"grad_norm": 2.067866094157864,
"kl": 0.00555419921875,
"learning_rate": 9.791666666666667e-07,
"loss": 0.0003,
"reward": 1.768758773803711,
"reward_std": 0.061586372554302216,
"rewards/accuracy_reward": 0.7713630199432373,
"rewards/format_reward": 0.9973958730697632,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 111.7109375,
"epoch": 0.22916666666666666,
"grad_norm": 1.9143306648652931,
"kl": 0.007568359375,
"learning_rate": 9.770833333333332e-07,
"loss": 0.0004,
"reward": 1.7840107679367065,
"reward_std": 0.06136108189821243,
"rewards/accuracy_reward": 0.7866148948669434,
"rewards/format_reward": 0.9973958730697632,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 109.51171875,
"epoch": 0.25,
"grad_norm": 1.3340893635394109,
"kl": 0.0089111328125,
"learning_rate": 9.75e-07,
"loss": 0.0005,
"reward": 1.7980231046676636,
"reward_std": 0.05285460874438286,
"rewards/accuracy_reward": 0.799325168132782,
"rewards/format_reward": 0.9986979365348816,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 109.94921875,
"epoch": 0.2708333333333333,
"grad_norm": 1.5039333862682631,
"kl": 0.01055908203125,
"learning_rate": 9.729166666666665e-07,
"loss": 0.0005,
"reward": 1.781097412109375,
"reward_std": 0.054250504821538925,
"rewards/accuracy_reward": 0.7810973525047302,
"rewards/format_reward": 1.0,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 108.20442962646484,
"epoch": 0.2916666666666667,
"grad_norm": 1.320078780832244,
"kl": 0.01153564453125,
"learning_rate": 9.708333333333333e-07,
"loss": 0.0006,
"reward": 1.7951558828353882,
"reward_std": 0.05504516512155533,
"rewards/accuracy_reward": 0.7964579463005066,
"rewards/format_reward": 0.9986979365348816,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 107.59635925292969,
"epoch": 0.3125,
"grad_norm": 1.2979058457846995,
"kl": 0.0120849609375,
"learning_rate": 9.6875e-07,
"loss": 0.0006,
"reward": 1.7878097295761108,
"reward_std": 0.05526263639330864,
"rewards/accuracy_reward": 0.7891117930412292,
"rewards/format_reward": 0.9986979365348816,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 107.14974212646484,
"epoch": 0.3333333333333333,
"grad_norm": 1.326162224402637,
"kl": 0.01275634765625,
"learning_rate": 9.666666666666666e-07,
"loss": 0.0006,
"reward": 1.7894906997680664,
"reward_std": 0.04853988438844681,
"rewards/accuracy_reward": 0.7907928824424744,
"rewards/format_reward": 0.9986979365348816,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 106.43489837646484,
"epoch": 0.3541666666666667,
"grad_norm": 2.7897636177227594,
"kl": 0.01263427734375,
"learning_rate": 9.645833333333333e-07,
"loss": 0.0005,
"reward": 1.8176600933074951,
"reward_std": 0.04950461909174919,
"rewards/accuracy_reward": 0.8176599740982056,
"rewards/format_reward": 1.0,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 107.42708587646484,
"epoch": 0.375,
"grad_norm": 1.1152706578469112,
"kl": 0.01409912109375,
"learning_rate": 9.624999999999999e-07,
"loss": 0.0006,
"reward": 1.7780466079711914,
"reward_std": 0.0461871400475502,
"rewards/accuracy_reward": 0.7780466079711914,
"rewards/format_reward": 1.0,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 106.89583587646484,
"epoch": 0.3958333333333333,
"grad_norm": 5.136433449809359,
"kl": 0.01300048828125,
"learning_rate": 9.604166666666666e-07,
"loss": 0.0006,
"reward": 1.7988059520721436,
"reward_std": 0.04341081529855728,
"rewards/accuracy_reward": 0.7988060712814331,
"rewards/format_reward": 1.0,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 105.54296875,
"epoch": 0.4166666666666667,
"grad_norm": 1.285848750333943,
"kl": 0.01422119140625,
"learning_rate": 9.583333333333334e-07,
"loss": 0.0007,
"reward": 1.795292854309082,
"reward_std": 0.045156918466091156,
"rewards/accuracy_reward": 0.7952930331230164,
"rewards/format_reward": 1.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 103.421875,
"epoch": 0.4375,
"grad_norm": 1.309348147777355,
"kl": 0.0162353515625,
"learning_rate": 9.5625e-07,
"loss": 0.0007,
"reward": 1.793769121170044,
"reward_std": 0.050124406814575195,
"rewards/accuracy_reward": 0.7963732481002808,
"rewards/format_reward": 0.9973958730697632,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 103.89192962646484,
"epoch": 0.4583333333333333,
"grad_norm": 1.2889966905867243,
"kl": 0.017578125,
"learning_rate": 9.541666666666667e-07,
"loss": 0.0007,
"reward": 1.8341398239135742,
"reward_std": 0.04439392685890198,
"rewards/accuracy_reward": 0.8341398239135742,
"rewards/format_reward": 1.0,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 104.41796875,
"epoch": 0.4791666666666667,
"grad_norm": 1.4675439511923518,
"kl": 0.0167236328125,
"learning_rate": 9.520833333333333e-07,
"loss": 0.0007,
"reward": 1.836176872253418,
"reward_std": 0.046172261238098145,
"rewards/accuracy_reward": 0.836176872253418,
"rewards/format_reward": 1.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 104.67448425292969,
"epoch": 0.5,
"grad_norm": 2.1035799373033917,
"kl": 0.0203857421875,
"learning_rate": 9.499999999999999e-07,
"loss": 0.0008,
"reward": 1.8129713535308838,
"reward_std": 0.04518614709377289,
"rewards/accuracy_reward": 0.812971293926239,
"rewards/format_reward": 1.0,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 104.63151550292969,
"epoch": 0.5208333333333334,
"grad_norm": 1.4208098502006166,
"kl": 0.0186767578125,
"learning_rate": 9.479166666666666e-07,
"loss": 0.0008,
"reward": 1.8406684398651123,
"reward_std": 0.041067786514759064,
"rewards/accuracy_reward": 0.8406683802604675,
"rewards/format_reward": 1.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 107.16927337646484,
"epoch": 0.5416666666666666,
"grad_norm": 1.7838214183915748,
"kl": 0.0203857421875,
"learning_rate": 9.458333333333333e-07,
"loss": 0.0008,
"reward": 1.803572177886963,
"reward_std": 0.04902785271406174,
"rewards/accuracy_reward": 0.8035721778869629,
"rewards/format_reward": 1.0,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 108.23046875,
"epoch": 0.5625,
"grad_norm": 1.2204814881350547,
"kl": 0.0211181640625,
"learning_rate": 9.4375e-07,
"loss": 0.0009,
"reward": 1.8056421279907227,
"reward_std": 0.04466244578361511,
"rewards/accuracy_reward": 0.8056421279907227,
"rewards/format_reward": 1.0,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 107.625,
"epoch": 0.5833333333333334,
"grad_norm": 1.802147920265982,
"kl": 0.0218505859375,
"learning_rate": 9.416666666666666e-07,
"loss": 0.0009,
"reward": 1.828833818435669,
"reward_std": 0.042848870158195496,
"rewards/accuracy_reward": 0.8288335800170898,
"rewards/format_reward": 1.0,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 109.18099212646484,
"epoch": 0.6041666666666666,
"grad_norm": 2.372355341881645,
"kl": 0.021240234375,
"learning_rate": 9.395833333333333e-07,
"loss": 0.0009,
"reward": 1.8150265216827393,
"reward_std": 0.04657554626464844,
"rewards/accuracy_reward": 0.8150264620780945,
"rewards/format_reward": 1.0,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 113.60807800292969,
"epoch": 0.625,
"grad_norm": 1.6452523523321965,
"kl": 0.022216796875,
"learning_rate": 9.374999999999999e-07,
"loss": 0.0009,
"reward": 1.8578990697860718,
"reward_std": 0.039906859397888184,
"rewards/accuracy_reward": 0.8578989505767822,
"rewards/format_reward": 1.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 114.49349212646484,
"epoch": 0.6458333333333334,
"grad_norm": 1.9283111710202978,
"kl": 0.022705078125,
"learning_rate": 9.354166666666667e-07,
"loss": 0.001,
"reward": 1.7941240072250366,
"reward_std": 0.046397458761930466,
"rewards/accuracy_reward": 0.7941240072250366,
"rewards/format_reward": 1.0,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 117.61589050292969,
"epoch": 0.6666666666666666,
"grad_norm": 1.8161381749377306,
"kl": 0.02490234375,
"learning_rate": 9.333333333333333e-07,
"loss": 0.001,
"reward": 1.807027816772461,
"reward_std": 0.047066252678632736,
"rewards/accuracy_reward": 0.8070278167724609,
"rewards/format_reward": 1.0,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 114.55599212646484,
"epoch": 0.6875,
"grad_norm": 1.4095225702209093,
"kl": 0.028076171875,
"learning_rate": 9.3125e-07,
"loss": 0.0011,
"reward": 1.816405177116394,
"reward_std": 0.0405634380877018,
"rewards/accuracy_reward": 0.816405177116394,
"rewards/format_reward": 1.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 115.86589050292969,
"epoch": 0.7083333333333334,
"grad_norm": 1.9475471383587142,
"kl": 0.0264892578125,
"learning_rate": 9.291666666666666e-07,
"loss": 0.0011,
"reward": 1.8178232908248901,
"reward_std": 0.048409104347229004,
"rewards/accuracy_reward": 0.8191253542900085,
"rewards/format_reward": 0.9986979365348816,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 114.81771087646484,
"epoch": 0.7291666666666666,
"grad_norm": 1.816929377906017,
"kl": 0.0250244140625,
"learning_rate": 9.270833333333333e-07,
"loss": 0.001,
"reward": 1.8112201690673828,
"reward_std": 0.04231969267129898,
"rewards/accuracy_reward": 0.811220109462738,
"rewards/format_reward": 1.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 115.69271087646484,
"epoch": 0.75,
"grad_norm": 1.9245265212018168,
"kl": 0.0260009765625,
"learning_rate": 9.25e-07,
"loss": 0.0011,
"reward": 1.800992727279663,
"reward_std": 0.04579651355743408,
"rewards/accuracy_reward": 0.8022947311401367,
"rewards/format_reward": 0.9986979365348816,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 115.50911712646484,
"epoch": 0.7708333333333334,
"grad_norm": 2.3374928077264565,
"kl": 0.0274658203125,
"learning_rate": 9.229166666666667e-07,
"loss": 0.0011,
"reward": 1.8183115720748901,
"reward_std": 0.047783225774765015,
"rewards/accuracy_reward": 0.820915699005127,
"rewards/format_reward": 0.9973958730697632,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 115.3515625,
"epoch": 0.7916666666666666,
"grad_norm": 1.959104932697817,
"kl": 0.0269775390625,
"learning_rate": 9.208333333333332e-07,
"loss": 0.0011,
"reward": 1.7997591495513916,
"reward_std": 0.04710128903388977,
"rewards/accuracy_reward": 0.8010611534118652,
"rewards/format_reward": 0.9986979365348816,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 115.93489837646484,
"epoch": 0.8125,
"grad_norm": 5.405686552072785,
"kl": 0.023193359375,
"learning_rate": 9.187499999999999e-07,
"loss": 0.0009,
"reward": 1.8103388547897339,
"reward_std": 0.048775218427181244,
"rewards/accuracy_reward": 0.8129429817199707,
"rewards/format_reward": 0.9973958730697632,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 112.42578125,
"epoch": 0.8333333333333334,
"grad_norm": 2.5109857220624874,
"kl": 0.02587890625,
"learning_rate": 9.166666666666665e-07,
"loss": 0.001,
"reward": 1.8131688833236694,
"reward_std": 0.045695092529058456,
"rewards/accuracy_reward": 0.8157729506492615,
"rewards/format_reward": 0.9973958730697632,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 114.1015625,
"epoch": 0.8541666666666666,
"grad_norm": 1.3500957332806702,
"kl": 0.0267333984375,
"learning_rate": 9.145833333333333e-07,
"loss": 0.0011,
"reward": 1.8379626274108887,
"reward_std": 0.03906077891588211,
"rewards/accuracy_reward": 0.8392646312713623,
"rewards/format_reward": 0.9986979365348816,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 114.87109375,
"epoch": 0.875,
"grad_norm": 1.9553905487315055,
"kl": 0.02734375,
"learning_rate": 9.124999999999999e-07,
"loss": 0.0011,
"reward": 1.8075612783432007,
"reward_std": 0.046475451439619064,
"rewards/accuracy_reward": 0.8075612783432007,
"rewards/format_reward": 1.0,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 110.93359375,
"epoch": 0.8958333333333334,
"grad_norm": 2.687075325377198,
"kl": 0.0262451171875,
"learning_rate": 9.104166666666666e-07,
"loss": 0.0011,
"reward": 1.8184566497802734,
"reward_std": 0.04643276333808899,
"rewards/accuracy_reward": 0.8197587728500366,
"rewards/format_reward": 0.9986979365348816,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 109.43489837646484,
"epoch": 0.9166666666666666,
"grad_norm": 1.2273175474621774,
"kl": 0.026611328125,
"learning_rate": 9.083333333333332e-07,
"loss": 0.0011,
"reward": 1.7925729751586914,
"reward_std": 0.04880434274673462,
"rewards/accuracy_reward": 0.7938751578330994,
"rewards/format_reward": 0.9986979365348816,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 110.70052337646484,
"epoch": 0.9375,
"grad_norm": 3.4979750979513184,
"kl": 0.0242919921875,
"learning_rate": 9.0625e-07,
"loss": 0.001,
"reward": 1.8129024505615234,
"reward_std": 0.04281633347272873,
"rewards/accuracy_reward": 0.8142046332359314,
"rewards/format_reward": 0.9986979365348816,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 109.12239837646484,
"epoch": 0.9583333333333334,
"grad_norm": 1.581390275171433,
"kl": 0.024658203125,
"learning_rate": 9.041666666666667e-07,
"loss": 0.001,
"reward": 1.8339308500289917,
"reward_std": 0.0428236648440361,
"rewards/accuracy_reward": 0.8339308500289917,
"rewards/format_reward": 1.0,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 109.57410430908203,
"epoch": 0.9791666666666666,
"grad_norm": 1.2859639310341215,
"kl": 0.031982421875,
"learning_rate": 9.020833333333333e-07,
"loss": 0.0013,
"reward": 1.8121892213821411,
"reward_std": 0.043297141790390015,
"rewards/accuracy_reward": 0.8135243654251099,
"rewards/format_reward": 0.998664915561676,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 105.44271087646484,
"epoch": 1.0208333333333333,
"grad_norm": 2.3284880809656916,
"kl": 0.0240478515625,
"learning_rate": 9e-07,
"loss": 0.001,
"reward": 1.842546820640564,
"reward_std": 0.03681856021285057,
"rewards/accuracy_reward": 0.842546820640564,
"rewards/format_reward": 1.0,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 107.63932800292969,
"epoch": 1.0416666666666667,
"grad_norm": 11.066834298993447,
"kl": 0.0245361328125,
"learning_rate": 8.979166666666666e-07,
"loss": 0.001,
"reward": 1.810459852218628,
"reward_std": 0.04241730272769928,
"rewards/accuracy_reward": 0.8117618560791016,
"rewards/format_reward": 0.9986979365348816,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 105.625,
"epoch": 1.0625,
"grad_norm": 3.9999069520932107,
"kl": 0.0284423828125,
"learning_rate": 8.958333333333334e-07,
"loss": 0.0011,
"reward": 1.8403337001800537,
"reward_std": 0.035806819796562195,
"rewards/accuracy_reward": 0.8403337001800537,
"rewards/format_reward": 1.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 105.18620300292969,
"epoch": 1.0833333333333333,
"grad_norm": 1.9543615318552672,
"kl": 0.0263671875,
"learning_rate": 8.9375e-07,
"loss": 0.0011,
"reward": 1.8314783573150635,
"reward_std": 0.03615511581301689,
"rewards/accuracy_reward": 0.8314781188964844,
"rewards/format_reward": 1.0,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 103.74349212646484,
"epoch": 1.1041666666666667,
"grad_norm": 1.7271064117720305,
"kl": 0.0284423828125,
"learning_rate": 8.916666666666667e-07,
"loss": 0.0012,
"reward": 1.8100008964538574,
"reward_std": 0.04172190651297569,
"rewards/accuracy_reward": 0.810001015663147,
"rewards/format_reward": 1.0,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 103.9609375,
"epoch": 1.125,
"grad_norm": 3.0099066254538225,
"kl": 0.0263671875,
"learning_rate": 8.895833333333332e-07,
"loss": 0.0011,
"reward": 1.8225668668746948,
"reward_std": 0.03787129372358322,
"rewards/accuracy_reward": 0.8225669860839844,
"rewards/format_reward": 1.0,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 103.16667175292969,
"epoch": 1.1458333333333333,
"grad_norm": 2.1966545517434746,
"kl": 0.02783203125,
"learning_rate": 8.874999999999999e-07,
"loss": 0.0012,
"reward": 1.825528860092163,
"reward_std": 0.0393807552754879,
"rewards/accuracy_reward": 0.8255288004875183,
"rewards/format_reward": 1.0,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 102.93880462646484,
"epoch": 1.1666666666666667,
"grad_norm": 2.2377689992101195,
"kl": 0.0289306640625,
"learning_rate": 8.854166666666666e-07,
"loss": 0.0012,
"reward": 1.84073007106781,
"reward_std": 0.036864347755908966,
"rewards/accuracy_reward": 0.8407299518585205,
"rewards/format_reward": 1.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 104.11198425292969,
"epoch": 1.1875,
"grad_norm": 1.4566342977805207,
"kl": 0.0250244140625,
"learning_rate": 8.833333333333333e-07,
"loss": 0.001,
"reward": 1.8056623935699463,
"reward_std": 0.04149676859378815,
"rewards/accuracy_reward": 0.8056623339653015,
"rewards/format_reward": 1.0,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 102.09765625,
"epoch": 1.2083333333333333,
"grad_norm": 6.235839920646965,
"kl": 0.0255126953125,
"learning_rate": 8.812499999999999e-07,
"loss": 0.0011,
"reward": 1.8528995513916016,
"reward_std": 0.033966001123189926,
"rewards/accuracy_reward": 0.8528995513916016,
"rewards/format_reward": 1.0,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 101.87630462646484,
"epoch": 1.2291666666666667,
"grad_norm": 1.5034574110083547,
"kl": 0.02978515625,
"learning_rate": 8.791666666666666e-07,
"loss": 0.0012,
"reward": 1.835959792137146,
"reward_std": 0.02824896201491356,
"rewards/accuracy_reward": 0.8359596729278564,
"rewards/format_reward": 1.0,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 104.05599212646484,
"epoch": 1.25,
"grad_norm": 1.6494502522344965,
"kl": 0.0301513671875,
"learning_rate": 8.770833333333333e-07,
"loss": 0.0013,
"reward": 1.8419498205184937,
"reward_std": 0.03271109610795975,
"rewards/accuracy_reward": 0.8419498205184937,
"rewards/format_reward": 1.0,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 103.515625,
"epoch": 1.2708333333333333,
"grad_norm": 2.5168432356014954,
"kl": 0.02587890625,
"learning_rate": 8.75e-07,
"loss": 0.0011,
"reward": 1.8278319835662842,
"reward_std": 0.04185899719595909,
"rewards/accuracy_reward": 0.8278318643569946,
"rewards/format_reward": 1.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 105.421875,
"epoch": 1.2916666666666667,
"grad_norm": 1.3151086988925837,
"kl": 0.0242919921875,
"learning_rate": 8.729166666666666e-07,
"loss": 0.001,
"reward": 1.8187057971954346,
"reward_std": 0.03452453017234802,
"rewards/accuracy_reward": 0.8187057375907898,
"rewards/format_reward": 1.0,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 108.04948425292969,
"epoch": 1.3125,
"grad_norm": 2.810138538550519,
"kl": 0.02783203125,
"learning_rate": 8.708333333333333e-07,
"loss": 0.0012,
"reward": 1.846640944480896,
"reward_std": 0.04408061131834984,
"rewards/accuracy_reward": 0.8492451906204224,
"rewards/format_reward": 0.9973958730697632,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 107.23698425292969,
"epoch": 1.3333333333333333,
"grad_norm": 1.563896261562582,
"kl": 0.03076171875,
"learning_rate": 8.687499999999999e-07,
"loss": 0.0013,
"reward": 1.8380591869354248,
"reward_std": 0.03534460812807083,
"rewards/accuracy_reward": 0.8380589485168457,
"rewards/format_reward": 1.0,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 107.85286712646484,
"epoch": 1.3541666666666667,
"grad_norm": 3.020754504226419,
"kl": 0.0255126953125,
"learning_rate": 8.666666666666667e-07,
"loss": 0.0011,
"reward": 1.8550291061401367,
"reward_std": 0.037260740995407104,
"rewards/accuracy_reward": 0.8563313484191895,
"rewards/format_reward": 0.9986979365348816,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 109.10026550292969,
"epoch": 1.375,
"grad_norm": 3.306677048985413,
"kl": 0.026123046875,
"learning_rate": 8.645833333333333e-07,
"loss": 0.0011,
"reward": 1.8508269786834717,
"reward_std": 0.0380985364317894,
"rewards/accuracy_reward": 0.8508269190788269,
"rewards/format_reward": 1.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 107.24870300292969,
"epoch": 1.3958333333333333,
"grad_norm": 1.6070004388732007,
"kl": 0.0263671875,
"learning_rate": 8.625e-07,
"loss": 0.0011,
"reward": 1.857025146484375,
"reward_std": 0.03701246529817581,
"rewards/accuracy_reward": 0.857025146484375,
"rewards/format_reward": 1.0,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 110.89323425292969,
"epoch": 1.4166666666666667,
"grad_norm": 2.3548695186042186,
"kl": 0.02783203125,
"learning_rate": 8.604166666666667e-07,
"loss": 0.0011,
"reward": 1.8029731512069702,
"reward_std": 0.042077165096998215,
"rewards/accuracy_reward": 0.8055772185325623,
"rewards/format_reward": 0.9973958730697632,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 110.38671875,
"epoch": 1.4375,
"grad_norm": 1.278138762095491,
"kl": 0.0228271484375,
"learning_rate": 8.583333333333332e-07,
"loss": 0.0009,
"reward": 1.8338496685028076,
"reward_std": 0.039083532989025116,
"rewards/accuracy_reward": 0.8338495492935181,
"rewards/format_reward": 1.0,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 110.65495300292969,
"epoch": 1.4583333333333333,
"grad_norm": 2.070944061321321,
"kl": 0.0284423828125,
"learning_rate": 8.5625e-07,
"loss": 0.0012,
"reward": 1.8258914947509766,
"reward_std": 0.03814253211021423,
"rewards/accuracy_reward": 0.8271937370300293,
"rewards/format_reward": 0.9986979365348816,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 112.45442962646484,
"epoch": 1.4791666666666667,
"grad_norm": 1.3471433182606216,
"kl": 0.0281982421875,
"learning_rate": 8.541666666666666e-07,
"loss": 0.0012,
"reward": 1.8398675918579102,
"reward_std": 0.0354573093354702,
"rewards/accuracy_reward": 0.8398677706718445,
"rewards/format_reward": 1.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 111.04427337646484,
"epoch": 1.5,
"grad_norm": 1.6026685707149524,
"kl": 0.02880859375,
"learning_rate": 8.520833333333333e-07,
"loss": 0.0012,
"reward": 1.853353500366211,
"reward_std": 0.03885906934738159,
"rewards/accuracy_reward": 0.8546554446220398,
"rewards/format_reward": 0.9986979365348816,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 110.84635925292969,
"epoch": 1.5208333333333335,
"grad_norm": 2.1398544036186014,
"kl": 0.02783203125,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0012,
"reward": 1.8261979818344116,
"reward_std": 0.03391870856285095,
"rewards/accuracy_reward": 0.8261978626251221,
"rewards/format_reward": 1.0,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 108.69271087646484,
"epoch": 1.5416666666666665,
"grad_norm": 4.020748588139163,
"kl": 0.025634765625,
"learning_rate": 8.479166666666667e-07,
"loss": 0.0011,
"reward": 1.8563817739486694,
"reward_std": 0.031248420476913452,
"rewards/accuracy_reward": 0.8563817739486694,
"rewards/format_reward": 1.0,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 107.48046875,
"epoch": 1.5625,
"grad_norm": 1.3740118540705408,
"kl": 0.0291748046875,
"learning_rate": 8.458333333333333e-07,
"loss": 0.0012,
"reward": 1.8323755264282227,
"reward_std": 0.03359724208712578,
"rewards/accuracy_reward": 0.8323755264282227,
"rewards/format_reward": 1.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 108.44792175292969,
"epoch": 1.5833333333333335,
"grad_norm": 2.2940140494468553,
"kl": 0.026611328125,
"learning_rate": 8.4375e-07,
"loss": 0.0011,
"reward": 1.8279385566711426,
"reward_std": 0.03183002024888992,
"rewards/accuracy_reward": 0.8279385566711426,
"rewards/format_reward": 1.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 108.87890625,
"epoch": 1.6041666666666665,
"grad_norm": 1.9014935653138212,
"kl": 0.0299072265625,
"learning_rate": 8.416666666666666e-07,
"loss": 0.0013,
"reward": 1.8360404968261719,
"reward_std": 0.02742108330130577,
"rewards/accuracy_reward": 0.8360404372215271,
"rewards/format_reward": 1.0,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 108.87239837646484,
"epoch": 1.625,
"grad_norm": 1.7025904896492243,
"kl": 0.02587890625,
"learning_rate": 8.395833333333333e-07,
"loss": 0.0012,
"reward": 1.7988513708114624,
"reward_std": 0.035461340099573135,
"rewards/accuracy_reward": 0.7988513708114624,
"rewards/format_reward": 1.0,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 107.25651550292969,
"epoch": 1.6458333333333335,
"grad_norm": 2.0737766177718564,
"kl": 0.0262451171875,
"learning_rate": 8.375e-07,
"loss": 0.0011,
"reward": 1.8587430715560913,
"reward_std": 0.031236987560987473,
"rewards/accuracy_reward": 0.8600451946258545,
"rewards/format_reward": 0.9986979365348816,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 107.46875,
"epoch": 1.6666666666666665,
"grad_norm": 1.8776151359500948,
"kl": 0.0283203125,
"learning_rate": 8.354166666666667e-07,
"loss": 0.0012,
"reward": 1.8535531759262085,
"reward_std": 0.03250068426132202,
"rewards/accuracy_reward": 0.853553056716919,
"rewards/format_reward": 1.0,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 107.76692962646484,
"epoch": 1.6875,
"grad_norm": 1.4792182225675727,
"kl": 0.0274658203125,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0012,
"reward": 1.8359715938568115,
"reward_std": 0.027499686926603317,
"rewards/accuracy_reward": 0.8359713554382324,
"rewards/format_reward": 1.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 107.09114837646484,
"epoch": 1.7083333333333335,
"grad_norm": 1.429067785258255,
"kl": 0.034423828125,
"learning_rate": 8.3125e-07,
"loss": 0.0015,
"reward": 1.8376563787460327,
"reward_std": 0.03089229390025139,
"rewards/accuracy_reward": 0.8389585018157959,
"rewards/format_reward": 0.9986979365348816,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 106.81901550292969,
"epoch": 1.7291666666666665,
"grad_norm": 1.7947286530222653,
"kl": 0.0269775390625,
"learning_rate": 8.291666666666666e-07,
"loss": 0.0011,
"reward": 1.8310506343841553,
"reward_std": 0.0325641892850399,
"rewards/accuracy_reward": 0.8323527574539185,
"rewards/format_reward": 0.9986979365348816,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 107.32292175292969,
"epoch": 1.75,
"grad_norm": 1.840921164440233,
"kl": 0.028076171875,
"learning_rate": 8.270833333333333e-07,
"loss": 0.0012,
"reward": 1.8312219381332397,
"reward_std": 0.029772888869047165,
"rewards/accuracy_reward": 0.8312219381332397,
"rewards/format_reward": 1.0,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 105.17839050292969,
"epoch": 1.7708333333333335,
"grad_norm": 1.8438240192624837,
"kl": 0.02685546875,
"learning_rate": 8.249999999999999e-07,
"loss": 0.0012,
"reward": 1.8248231410980225,
"reward_std": 0.03228841722011566,
"rewards/accuracy_reward": 0.8248231410980225,
"rewards/format_reward": 1.0,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 108.45703125,
"epoch": 1.7916666666666665,
"grad_norm": 1.529693340603501,
"kl": 0.028076171875,
"learning_rate": 8.229166666666666e-07,
"loss": 0.0012,
"reward": 1.8117046356201172,
"reward_std": 0.03222049027681351,
"rewards/accuracy_reward": 0.8117045760154724,
"rewards/format_reward": 1.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 106.87630462646484,
"epoch": 1.8125,
"grad_norm": 3.9505026027468504,
"kl": 0.031494140625,
"learning_rate": 8.208333333333332e-07,
"loss": 0.0013,
"reward": 1.814018726348877,
"reward_std": 0.031958386301994324,
"rewards/accuracy_reward": 0.814018726348877,
"rewards/format_reward": 1.0,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 105.60807800292969,
"epoch": 1.8333333333333335,
"grad_norm": 9.21528924853295,
"kl": 0.0274658203125,
"learning_rate": 8.187499999999999e-07,
"loss": 0.0012,
"reward": 1.8301608562469482,
"reward_std": 0.02933676168322563,
"rewards/accuracy_reward": 0.8301607370376587,
"rewards/format_reward": 1.0,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 105.74089050292969,
"epoch": 1.8541666666666665,
"grad_norm": 1.329936418308406,
"kl": 0.03173828125,
"learning_rate": 8.166666666666666e-07,
"loss": 0.0013,
"reward": 1.8375142812728882,
"reward_std": 0.03457921743392944,
"rewards/accuracy_reward": 0.8375141024589539,
"rewards/format_reward": 1.0,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 104.92578125,
"epoch": 1.875,
"grad_norm": 1.3218491918211248,
"kl": 0.029541015625,
"learning_rate": 8.145833333333333e-07,
"loss": 0.0013,
"reward": 1.8522485494613647,
"reward_std": 0.028189565986394882,
"rewards/accuracy_reward": 0.8522485494613647,
"rewards/format_reward": 1.0,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 106.46354675292969,
"epoch": 1.8958333333333335,
"grad_norm": 2.094729152422118,
"kl": 0.0283203125,
"learning_rate": 8.125e-07,
"loss": 0.0013,
"reward": 1.8424299955368042,
"reward_std": 0.034867409616708755,
"rewards/accuracy_reward": 0.8424299955368042,
"rewards/format_reward": 1.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 106.65364837646484,
"epoch": 1.9166666666666665,
"grad_norm": 7.368992349088265,
"kl": 0.0283203125,
"learning_rate": 8.104166666666666e-07,
"loss": 0.0012,
"reward": 1.855029582977295,
"reward_std": 0.027775254100561142,
"rewards/accuracy_reward": 0.8550295233726501,
"rewards/format_reward": 1.0,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 107.18880462646484,
"epoch": 1.9375,
"grad_norm": 2.0045320605738333,
"kl": 0.0341796875,
"learning_rate": 8.083333333333334e-07,
"loss": 0.0015,
"reward": 1.8531224727630615,
"reward_std": 0.02575305663049221,
"rewards/accuracy_reward": 0.853122353553772,
"rewards/format_reward": 1.0,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 108.109375,
"epoch": 1.9583333333333335,
"grad_norm": 1.9199651103984947,
"kl": 0.02587890625,
"learning_rate": 8.0625e-07,
"loss": 0.0011,
"reward": 1.8517922163009644,
"reward_std": 0.02801516279578209,
"rewards/accuracy_reward": 0.8517922163009644,
"rewards/format_reward": 1.0,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 106.6648941040039,
"epoch": 1.9791666666666665,
"grad_norm": 1.2125015465439164,
"kl": 0.025146484375,
"learning_rate": 8.041666666666667e-07,
"loss": 0.0011,
"reward": 1.8468000888824463,
"reward_std": 0.027415748685598373,
"rewards/accuracy_reward": 0.8467998504638672,
"rewards/format_reward": 1.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 107.87109375,
"epoch": 2.0208333333333335,
"grad_norm": 2.893832337696663,
"kl": 0.0240478515625,
"learning_rate": 8.020833333333333e-07,
"loss": 0.0011,
"reward": 1.8671659231185913,
"reward_std": 0.03437124937772751,
"rewards/accuracy_reward": 0.8684679865837097,
"rewards/format_reward": 0.9986979365348816,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 108.80729675292969,
"epoch": 2.0416666666666665,
"grad_norm": 2.4656073464269124,
"kl": 0.028564453125,
"learning_rate": 8e-07,
"loss": 0.0013,
"reward": 1.8920382261276245,
"reward_std": 0.024690520018339157,
"rewards/accuracy_reward": 0.8933402895927429,
"rewards/format_reward": 0.9986979365348816,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 108.49739837646484,
"epoch": 2.0625,
"grad_norm": 2.8323749391253124,
"kl": 0.025390625,
"learning_rate": 7.979166666666667e-07,
"loss": 0.001,
"reward": 1.8716254234313965,
"reward_std": 0.028798673301935196,
"rewards/accuracy_reward": 0.8716254234313965,
"rewards/format_reward": 1.0,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 108.86979675292969,
"epoch": 2.0833333333333335,
"grad_norm": 1.842256938186257,
"kl": 0.0303955078125,
"learning_rate": 7.958333333333333e-07,
"loss": 0.0013,
"reward": 1.849453330039978,
"reward_std": 0.033137306571006775,
"rewards/accuracy_reward": 0.849453330039978,
"rewards/format_reward": 1.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 110.98828125,
"epoch": 2.1041666666666665,
"grad_norm": 1.625206074387942,
"kl": 0.02783203125,
"learning_rate": 7.937499999999999e-07,
"loss": 0.0012,
"reward": 1.864532232284546,
"reward_std": 0.02849118784070015,
"rewards/accuracy_reward": 0.8645319938659668,
"rewards/format_reward": 1.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 112.125,
"epoch": 2.125,
"grad_norm": 1.8687654433880987,
"kl": 0.02880859375,
"learning_rate": 7.916666666666666e-07,
"loss": 0.0012,
"reward": 1.8906430006027222,
"reward_std": 0.02808341383934021,
"rewards/accuracy_reward": 0.8906428813934326,
"rewards/format_reward": 1.0,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 111.30859375,
"epoch": 2.1458333333333335,
"grad_norm": 1.812765837401351,
"kl": 0.0341796875,
"learning_rate": 7.895833333333332e-07,
"loss": 0.0015,
"reward": 1.84462308883667,
"reward_std": 0.02905876934528351,
"rewards/accuracy_reward": 0.8446230292320251,
"rewards/format_reward": 1.0,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 113.15625,
"epoch": 2.1666666666666665,
"grad_norm": 1.3199461131282353,
"kl": 0.04736328125,
"learning_rate": 7.875e-07,
"loss": 0.002,
"reward": 1.881453275680542,
"reward_std": 0.025718865916132927,
"rewards/accuracy_reward": 0.8814532160758972,
"rewards/format_reward": 1.0,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 112.74739837646484,
"epoch": 2.1875,
"grad_norm": 1.4379737344223804,
"kl": 0.0257568359375,
"learning_rate": 7.854166666666666e-07,
"loss": 0.0011,
"reward": 1.852853775024414,
"reward_std": 0.03441212326288223,
"rewards/accuracy_reward": 0.8541558980941772,
"rewards/format_reward": 0.9986979365348816,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 110.17839050292969,
"epoch": 2.2083333333333335,
"grad_norm": 1.4747961451326568,
"kl": 0.02587890625,
"learning_rate": 7.833333333333333e-07,
"loss": 0.0011,
"reward": 1.8564555644989014,
"reward_std": 0.03334889933466911,
"rewards/accuracy_reward": 0.8564555048942566,
"rewards/format_reward": 1.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 111.84114837646484,
"epoch": 2.2291666666666665,
"grad_norm": 1.595566840318838,
"kl": 0.0277099609375,
"learning_rate": 7.812499999999999e-07,
"loss": 0.0012,
"reward": 1.847129225730896,
"reward_std": 0.034906916320323944,
"rewards/accuracy_reward": 0.847129225730896,
"rewards/format_reward": 1.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 112.56640625,
"epoch": 2.25,
"grad_norm": 1.7252491389020819,
"kl": 0.029296875,
"learning_rate": 7.791666666666667e-07,
"loss": 0.0012,
"reward": 1.8392118215560913,
"reward_std": 0.032303862273693085,
"rewards/accuracy_reward": 0.8392118811607361,
"rewards/format_reward": 1.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 112.73046875,
"epoch": 2.2708333333333335,
"grad_norm": 1.3233610087560426,
"kl": 0.027587890625,
"learning_rate": 7.770833333333333e-07,
"loss": 0.0012,
"reward": 1.8880364894866943,
"reward_std": 0.0294729545712471,
"rewards/accuracy_reward": 0.8880362510681152,
"rewards/format_reward": 1.0,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 111.72135925292969,
"epoch": 2.2916666666666665,
"grad_norm": 1.3388400280888855,
"kl": 0.0308837890625,
"learning_rate": 7.75e-07,
"loss": 0.0013,
"reward": 1.841947078704834,
"reward_std": 0.03206552192568779,
"rewards/accuracy_reward": 0.8419471979141235,
"rewards/format_reward": 1.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 111.37239837646484,
"epoch": 2.3125,
"grad_norm": 1.6808004103234182,
"kl": 0.0301513671875,
"learning_rate": 7.729166666666666e-07,
"loss": 0.0012,
"reward": 1.872849464416504,
"reward_std": 0.028602521866559982,
"rewards/accuracy_reward": 0.8728495836257935,
"rewards/format_reward": 1.0,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 112.41536712646484,
"epoch": 2.3333333333333335,
"grad_norm": 1.1449720839601671,
"kl": 0.0284423828125,
"learning_rate": 7.708333333333333e-07,
"loss": 0.0012,
"reward": 1.8464049100875854,
"reward_std": 0.027897782623767853,
"rewards/accuracy_reward": 0.8464047908782959,
"rewards/format_reward": 1.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 113.18620300292969,
"epoch": 2.3541666666666665,
"grad_norm": 1.651982521267855,
"kl": 0.0279541015625,
"learning_rate": 7.6875e-07,
"loss": 0.0012,
"reward": 1.859885573387146,
"reward_std": 0.03053418919444084,
"rewards/accuracy_reward": 0.8611876368522644,
"rewards/format_reward": 0.9986979365348816,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 112.31901550292969,
"epoch": 2.375,
"grad_norm": 1.273847905587726,
"kl": 0.03125,
"learning_rate": 7.666666666666667e-07,
"loss": 0.0014,
"reward": 1.8361539840698242,
"reward_std": 0.03006243333220482,
"rewards/accuracy_reward": 0.8374561667442322,
"rewards/format_reward": 0.9986979365348816,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 111.92578125,
"epoch": 2.3958333333333335,
"grad_norm": 1.1387571840491482,
"kl": 0.0301513671875,
"learning_rate": 7.645833333333332e-07,
"loss": 0.0013,
"reward": 1.866465449333191,
"reward_std": 0.023312915116548538,
"rewards/accuracy_reward": 0.8664655089378357,
"rewards/format_reward": 1.0,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 111.92708587646484,
"epoch": 2.4166666666666665,
"grad_norm": 1.3560724890382403,
"kl": 0.029541015625,
"learning_rate": 7.624999999999999e-07,
"loss": 0.0013,
"reward": 1.8626346588134766,
"reward_std": 0.025532353669404984,
"rewards/accuracy_reward": 0.8626348376274109,
"rewards/format_reward": 1.0,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 110.17448425292969,
"epoch": 2.4375,
"grad_norm": 1.3842406282593351,
"kl": 0.03076171875,
"learning_rate": 7.604166666666666e-07,
"loss": 0.0013,
"reward": 1.8882970809936523,
"reward_std": 0.022818906232714653,
"rewards/accuracy_reward": 0.8882970809936523,
"rewards/format_reward": 1.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 112.09375,
"epoch": 2.4583333333333335,
"grad_norm": 1.4789503320621176,
"kl": 0.03271484375,
"learning_rate": 7.583333333333333e-07,
"loss": 0.0014,
"reward": 1.86592435836792,
"reward_std": 0.025339588522911072,
"rewards/accuracy_reward": 0.8659243583679199,
"rewards/format_reward": 1.0,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 112.37890625,
"epoch": 2.4791666666666665,
"grad_norm": 2.1260312341423484,
"kl": 0.0281982421875,
"learning_rate": 7.5625e-07,
"loss": 0.0012,
"reward": 1.8531742095947266,
"reward_std": 0.025770537555217743,
"rewards/accuracy_reward": 0.8531742095947266,
"rewards/format_reward": 1.0,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 112.8828125,
"epoch": 2.5,
"grad_norm": 8.068389238759387,
"kl": 0.0341796875,
"learning_rate": 7.541666666666666e-07,
"loss": 0.0015,
"reward": 1.8539113998413086,
"reward_std": 0.026992302387952805,
"rewards/accuracy_reward": 0.8552135825157166,
"rewards/format_reward": 0.9986979365348816,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 112.63932800292969,
"epoch": 2.5208333333333335,
"grad_norm": 4.211053430432126,
"kl": 0.03662109375,
"learning_rate": 7.520833333333333e-07,
"loss": 0.0016,
"reward": 1.8711378574371338,
"reward_std": 0.033812545239925385,
"rewards/accuracy_reward": 0.8724400401115417,
"rewards/format_reward": 0.9986979365348816,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 114.11979675292969,
"epoch": 2.5416666666666665,
"grad_norm": 1.9127642747673794,
"kl": 0.034423828125,
"learning_rate": 7.5e-07,
"loss": 0.0014,
"reward": 1.8767274618148804,
"reward_std": 0.03441750630736351,
"rewards/accuracy_reward": 0.8793315887451172,
"rewards/format_reward": 0.9973958730697632,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 112.91796875,
"epoch": 2.5625,
"grad_norm": 2.7351925655744354,
"kl": 0.03515625,
"learning_rate": 7.479166666666667e-07,
"loss": 0.0015,
"reward": 1.848404884338379,
"reward_std": 0.02676878124475479,
"rewards/accuracy_reward": 0.8484048843383789,
"rewards/format_reward": 1.0,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 114.83203125,
"epoch": 2.5833333333333335,
"grad_norm": 1.9431827631537368,
"kl": 0.037841796875,
"learning_rate": 7.458333333333333e-07,
"loss": 0.0016,
"reward": 1.8470710515975952,
"reward_std": 0.03329627588391304,
"rewards/accuracy_reward": 0.8496752977371216,
"rewards/format_reward": 0.9973958730697632,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 116.77474212646484,
"epoch": 2.6041666666666665,
"grad_norm": 1.5150916718262497,
"kl": 0.04296875,
"learning_rate": 7.4375e-07,
"loss": 0.0017,
"reward": 1.8823201656341553,
"reward_std": 0.028970589861273766,
"rewards/accuracy_reward": 0.8823199272155762,
"rewards/format_reward": 1.0,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 115.453125,
"epoch": 2.625,
"grad_norm": 1.349104952050238,
"kl": 0.037841796875,
"learning_rate": 7.416666666666666e-07,
"loss": 0.0016,
"reward": 1.8449368476867676,
"reward_std": 0.040684543550014496,
"rewards/accuracy_reward": 0.8462389707565308,
"rewards/format_reward": 0.9986979365348816,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 115.18620300292969,
"epoch": 2.6458333333333335,
"grad_norm": 1.3525153623022323,
"kl": 0.041748046875,
"learning_rate": 7.395833333333334e-07,
"loss": 0.0018,
"reward": 1.8602406978607178,
"reward_std": 0.028682291507720947,
"rewards/accuracy_reward": 0.8628449440002441,
"rewards/format_reward": 0.9973958730697632,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 114.31771087646484,
"epoch": 2.6666666666666665,
"grad_norm": 3.3377952313667607,
"kl": 0.03515625,
"learning_rate": 7.375e-07,
"loss": 0.0015,
"reward": 1.8633064031600952,
"reward_std": 0.029187675565481186,
"rewards/accuracy_reward": 0.8633064031600952,
"rewards/format_reward": 1.0,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 115.09765625,
"epoch": 2.6875,
"grad_norm": 3.295416813891869,
"kl": 0.034912109375,
"learning_rate": 7.354166666666667e-07,
"loss": 0.0014,
"reward": 1.8418749570846558,
"reward_std": 0.030345208942890167,
"rewards/accuracy_reward": 0.8418749570846558,
"rewards/format_reward": 1.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 113.78646087646484,
"epoch": 2.7083333333333335,
"grad_norm": 1.8029248151985298,
"kl": 0.038330078125,
"learning_rate": 7.333333333333332e-07,
"loss": 0.0016,
"reward": 1.878383994102478,
"reward_std": 0.030093541368842125,
"rewards/accuracy_reward": 0.8796859979629517,
"rewards/format_reward": 0.9986979365348816,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 113.72526550292969,
"epoch": 2.7291666666666665,
"grad_norm": 4.026175515300724,
"kl": 0.037841796875,
"learning_rate": 7.312499999999999e-07,
"loss": 0.0016,
"reward": 1.8830840587615967,
"reward_std": 0.03116484545171261,
"rewards/accuracy_reward": 0.8830841779708862,
"rewards/format_reward": 1.0,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 111.31510925292969,
"epoch": 2.75,
"grad_norm": 3.247277440713756,
"kl": 0.036865234375,
"learning_rate": 7.291666666666666e-07,
"loss": 0.0015,
"reward": 1.88235604763031,
"reward_std": 0.03274049982428551,
"rewards/accuracy_reward": 0.8836580514907837,
"rewards/format_reward": 0.9986979365348816,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 110.67708587646484,
"epoch": 2.7708333333333335,
"grad_norm": 2.3589645345270296,
"kl": 0.036376953125,
"learning_rate": 7.270833333333333e-07,
"loss": 0.0015,
"reward": 1.8647716045379639,
"reward_std": 0.037114016711711884,
"rewards/accuracy_reward": 0.8673758506774902,
"rewards/format_reward": 0.9973958730697632,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 110.82292175292969,
"epoch": 2.7916666666666665,
"grad_norm": 1.3075748353830943,
"kl": 0.038330078125,
"learning_rate": 7.249999999999999e-07,
"loss": 0.0016,
"reward": 1.8723247051239014,
"reward_std": 0.042349301278591156,
"rewards/accuracy_reward": 0.874928891658783,
"rewards/format_reward": 0.9973958730697632,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 111.65885925292969,
"epoch": 2.8125,
"grad_norm": 1.7497378920781035,
"kl": 0.041259765625,
"learning_rate": 7.229166666666666e-07,
"loss": 0.0017,
"reward": 1.870335578918457,
"reward_std": 0.03280794620513916,
"rewards/accuracy_reward": 0.8703355193138123,
"rewards/format_reward": 1.0,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 107.25651550292969,
"epoch": 2.8333333333333335,
"grad_norm": 1.8805437427920557,
"kl": 0.037841796875,
"learning_rate": 7.208333333333332e-07,
"loss": 0.0016,
"reward": 1.8854488134384155,
"reward_std": 0.030766207724809647,
"rewards/accuracy_reward": 0.8867508769035339,
"rewards/format_reward": 0.9986979365348816,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 109.58333587646484,
"epoch": 2.8541666666666665,
"grad_norm": 1.9097670028819336,
"kl": 0.034423828125,
"learning_rate": 7.1875e-07,
"loss": 0.0014,
"reward": 1.8947999477386475,
"reward_std": 0.02784878760576248,
"rewards/accuracy_reward": 0.8961019515991211,
"rewards/format_reward": 0.9986979365348816,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 112.38802337646484,
"epoch": 2.875,
"grad_norm": 2.148757546537433,
"kl": 0.035888671875,
"learning_rate": 7.166666666666667e-07,
"loss": 0.0015,
"reward": 1.867633581161499,
"reward_std": 0.027851156890392303,
"rewards/accuracy_reward": 0.867633581161499,
"rewards/format_reward": 1.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 111.4921875,
"epoch": 2.8958333333333335,
"grad_norm": 1.68972106618665,
"kl": 0.031982421875,
"learning_rate": 7.145833333333333e-07,
"loss": 0.0013,
"reward": 1.8540122509002686,
"reward_std": 0.03191521018743515,
"rewards/accuracy_reward": 0.854012131690979,
"rewards/format_reward": 1.0,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 112.47135925292969,
"epoch": 2.9166666666666665,
"grad_norm": 4.753459469453496,
"kl": 0.037353515625,
"learning_rate": 7.125e-07,
"loss": 0.0015,
"reward": 1.8603503704071045,
"reward_std": 0.03011004999279976,
"rewards/accuracy_reward": 0.8603503704071045,
"rewards/format_reward": 1.0,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 113.7109375,
"epoch": 2.9375,
"grad_norm": 3.309769561279882,
"kl": 0.03369140625,
"learning_rate": 7.104166666666667e-07,
"loss": 0.0014,
"reward": 1.8809528350830078,
"reward_std": 0.029512833803892136,
"rewards/accuracy_reward": 0.8809528350830078,
"rewards/format_reward": 1.0,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 113.109375,
"epoch": 2.9583333333333335,
"grad_norm": 1.5596309697590072,
"kl": 0.034912109375,
"learning_rate": 7.083333333333334e-07,
"loss": 0.0014,
"reward": 1.8926336765289307,
"reward_std": 0.029566586017608643,
"rewards/accuracy_reward": 0.8926336169242859,
"rewards/format_reward": 1.0,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 115.12683868408203,
"epoch": 2.9791666666666665,
"grad_norm": 2.3377448744574503,
"kl": 0.035400390625,
"learning_rate": 7.0625e-07,
"loss": 0.0014,
"reward": 1.870600938796997,
"reward_std": 0.028039831668138504,
"rewards/accuracy_reward": 0.8706008791923523,
"rewards/format_reward": 1.0,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 114.96745300292969,
"epoch": 3.0208333333333335,
"grad_norm": 1.2246745731883282,
"kl": 0.031494140625,
"learning_rate": 7.041666666666667e-07,
"loss": 0.0013,
"reward": 1.9021186828613281,
"reward_std": 0.023153727874159813,
"rewards/accuracy_reward": 0.9021186828613281,
"rewards/format_reward": 1.0,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 117.39974212646484,
"epoch": 3.0416666666666665,
"grad_norm": 2.052509803568965,
"kl": 0.03564453125,
"learning_rate": 7.020833333333332e-07,
"loss": 0.0015,
"reward": 1.875624656677246,
"reward_std": 0.026206960901618004,
"rewards/accuracy_reward": 0.876926839351654,
"rewards/format_reward": 0.9986979365348816,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 115.80208587646484,
"epoch": 3.0625,
"grad_norm": 2.354461875696106,
"kl": 0.033203125,
"learning_rate": 7e-07,
"loss": 0.0014,
"reward": 1.8924367427825928,
"reward_std": 0.027437550947070122,
"rewards/accuracy_reward": 0.8924366235733032,
"rewards/format_reward": 1.0,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 116.94792175292969,
"epoch": 3.0833333333333335,
"grad_norm": 2.0470904135861314,
"kl": 0.03759765625,
"learning_rate": 6.979166666666666e-07,
"loss": 0.0015,
"reward": 1.8674253225326538,
"reward_std": 0.03383837640285492,
"rewards/accuracy_reward": 0.8687273263931274,
"rewards/format_reward": 0.9986979365348816,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 116.73177337646484,
"epoch": 3.1041666666666665,
"grad_norm": 2.4829007525918714,
"kl": 0.035400390625,
"learning_rate": 6.958333333333333e-07,
"loss": 0.0015,
"reward": 1.8920822143554688,
"reward_std": 0.027063176035881042,
"rewards/accuracy_reward": 0.8933842182159424,
"rewards/format_reward": 0.9986979365348816,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 116.31380462646484,
"epoch": 3.125,
"grad_norm": 1.8187259240411606,
"kl": 0.03564453125,
"learning_rate": 6.937499999999999e-07,
"loss": 0.0015,
"reward": 1.8620394468307495,
"reward_std": 0.026920508593320847,
"rewards/accuracy_reward": 0.86203932762146,
"rewards/format_reward": 1.0,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 116.4765625,
"epoch": 3.1458333333333335,
"grad_norm": 2.130554923785352,
"kl": 0.038818359375,
"learning_rate": 6.916666666666666e-07,
"loss": 0.0016,
"reward": 1.881149172782898,
"reward_std": 0.03235545754432678,
"rewards/accuracy_reward": 0.8837532997131348,
"rewards/format_reward": 0.9973958730697632,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 115.44271087646484,
"epoch": 3.1666666666666665,
"grad_norm": 1.3415308591991317,
"kl": 0.041015625,
"learning_rate": 6.895833333333333e-07,
"loss": 0.0017,
"reward": 1.8772742748260498,
"reward_std": 0.03051435947418213,
"rewards/accuracy_reward": 0.8785762190818787,
"rewards/format_reward": 0.9986979365348816,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 114.02604675292969,
"epoch": 3.1875,
"grad_norm": 3.61409411357304,
"kl": 0.035400390625,
"learning_rate": 6.875e-07,
"loss": 0.0015,
"reward": 1.8919541835784912,
"reward_std": 0.025183459743857384,
"rewards/accuracy_reward": 0.8919543027877808,
"rewards/format_reward": 1.0,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 113.07161712646484,
"epoch": 3.2083333333333335,
"grad_norm": 1.6998976071316378,
"kl": 0.03955078125,
"learning_rate": 6.854166666666666e-07,
"loss": 0.0017,
"reward": 1.8747581243515015,
"reward_std": 0.0327390655875206,
"rewards/accuracy_reward": 0.8760601878166199,
"rewards/format_reward": 0.9986979365348816,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 110.97396087646484,
"epoch": 3.2291666666666665,
"grad_norm": 4.510995755729955,
"kl": 0.034912109375,
"learning_rate": 6.833333333333333e-07,
"loss": 0.0015,
"reward": 1.8948159217834473,
"reward_std": 0.024841880425810814,
"rewards/accuracy_reward": 0.8948158621788025,
"rewards/format_reward": 1.0,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 108.51432800292969,
"epoch": 3.25,
"grad_norm": 1.434499918763638,
"kl": 0.03515625,
"learning_rate": 6.8125e-07,
"loss": 0.0015,
"reward": 1.9077692031860352,
"reward_std": 0.02048024721443653,
"rewards/accuracy_reward": 0.9077692031860352,
"rewards/format_reward": 1.0,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 108.29948425292969,
"epoch": 3.2708333333333335,
"grad_norm": 2.6480453809741307,
"kl": 0.03662109375,
"learning_rate": 6.791666666666667e-07,
"loss": 0.0015,
"reward": 1.8736419677734375,
"reward_std": 0.026421895250678062,
"rewards/accuracy_reward": 0.8736419677734375,
"rewards/format_reward": 1.0,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 110.30339050292969,
"epoch": 3.2916666666666665,
"grad_norm": 1.6157614309136974,
"kl": 0.034423828125,
"learning_rate": 6.770833333333333e-07,
"loss": 0.0015,
"reward": 1.8868639469146729,
"reward_std": 0.02565614879131317,
"rewards/accuracy_reward": 0.8868638873100281,
"rewards/format_reward": 1.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 107.47917175292969,
"epoch": 3.3125,
"grad_norm": 3.614042738523906,
"kl": 0.03125,
"learning_rate": 6.75e-07,
"loss": 0.0014,
"reward": 1.854689598083496,
"reward_std": 0.026655998080968857,
"rewards/accuracy_reward": 0.8546894788742065,
"rewards/format_reward": 1.0,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 107.14714050292969,
"epoch": 3.3333333333333335,
"grad_norm": 3.6282010355717285,
"kl": 0.035400390625,
"learning_rate": 6.729166666666666e-07,
"loss": 0.0015,
"reward": 1.8728668689727783,
"reward_std": 0.02623789571225643,
"rewards/accuracy_reward": 0.8728668093681335,
"rewards/format_reward": 1.0,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 106.16146087646484,
"epoch": 3.3541666666666665,
"grad_norm": 3.096302314051656,
"kl": 0.07373046875,
"learning_rate": 6.708333333333333e-07,
"loss": 0.003,
"reward": 1.8780524730682373,
"reward_std": 0.02697448432445526,
"rewards/accuracy_reward": 0.8780522346496582,
"rewards/format_reward": 1.0,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 104.66536712646484,
"epoch": 3.375,
"grad_norm": 1.8815482668618946,
"kl": 0.034423828125,
"learning_rate": 6.6875e-07,
"loss": 0.0015,
"reward": 1.869466781616211,
"reward_std": 0.025683503597974777,
"rewards/accuracy_reward": 0.8694667816162109,
"rewards/format_reward": 1.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 103.4375,
"epoch": 3.3958333333333335,
"grad_norm": 3.6534900968467423,
"kl": 0.036376953125,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0016,
"reward": 1.8859853744506836,
"reward_std": 0.025588493794202805,
"rewards/accuracy_reward": 0.8859855532646179,
"rewards/format_reward": 1.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 103.21875,
"epoch": 3.4166666666666665,
"grad_norm": 2.8670572231969214,
"kl": 0.0361328125,
"learning_rate": 6.645833333333333e-07,
"loss": 0.0015,
"reward": 1.8808234930038452,
"reward_std": 0.025275420397520065,
"rewards/accuracy_reward": 0.8808236122131348,
"rewards/format_reward": 1.0,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 102.73567962646484,
"epoch": 3.4375,
"grad_norm": 1.291526238821269,
"kl": 0.033203125,
"learning_rate": 6.624999999999999e-07,
"loss": 0.0015,
"reward": 1.8656648397445679,
"reward_std": 0.02325437031686306,
"rewards/accuracy_reward": 0.8656649589538574,
"rewards/format_reward": 1.0,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 103.76953125,
"epoch": 3.4583333333333335,
"grad_norm": 3.2481258355141716,
"kl": 0.034423828125,
"learning_rate": 6.604166666666667e-07,
"loss": 0.0015,
"reward": 1.8807281255722046,
"reward_std": 0.023684537038207054,
"rewards/accuracy_reward": 0.8807281255722046,
"rewards/format_reward": 1.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 101.98046875,
"epoch": 3.4791666666666665,
"grad_norm": 1.913896791712438,
"kl": 0.034912109375,
"learning_rate": 6.583333333333333e-07,
"loss": 0.0015,
"reward": 1.8801133632659912,
"reward_std": 0.020142659544944763,
"rewards/accuracy_reward": 0.8801132440567017,
"rewards/format_reward": 1.0,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 101.25651550292969,
"epoch": 3.5,
"grad_norm": 1.5822819853292256,
"kl": 0.037109375,
"learning_rate": 6.5625e-07,
"loss": 0.0016,
"reward": 1.8631795644760132,
"reward_std": 0.023415524512529373,
"rewards/accuracy_reward": 0.8631795644760132,
"rewards/format_reward": 1.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 102.02083587646484,
"epoch": 3.5208333333333335,
"grad_norm": 1.4328687070061792,
"kl": 0.0341796875,
"learning_rate": 6.541666666666666e-07,
"loss": 0.0015,
"reward": 1.8415846824645996,
"reward_std": 0.022014908492565155,
"rewards/accuracy_reward": 0.8415846824645996,
"rewards/format_reward": 1.0,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 103.24609375,
"epoch": 3.5416666666666665,
"grad_norm": 2.6519462272626697,
"kl": 0.033447265625,
"learning_rate": 6.520833333333333e-07,
"loss": 0.0014,
"reward": 1.8888130187988281,
"reward_std": 0.0215926356613636,
"rewards/accuracy_reward": 0.8888130187988281,
"rewards/format_reward": 1.0,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 101.84375,
"epoch": 3.5625,
"grad_norm": 1.7631289618422423,
"kl": 0.032958984375,
"learning_rate": 6.5e-07,
"loss": 0.0015,
"reward": 1.8750677108764648,
"reward_std": 0.023012561723589897,
"rewards/accuracy_reward": 0.8750675916671753,
"rewards/format_reward": 1.0,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 107.60026550292969,
"epoch": 3.5833333333333335,
"grad_norm": 2.0388719106728863,
"kl": 0.0361328125,
"learning_rate": 6.479166666666667e-07,
"loss": 0.0015,
"reward": 1.8989547491073608,
"reward_std": 0.024981368333101273,
"rewards/accuracy_reward": 0.8989547491073608,
"rewards/format_reward": 1.0,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 104.02864837646484,
"epoch": 3.6041666666666665,
"grad_norm": 3.0894511291299485,
"kl": 0.03369140625,
"learning_rate": 6.458333333333333e-07,
"loss": 0.0014,
"reward": 1.8803495168685913,
"reward_std": 0.024355322122573853,
"rewards/accuracy_reward": 0.8803495168685913,
"rewards/format_reward": 1.0,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 105.703125,
"epoch": 3.625,
"grad_norm": 1.6763141295980748,
"kl": 0.03662109375,
"learning_rate": 6.4375e-07,
"loss": 0.0016,
"reward": 1.8981932401657104,
"reward_std": 0.0216450747102499,
"rewards/accuracy_reward": 0.898193359375,
"rewards/format_reward": 1.0,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 104.81901550292969,
"epoch": 3.6458333333333335,
"grad_norm": 1.5197798990089235,
"kl": 0.03369140625,
"learning_rate": 6.416666666666667e-07,
"loss": 0.0015,
"reward": 1.8727521896362305,
"reward_std": 0.02708452008664608,
"rewards/accuracy_reward": 0.87275230884552,
"rewards/format_reward": 1.0,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 107.34245300292969,
"epoch": 3.6666666666666665,
"grad_norm": 4.145991173961336,
"kl": 0.049072265625,
"learning_rate": 6.395833333333333e-07,
"loss": 0.002,
"reward": 1.9041762351989746,
"reward_std": 0.024535808712244034,
"rewards/accuracy_reward": 0.9041762351989746,
"rewards/format_reward": 1.0,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 107.91667175292969,
"epoch": 3.6875,
"grad_norm": 3.626404435362734,
"kl": 0.038330078125,
"learning_rate": 6.374999999999999e-07,
"loss": 0.0016,
"reward": 1.8905611038208008,
"reward_std": 0.02579084411263466,
"rewards/accuracy_reward": 0.891863226890564,
"rewards/format_reward": 0.9986979365348816,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 109.83464050292969,
"epoch": 3.7083333333333335,
"grad_norm": 1.4575692565981726,
"kl": 0.037841796875,
"learning_rate": 6.354166666666666e-07,
"loss": 0.0015,
"reward": 1.9227180480957031,
"reward_std": 0.02236122451722622,
"rewards/accuracy_reward": 0.9227181673049927,
"rewards/format_reward": 1.0,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 110.88542175292969,
"epoch": 3.7291666666666665,
"grad_norm": 6.573675951265146,
"kl": 0.040771484375,
"learning_rate": 6.333333333333332e-07,
"loss": 0.0018,
"reward": 1.8883719444274902,
"reward_std": 0.023411914706230164,
"rewards/accuracy_reward": 0.8883718848228455,
"rewards/format_reward": 1.0,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 111.8828125,
"epoch": 3.75,
"grad_norm": 1.3628527356709195,
"kl": 0.038818359375,
"learning_rate": 6.3125e-07,
"loss": 0.0016,
"reward": 1.8975229263305664,
"reward_std": 0.022585680708289146,
"rewards/accuracy_reward": 0.8975229263305664,
"rewards/format_reward": 1.0,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 111.2890625,
"epoch": 3.7708333333333335,
"grad_norm": 1.7649706449204177,
"kl": 0.04052734375,
"learning_rate": 6.291666666666666e-07,
"loss": 0.0017,
"reward": 1.8944144248962402,
"reward_std": 0.02505827508866787,
"rewards/accuracy_reward": 0.8944144248962402,
"rewards/format_reward": 1.0,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 115.35546875,
"epoch": 3.7916666666666665,
"grad_norm": 2.198651075477608,
"kl": 0.039306640625,
"learning_rate": 6.270833333333333e-07,
"loss": 0.0016,
"reward": 1.891271948814392,
"reward_std": 0.028914332389831543,
"rewards/accuracy_reward": 0.8938760757446289,
"rewards/format_reward": 0.9973958730697632,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 115.40885925292969,
"epoch": 3.8125,
"grad_norm": 10.660708725864556,
"kl": 0.041748046875,
"learning_rate": 6.249999999999999e-07,
"loss": 0.0017,
"reward": 1.8875398635864258,
"reward_std": 0.024141697213053703,
"rewards/accuracy_reward": 0.887539803981781,
"rewards/format_reward": 1.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 118.68229675292969,
"epoch": 3.8333333333333335,
"grad_norm": 2.2321500902799647,
"kl": 0.04345703125,
"learning_rate": 6.229166666666666e-07,
"loss": 0.0018,
"reward": 1.8584728240966797,
"reward_std": 0.03329972177743912,
"rewards/accuracy_reward": 0.8623790740966797,
"rewards/format_reward": 0.99609375,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 117.95442962646484,
"epoch": 3.8541666666666665,
"grad_norm": 1.414758072615096,
"kl": 0.038818359375,
"learning_rate": 6.208333333333334e-07,
"loss": 0.0016,
"reward": 1.8576724529266357,
"reward_std": 0.03029349073767662,
"rewards/accuracy_reward": 0.8602765798568726,
"rewards/format_reward": 0.9973958730697632,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 116.26171875,
"epoch": 3.875,
"grad_norm": 2.767294776217013,
"kl": 0.037353515625,
"learning_rate": 6.1875e-07,
"loss": 0.0015,
"reward": 1.8842320442199707,
"reward_std": 0.026229776442050934,
"rewards/accuracy_reward": 0.8842320442199707,
"rewards/format_reward": 1.0,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 116.26302337646484,
"epoch": 3.8958333333333335,
"grad_norm": 2.0069160206563423,
"kl": 0.04541015625,
"learning_rate": 6.166666666666667e-07,
"loss": 0.0018,
"reward": 1.906886100769043,
"reward_std": 0.02242848090827465,
"rewards/accuracy_reward": 0.9068862795829773,
"rewards/format_reward": 1.0,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 118.28515625,
"epoch": 3.9166666666666665,
"grad_norm": 2.9167012710524416,
"kl": 0.041015625,
"learning_rate": 6.145833333333333e-07,
"loss": 0.0017,
"reward": 1.8881382942199707,
"reward_std": 0.027306437492370605,
"rewards/accuracy_reward": 0.8881382942199707,
"rewards/format_reward": 1.0,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 118.50260925292969,
"epoch": 3.9375,
"grad_norm": 2.5894313089017706,
"kl": 0.03955078125,
"learning_rate": 6.125000000000001e-07,
"loss": 0.0016,
"reward": 1.8834333419799805,
"reward_std": 0.02489401400089264,
"rewards/accuracy_reward": 0.8834332227706909,
"rewards/format_reward": 1.0,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 115.67578125,
"epoch": 3.9583333333333335,
"grad_norm": 2.8476363151522173,
"kl": 0.039794921875,
"learning_rate": 6.104166666666667e-07,
"loss": 0.0017,
"reward": 1.8963178396224976,
"reward_std": 0.02300976775586605,
"rewards/accuracy_reward": 0.8963178992271423,
"rewards/format_reward": 1.0,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 113.1749038696289,
"epoch": 3.9791666666666665,
"grad_norm": 102.50025138985052,
"kl": 0.9296875,
"learning_rate": 6.083333333333333e-07,
"loss": 0.0374,
"reward": 1.8949506282806396,
"reward_std": 0.03087977133691311,
"rewards/accuracy_reward": 0.8962857723236084,
"rewards/format_reward": 0.998664915561676,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 114.61458587646484,
"epoch": 4.020833333333333,
"grad_norm": 2.4242741219191553,
"kl": 0.03857421875,
"learning_rate": 6.062499999999999e-07,
"loss": 0.0016,
"reward": 1.8952863216400146,
"reward_std": 0.025780394673347473,
"rewards/accuracy_reward": 0.8952862620353699,
"rewards/format_reward": 1.0,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 112.21745300292969,
"epoch": 4.041666666666667,
"grad_norm": 1.3513636521416783,
"kl": 0.046630859375,
"learning_rate": 6.041666666666666e-07,
"loss": 0.0019,
"reward": 1.9090301990509033,
"reward_std": 0.029132381081581116,
"rewards/accuracy_reward": 0.9090301394462585,
"rewards/format_reward": 1.0,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 109.36328125,
"epoch": 4.0625,
"grad_norm": 2.0726251594251166,
"kl": 0.051513671875,
"learning_rate": 6.020833333333333e-07,
"loss": 0.0021,
"reward": 1.8756507635116577,
"reward_std": 0.027445685118436813,
"rewards/accuracy_reward": 0.8756507635116577,
"rewards/format_reward": 1.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 109.05859375,
"epoch": 4.083333333333333,
"grad_norm": 1.5875132735828674,
"kl": 0.056396484375,
"learning_rate": 6e-07,
"loss": 0.0023,
"reward": 1.8988969326019287,
"reward_std": 0.03272823989391327,
"rewards/accuracy_reward": 0.9015010595321655,
"rewards/format_reward": 0.9973958730697632,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 107.86458587646484,
"epoch": 4.104166666666667,
"grad_norm": 4.44164388575917,
"kl": 0.06201171875,
"learning_rate": 5.979166666666666e-07,
"loss": 0.0026,
"reward": 1.8918952941894531,
"reward_std": 0.028478458523750305,
"rewards/accuracy_reward": 0.8918952941894531,
"rewards/format_reward": 1.0,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 104.47135925292969,
"epoch": 4.125,
"grad_norm": 1.8931652214160692,
"kl": 0.08251953125,
"learning_rate": 5.958333333333333e-07,
"loss": 0.0034,
"reward": 1.877640724182129,
"reward_std": 0.03283580765128136,
"rewards/accuracy_reward": 0.8789429664611816,
"rewards/format_reward": 0.9986979365348816,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 104.86458587646484,
"epoch": 4.145833333333333,
"grad_norm": 1.4398266478518003,
"kl": 0.06884765625,
"learning_rate": 5.937499999999999e-07,
"loss": 0.0028,
"reward": 1.893932580947876,
"reward_std": 0.023818641901016235,
"rewards/accuracy_reward": 0.8939325213432312,
"rewards/format_reward": 1.0,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 105.59765625,
"epoch": 4.166666666666667,
"grad_norm": 1.9079500637571045,
"kl": 0.057861328125,
"learning_rate": 5.916666666666667e-07,
"loss": 0.0024,
"reward": 1.8989866971969604,
"reward_std": 0.029820134863257408,
"rewards/accuracy_reward": 0.9002887606620789,
"rewards/format_reward": 0.9986979365348816,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 105.17317962646484,
"epoch": 4.1875,
"grad_norm": 1.6892092308161555,
"kl": 0.05419921875,
"learning_rate": 5.895833333333333e-07,
"loss": 0.0022,
"reward": 1.9026626348495483,
"reward_std": 0.023554224520921707,
"rewards/accuracy_reward": 0.9026626348495483,
"rewards/format_reward": 1.0,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 105.45182800292969,
"epoch": 4.208333333333333,
"grad_norm": 2.0094232681652513,
"kl": 0.05126953125,
"learning_rate": 5.875e-07,
"loss": 0.0021,
"reward": 1.9009517431259155,
"reward_std": 0.026670873165130615,
"rewards/accuracy_reward": 0.9009518027305603,
"rewards/format_reward": 1.0,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 105.99609375,
"epoch": 4.229166666666667,
"grad_norm": 1.4660353955474104,
"kl": 0.045654296875,
"learning_rate": 5.854166666666666e-07,
"loss": 0.0018,
"reward": 1.8858369588851929,
"reward_std": 0.02353881672024727,
"rewards/accuracy_reward": 0.8858367800712585,
"rewards/format_reward": 1.0,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 106.44792175292969,
"epoch": 4.25,
"grad_norm": 1.9832423612020598,
"kl": 0.04833984375,
"learning_rate": 5.833333333333334e-07,
"loss": 0.002,
"reward": 1.9034093618392944,
"reward_std": 0.024380242452025414,
"rewards/accuracy_reward": 0.9034093618392944,
"rewards/format_reward": 1.0,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 107.12890625,
"epoch": 4.270833333333333,
"grad_norm": 1.9973956888778517,
"kl": 0.04541015625,
"learning_rate": 5.8125e-07,
"loss": 0.0019,
"reward": 1.918021559715271,
"reward_std": 0.01975633203983307,
"rewards/accuracy_reward": 0.9180216789245605,
"rewards/format_reward": 1.0,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 108.20182800292969,
"epoch": 4.291666666666667,
"grad_norm": 1.9799525839497707,
"kl": 0.046875,
"learning_rate": 5.791666666666667e-07,
"loss": 0.0019,
"reward": 1.903671145439148,
"reward_std": 0.02328427881002426,
"rewards/accuracy_reward": 0.903671145439148,
"rewards/format_reward": 1.0,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 107.953125,
"epoch": 4.3125,
"grad_norm": 2.1041072216648176,
"kl": 0.0439453125,
"learning_rate": 5.770833333333332e-07,
"loss": 0.0018,
"reward": 1.8897308111190796,
"reward_std": 0.02862635999917984,
"rewards/accuracy_reward": 0.891032874584198,
"rewards/format_reward": 0.9986979365348816,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 109.08203125,
"epoch": 4.333333333333333,
"grad_norm": 1.5204073340174282,
"kl": 0.039794921875,
"learning_rate": 5.749999999999999e-07,
"loss": 0.0016,
"reward": 1.8759989738464355,
"reward_std": 0.02895565889775753,
"rewards/accuracy_reward": 0.8759989738464355,
"rewards/format_reward": 1.0,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 107.44010925292969,
"epoch": 4.354166666666667,
"grad_norm": 2.001436237063012,
"kl": 0.046142578125,
"learning_rate": 5.729166666666667e-07,
"loss": 0.0019,
"reward": 1.8900182247161865,
"reward_std": 0.02687385492026806,
"rewards/accuracy_reward": 0.8913201689720154,
"rewards/format_reward": 0.9986979365348816,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 110.47396087646484,
"epoch": 4.375,
"grad_norm": 3.6969174457282366,
"kl": 0.041015625,
"learning_rate": 5.708333333333333e-07,
"loss": 0.0017,
"reward": 1.8979003429412842,
"reward_std": 0.0217414703220129,
"rewards/accuracy_reward": 0.8979001045227051,
"rewards/format_reward": 1.0,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 110.609375,
"epoch": 4.395833333333333,
"grad_norm": 2.0011010687642816,
"kl": 0.0400390625,
"learning_rate": 5.6875e-07,
"loss": 0.0017,
"reward": 1.890995740890503,
"reward_std": 0.025938181206583977,
"rewards/accuracy_reward": 0.8909956812858582,
"rewards/format_reward": 1.0,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 112.12890625,
"epoch": 4.416666666666667,
"grad_norm": 1.2721197172111447,
"kl": 0.041259765625,
"learning_rate": 5.666666666666666e-07,
"loss": 0.0017,
"reward": 1.8735730648040771,
"reward_std": 0.02578229270875454,
"rewards/accuracy_reward": 0.8735730051994324,
"rewards/format_reward": 1.0,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 111.66667175292969,
"epoch": 4.4375,
"grad_norm": 2.5188213924760667,
"kl": 0.07958984375,
"learning_rate": 5.645833333333333e-07,
"loss": 0.0032,
"reward": 1.8987796306610107,
"reward_std": 0.025997933000326157,
"rewards/accuracy_reward": 0.898779571056366,
"rewards/format_reward": 1.0,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 111.171875,
"epoch": 4.458333333333333,
"grad_norm": 2.1506836862902876,
"kl": 0.03515625,
"learning_rate": 5.625e-07,
"loss": 0.0015,
"reward": 1.891427993774414,
"reward_std": 0.02087043598294258,
"rewards/accuracy_reward": 0.8914279937744141,
"rewards/format_reward": 1.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 110.43880462646484,
"epoch": 4.479166666666667,
"grad_norm": 1.589375389211378,
"kl": 0.033203125,
"learning_rate": 5.604166666666667e-07,
"loss": 0.0014,
"reward": 1.8716447353363037,
"reward_std": 0.023239165544509888,
"rewards/accuracy_reward": 0.8716444969177246,
"rewards/format_reward": 1.0,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 110.87890625,
"epoch": 4.5,
"grad_norm": 2.9907672143516737,
"kl": 0.038330078125,
"learning_rate": 5.583333333333333e-07,
"loss": 0.0016,
"reward": 1.8827204704284668,
"reward_std": 0.028304576873779297,
"rewards/accuracy_reward": 0.8827204704284668,
"rewards/format_reward": 1.0,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 113.86849212646484,
"epoch": 4.520833333333333,
"grad_norm": 2.551740129842165,
"kl": 0.0390625,
"learning_rate": 5.5625e-07,
"loss": 0.0016,
"reward": 1.888314127922058,
"reward_std": 0.02232741191983223,
"rewards/accuracy_reward": 0.8883141279220581,
"rewards/format_reward": 1.0,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 111.05989837646484,
"epoch": 4.541666666666667,
"grad_norm": 4.368554014866076,
"kl": 0.044921875,
"learning_rate": 5.541666666666666e-07,
"loss": 0.0019,
"reward": 1.9319076538085938,
"reward_std": 0.020289087668061256,
"rewards/accuracy_reward": 0.9319076538085938,
"rewards/format_reward": 1.0,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 111.23828125,
"epoch": 4.5625,
"grad_norm": 1.4440015070439494,
"kl": 0.035888671875,
"learning_rate": 5.520833333333334e-07,
"loss": 0.0016,
"reward": 1.852222204208374,
"reward_std": 0.030050039291381836,
"rewards/accuracy_reward": 0.8535243272781372,
"rewards/format_reward": 0.9986979365348816,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 112.00911712646484,
"epoch": 4.583333333333333,
"grad_norm": 2.12920095214347,
"kl": 0.038818359375,
"learning_rate": 5.5e-07,
"loss": 0.0016,
"reward": 1.8882272243499756,
"reward_std": 0.024995621293783188,
"rewards/accuracy_reward": 0.8882272243499756,
"rewards/format_reward": 1.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 110.91146087646484,
"epoch": 4.604166666666667,
"grad_norm": 3.2550572641319016,
"kl": 0.03955078125,
"learning_rate": 5.479166666666667e-07,
"loss": 0.0016,
"reward": 1.9003304243087769,
"reward_std": 0.023031365126371384,
"rewards/accuracy_reward": 0.9003303050994873,
"rewards/format_reward": 1.0,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 111.77474212646484,
"epoch": 4.625,
"grad_norm": 1.4248440110140324,
"kl": 0.039794921875,
"learning_rate": 5.458333333333332e-07,
"loss": 0.0016,
"reward": 1.873998999595642,
"reward_std": 0.02907104603946209,
"rewards/accuracy_reward": 0.8753010630607605,
"rewards/format_reward": 0.9986979365348816,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 112.58984375,
"epoch": 4.645833333333333,
"grad_norm": 2.6245139424544983,
"kl": 0.03857421875,
"learning_rate": 5.4375e-07,
"loss": 0.0016,
"reward": 1.9035629034042358,
"reward_std": 0.022365760058164597,
"rewards/accuracy_reward": 0.9035629034042358,
"rewards/format_reward": 1.0,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 110.59896087646484,
"epoch": 4.666666666666667,
"grad_norm": 1.2926097030348538,
"kl": 0.03955078125,
"learning_rate": 5.416666666666666e-07,
"loss": 0.0016,
"reward": 1.890451192855835,
"reward_std": 0.025147125124931335,
"rewards/accuracy_reward": 0.8904510736465454,
"rewards/format_reward": 1.0,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 113.26432800292969,
"epoch": 4.6875,
"grad_norm": 1.2010688063099724,
"kl": 0.03955078125,
"learning_rate": 5.395833333333333e-07,
"loss": 0.0016,
"reward": 1.865971326828003,
"reward_std": 0.030050549656152725,
"rewards/accuracy_reward": 0.8672735095024109,
"rewards/format_reward": 0.9986979365348816,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 110.88932800292969,
"epoch": 4.708333333333333,
"grad_norm": 1.4851010437040935,
"kl": 0.0380859375,
"learning_rate": 5.374999999999999e-07,
"loss": 0.0016,
"reward": 1.8917864561080933,
"reward_std": 0.021093130111694336,
"rewards/accuracy_reward": 0.8917864561080933,
"rewards/format_reward": 1.0,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 111.63932800292969,
"epoch": 4.729166666666667,
"grad_norm": 1.774445050351076,
"kl": 0.041748046875,
"learning_rate": 5.354166666666666e-07,
"loss": 0.0017,
"reward": 1.8998801708221436,
"reward_std": 0.025802936404943466,
"rewards/accuracy_reward": 0.9011821150779724,
"rewards/format_reward": 0.9986979365348816,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 110.80599212646484,
"epoch": 4.75,
"grad_norm": 1.9122212791313642,
"kl": 0.04150390625,
"learning_rate": 5.333333333333333e-07,
"loss": 0.0017,
"reward": 1.898808479309082,
"reward_std": 0.02279862016439438,
"rewards/accuracy_reward": 0.898808479309082,
"rewards/format_reward": 1.0,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 111.49609375,
"epoch": 4.770833333333333,
"grad_norm": 1.8595252737006025,
"kl": 0.0400390625,
"learning_rate": 5.3125e-07,
"loss": 0.0016,
"reward": 1.8906606435775757,
"reward_std": 0.021603485569357872,
"rewards/accuracy_reward": 0.8906607031822205,
"rewards/format_reward": 1.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 112.00260925292969,
"epoch": 4.791666666666667,
"grad_norm": 3.9925980009816278,
"kl": 0.04833984375,
"learning_rate": 5.291666666666666e-07,
"loss": 0.002,
"reward": 1.896075963973999,
"reward_std": 0.02797180414199829,
"rewards/accuracy_reward": 0.8973779678344727,
"rewards/format_reward": 0.9986979365348816,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 113.4609375,
"epoch": 4.8125,
"grad_norm": 3.455468575777609,
"kl": 0.052001953125,
"learning_rate": 5.270833333333333e-07,
"loss": 0.0021,
"reward": 1.9048645496368408,
"reward_std": 0.02417534589767456,
"rewards/accuracy_reward": 0.9048646688461304,
"rewards/format_reward": 1.0,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 112.59765625,
"epoch": 4.833333333333333,
"grad_norm": 1.5619005634713135,
"kl": 0.042236328125,
"learning_rate": 5.25e-07,
"loss": 0.0017,
"reward": 1.905022382736206,
"reward_std": 0.021135296672582626,
"rewards/accuracy_reward": 0.905022144317627,
"rewards/format_reward": 1.0,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 112.34245300292969,
"epoch": 4.854166666666667,
"grad_norm": 1.8685437866312475,
"kl": 0.048095703125,
"learning_rate": 5.229166666666667e-07,
"loss": 0.002,
"reward": 1.89357590675354,
"reward_std": 0.025921311229467392,
"rewards/accuracy_reward": 0.8948779106140137,
"rewards/format_reward": 0.9986979365348816,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 111.43620300292969,
"epoch": 4.875,
"grad_norm": 5.452442844003646,
"kl": 0.043212890625,
"learning_rate": 5.208333333333334e-07,
"loss": 0.0018,
"reward": 1.911612868309021,
"reward_std": 0.024873455986380577,
"rewards/accuracy_reward": 0.9129147529602051,
"rewards/format_reward": 0.9986979365348816,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 110.40495300292969,
"epoch": 4.895833333333333,
"grad_norm": 2.179930419928341,
"kl": 0.04296875,
"learning_rate": 5.1875e-07,
"loss": 0.0017,
"reward": 1.8822953701019287,
"reward_std": 0.027776187285780907,
"rewards/accuracy_reward": 0.8822951316833496,
"rewards/format_reward": 1.0,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 111.78646087646484,
"epoch": 4.916666666666667,
"grad_norm": 5.555743499581958,
"kl": 0.04931640625,
"learning_rate": 5.166666666666667e-07,
"loss": 0.0021,
"reward": 1.9162639379501343,
"reward_std": 0.021101072430610657,
"rewards/accuracy_reward": 0.9162638187408447,
"rewards/format_reward": 1.0,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 109.9765625,
"epoch": 4.9375,
"grad_norm": 1.457940457573027,
"kl": 0.03955078125,
"learning_rate": 5.145833333333332e-07,
"loss": 0.0016,
"reward": 1.9340026378631592,
"reward_std": 0.020655512809753418,
"rewards/accuracy_reward": 0.9340025782585144,
"rewards/format_reward": 1.0,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 109.7734375,
"epoch": 4.958333333333333,
"grad_norm": 1.4365694122319177,
"kl": 0.04345703125,
"learning_rate": 5.125e-07,
"loss": 0.0018,
"reward": 1.8954408168792725,
"reward_std": 0.02152765914797783,
"rewards/accuracy_reward": 0.8954406976699829,
"rewards/format_reward": 1.0,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 107.25367736816406,
"epoch": 4.979166666666667,
"grad_norm": 1.734243537392683,
"kl": 0.04248046875,
"learning_rate": 5.104166666666666e-07,
"loss": 0.0017,
"reward": 1.9107515811920166,
"reward_std": 0.018960019573569298,
"rewards/accuracy_reward": 0.9107515811920166,
"rewards/format_reward": 1.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 109.74739837646484,
"epoch": 5.020833333333333,
"grad_norm": 1.3870009697213672,
"kl": 0.04833984375,
"learning_rate": 5.083333333333333e-07,
"loss": 0.002,
"reward": 1.8969802856445312,
"reward_std": 0.029202213510870934,
"rewards/accuracy_reward": 0.8982824087142944,
"rewards/format_reward": 0.9986979365348816,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 107.33464050292969,
"epoch": 5.041666666666667,
"grad_norm": 1.6927816472717638,
"kl": 0.0439453125,
"learning_rate": 5.062499999999999e-07,
"loss": 0.0018,
"reward": 1.9044766426086426,
"reward_std": 0.02422555536031723,
"rewards/accuracy_reward": 0.9044766426086426,
"rewards/format_reward": 1.0,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 107.39714050292969,
"epoch": 5.0625,
"grad_norm": 2.3914909546196594,
"kl": 0.04248046875,
"learning_rate": 5.041666666666667e-07,
"loss": 0.0018,
"reward": 1.9420578479766846,
"reward_std": 0.020115545019507408,
"rewards/accuracy_reward": 0.9420577883720398,
"rewards/format_reward": 1.0,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 108.46224212646484,
"epoch": 5.083333333333333,
"grad_norm": 2.834307847073615,
"kl": 0.046630859375,
"learning_rate": 5.020833333333333e-07,
"loss": 0.0019,
"reward": 1.9152748584747314,
"reward_std": 0.02309068851172924,
"rewards/accuracy_reward": 0.9152747392654419,
"rewards/format_reward": 1.0,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 108.2265625,
"epoch": 5.104166666666667,
"grad_norm": 2.1094761559134816,
"kl": 0.04052734375,
"learning_rate": 5e-07,
"loss": 0.0017,
"reward": 1.8892682790756226,
"reward_std": 0.023487474769353867,
"rewards/accuracy_reward": 0.8892682790756226,
"rewards/format_reward": 1.0,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 107.31380462646484,
"epoch": 5.125,
"grad_norm": 1.4100911159096023,
"kl": 0.039306640625,
"learning_rate": 4.979166666666666e-07,
"loss": 0.0016,
"reward": 1.8915185928344727,
"reward_std": 0.023427218198776245,
"rewards/accuracy_reward": 0.891518771648407,
"rewards/format_reward": 1.0,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 108.84114837646484,
"epoch": 5.145833333333333,
"grad_norm": 1.6579016766504264,
"kl": 0.04052734375,
"learning_rate": 4.958333333333333e-07,
"loss": 0.0017,
"reward": 1.9029381275177002,
"reward_std": 0.023404449224472046,
"rewards/accuracy_reward": 0.9029380083084106,
"rewards/format_reward": 1.0,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 110.58333587646484,
"epoch": 5.166666666666667,
"grad_norm": 2.4190316509927547,
"kl": 0.0439453125,
"learning_rate": 4.9375e-07,
"loss": 0.0018,
"reward": 1.9102628231048584,
"reward_std": 0.02647389844059944,
"rewards/accuracy_reward": 0.9115647077560425,
"rewards/format_reward": 0.9986979365348816,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 110.34245300292969,
"epoch": 5.1875,
"grad_norm": 1.6788038142728927,
"kl": 0.0390625,
"learning_rate": 4.916666666666666e-07,
"loss": 0.0016,
"reward": 1.8926337957382202,
"reward_std": 0.022663813084363937,
"rewards/accuracy_reward": 0.8926336765289307,
"rewards/format_reward": 1.0,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 111.6640625,
"epoch": 5.208333333333333,
"grad_norm": 2.1053897766649934,
"kl": 0.041015625,
"learning_rate": 4.895833333333333e-07,
"loss": 0.0017,
"reward": 1.9033950567245483,
"reward_std": 0.024107707664370537,
"rewards/accuracy_reward": 0.9033951163291931,
"rewards/format_reward": 1.0,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 109.30599212646484,
"epoch": 5.229166666666667,
"grad_norm": 8.173004175170574,
"kl": 0.042236328125,
"learning_rate": 4.875e-07,
"loss": 0.0017,
"reward": 1.8888883590698242,
"reward_std": 0.028628483414649963,
"rewards/accuracy_reward": 0.8901904821395874,
"rewards/format_reward": 0.9986979365348816,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 108.86328125,
"epoch": 5.25,
"grad_norm": 2.028487749549242,
"kl": 0.044921875,
"learning_rate": 4.854166666666666e-07,
"loss": 0.0019,
"reward": 1.9012870788574219,
"reward_std": 0.02197723090648651,
"rewards/accuracy_reward": 0.9012872576713562,
"rewards/format_reward": 1.0,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 109.8125,
"epoch": 5.270833333333333,
"grad_norm": 2.0303516292175523,
"kl": 0.03955078125,
"learning_rate": 4.833333333333333e-07,
"loss": 0.0016,
"reward": 1.8865455389022827,
"reward_std": 0.025701235979795456,
"rewards/accuracy_reward": 0.8865455985069275,
"rewards/format_reward": 1.0,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 110.77604675292969,
"epoch": 5.291666666666667,
"grad_norm": 1.5986544250252577,
"kl": 0.040771484375,
"learning_rate": 4.812499999999999e-07,
"loss": 0.0017,
"reward": 1.8878264427185059,
"reward_std": 0.023908209055662155,
"rewards/accuracy_reward": 0.8878263831138611,
"rewards/format_reward": 1.0,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 110.72005462646484,
"epoch": 5.3125,
"grad_norm": 2.0564085146093825,
"kl": 0.04541015625,
"learning_rate": 4.791666666666667e-07,
"loss": 0.0019,
"reward": 1.9011871814727783,
"reward_std": 0.026990963146090508,
"rewards/accuracy_reward": 0.902489185333252,
"rewards/format_reward": 0.9986979365348816,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 110.52604675292969,
"epoch": 5.333333333333333,
"grad_norm": 2.637539554946258,
"kl": 0.046875,
"learning_rate": 4.770833333333334e-07,
"loss": 0.0019,
"reward": 1.904599666595459,
"reward_std": 0.02259230427443981,
"rewards/accuracy_reward": 0.9059017896652222,
"rewards/format_reward": 0.9986979365348816,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 109.51823425292969,
"epoch": 5.354166666666667,
"grad_norm": 2.0291093041211545,
"kl": 0.043701171875,
"learning_rate": 4.7499999999999995e-07,
"loss": 0.0018,
"reward": 1.9127092361450195,
"reward_std": 0.02183235064148903,
"rewards/accuracy_reward": 0.9127092361450195,
"rewards/format_reward": 1.0,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 109.5234375,
"epoch": 5.375,
"grad_norm": 4.505797454206583,
"kl": 0.04345703125,
"learning_rate": 4.7291666666666666e-07,
"loss": 0.0018,
"reward": 1.9092210531234741,
"reward_std": 0.021370170637965202,
"rewards/accuracy_reward": 0.9092210531234741,
"rewards/format_reward": 1.0,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 111.19661712646484,
"epoch": 5.395833333333333,
"grad_norm": 1.8902834610451218,
"kl": 0.044677734375,
"learning_rate": 4.708333333333333e-07,
"loss": 0.0019,
"reward": 1.917715311050415,
"reward_std": 0.018400993198156357,
"rewards/accuracy_reward": 0.9177150726318359,
"rewards/format_reward": 1.0,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 109.390625,
"epoch": 5.416666666666667,
"grad_norm": 2.563147653109146,
"kl": 0.0458984375,
"learning_rate": 4.6874999999999996e-07,
"loss": 0.0019,
"reward": 1.8932042121887207,
"reward_std": 0.021255169063806534,
"rewards/accuracy_reward": 0.8932042121887207,
"rewards/format_reward": 1.0,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 109.11067962646484,
"epoch": 5.4375,
"grad_norm": 4.520917096171478,
"kl": 0.042236328125,
"learning_rate": 4.6666666666666666e-07,
"loss": 0.0018,
"reward": 1.9107736349105835,
"reward_std": 0.020730838179588318,
"rewards/accuracy_reward": 0.910773515701294,
"rewards/format_reward": 1.0,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 108.9140625,
"epoch": 5.458333333333333,
"grad_norm": 2.9043788526253866,
"kl": 0.046875,
"learning_rate": 4.645833333333333e-07,
"loss": 0.0019,
"reward": 1.932241439819336,
"reward_std": 0.019916361197829247,
"rewards/accuracy_reward": 0.932241678237915,
"rewards/format_reward": 1.0,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 107.98698425292969,
"epoch": 5.479166666666667,
"grad_norm": 4.137371389224913,
"kl": 0.04443359375,
"learning_rate": 4.625e-07,
"loss": 0.0018,
"reward": 1.8949460983276367,
"reward_std": 0.021680889651179314,
"rewards/accuracy_reward": 0.8949460983276367,
"rewards/format_reward": 1.0,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 108.06120300292969,
"epoch": 5.5,
"grad_norm": 1.5956956245046428,
"kl": 0.046875,
"learning_rate": 4.604166666666666e-07,
"loss": 0.002,
"reward": 1.9305871725082397,
"reward_std": 0.016989264637231827,
"rewards/accuracy_reward": 0.9305871725082397,
"rewards/format_reward": 1.0,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 107.46875,
"epoch": 5.520833333333333,
"grad_norm": 2.3801100164721527,
"kl": 0.05126953125,
"learning_rate": 4.5833333333333327e-07,
"loss": 0.0021,
"reward": 1.9142370223999023,
"reward_std": 0.020726464688777924,
"rewards/accuracy_reward": 0.9142370223999023,
"rewards/format_reward": 1.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 108.06380462646484,
"epoch": 5.541666666666667,
"grad_norm": 1.7313384895006823,
"kl": 0.051025390625,
"learning_rate": 4.5624999999999997e-07,
"loss": 0.0021,
"reward": 1.8905744552612305,
"reward_std": 0.02504381351172924,
"rewards/accuracy_reward": 0.8905746340751648,
"rewards/format_reward": 1.0,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 108.88151550292969,
"epoch": 5.5625,
"grad_norm": 1.5486364631929457,
"kl": 0.046875,
"learning_rate": 4.541666666666666e-07,
"loss": 0.002,
"reward": 1.8817743062973022,
"reward_std": 0.02400146797299385,
"rewards/accuracy_reward": 0.8817743062973022,
"rewards/format_reward": 1.0,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 109.19921875,
"epoch": 5.583333333333333,
"grad_norm": 1.9468347826097554,
"kl": 0.0498046875,
"learning_rate": 4.5208333333333333e-07,
"loss": 0.0021,
"reward": 1.9123249053955078,
"reward_std": 0.02088339626789093,
"rewards/accuracy_reward": 0.9123249053955078,
"rewards/format_reward": 1.0,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 108.50651550292969,
"epoch": 5.604166666666667,
"grad_norm": 1.8134017920844003,
"kl": 0.050537109375,
"learning_rate": 4.5e-07,
"loss": 0.002,
"reward": 1.9056270122528076,
"reward_std": 0.020590659230947495,
"rewards/accuracy_reward": 0.9056269526481628,
"rewards/format_reward": 1.0,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 110.43359375,
"epoch": 5.625,
"grad_norm": 2.097289435304608,
"kl": 0.058837890625,
"learning_rate": 4.479166666666667e-07,
"loss": 0.0024,
"reward": 1.8596689701080322,
"reward_std": 0.026524469256401062,
"rewards/accuracy_reward": 0.8609709143638611,
"rewards/format_reward": 0.9986979365348816,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 107.8515625,
"epoch": 5.645833333333333,
"grad_norm": 2.9899940677112995,
"kl": 0.0556640625,
"learning_rate": 4.4583333333333334e-07,
"loss": 0.0023,
"reward": 1.891918420791626,
"reward_std": 0.02338644489645958,
"rewards/accuracy_reward": 0.8919183611869812,
"rewards/format_reward": 1.0,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 106.14714050292969,
"epoch": 5.666666666666667,
"grad_norm": 1.5447722254023164,
"kl": 0.052734375,
"learning_rate": 4.4374999999999993e-07,
"loss": 0.0022,
"reward": 1.8943192958831787,
"reward_std": 0.021093344315886497,
"rewards/accuracy_reward": 0.8943192362785339,
"rewards/format_reward": 1.0,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 109.02083587646484,
"epoch": 5.6875,
"grad_norm": 1.7252851771135564,
"kl": 0.05419921875,
"learning_rate": 4.4166666666666664e-07,
"loss": 0.0023,
"reward": 1.8994736671447754,
"reward_std": 0.025407809764146805,
"rewards/accuracy_reward": 0.9007757902145386,
"rewards/format_reward": 0.9986979365348816,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 107.19792175292969,
"epoch": 5.708333333333333,
"grad_norm": 1.4821607784380106,
"kl": 0.05810546875,
"learning_rate": 4.395833333333333e-07,
"loss": 0.0025,
"reward": 1.9176604747772217,
"reward_std": 0.019861234351992607,
"rewards/accuracy_reward": 0.9176604151725769,
"rewards/format_reward": 1.0,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 106.72265625,
"epoch": 5.729166666666667,
"grad_norm": 6.345280883170612,
"kl": 0.052734375,
"learning_rate": 4.375e-07,
"loss": 0.0022,
"reward": 1.9127343893051147,
"reward_std": 0.019745318219065666,
"rewards/accuracy_reward": 0.9127345085144043,
"rewards/format_reward": 1.0,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 106.1640625,
"epoch": 5.75,
"grad_norm": 1.8167510177139423,
"kl": 0.052734375,
"learning_rate": 4.3541666666666664e-07,
"loss": 0.0022,
"reward": 1.9143961668014526,
"reward_std": 0.021544938907027245,
"rewards/accuracy_reward": 0.9143962264060974,
"rewards/format_reward": 1.0,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 106.9296875,
"epoch": 5.770833333333333,
"grad_norm": 2.4532452869571566,
"kl": 0.05322265625,
"learning_rate": 4.3333333333333335e-07,
"loss": 0.0021,
"reward": 1.8847317695617676,
"reward_std": 0.02402741275727749,
"rewards/accuracy_reward": 0.8847318887710571,
"rewards/format_reward": 1.0,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 106.16146087646484,
"epoch": 5.791666666666667,
"grad_norm": 4.133102609868267,
"kl": 0.04541015625,
"learning_rate": 4.3125e-07,
"loss": 0.0019,
"reward": 1.9220575094223022,
"reward_std": 0.01720447652041912,
"rewards/accuracy_reward": 0.9220575094223022,
"rewards/format_reward": 1.0,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 104.95573425292969,
"epoch": 5.8125,
"grad_norm": 7.211708520736478,
"kl": 0.04541015625,
"learning_rate": 4.291666666666666e-07,
"loss": 0.0019,
"reward": 1.9195680618286133,
"reward_std": 0.019044464454054832,
"rewards/accuracy_reward": 0.9195680618286133,
"rewards/format_reward": 1.0,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 105.79427337646484,
"epoch": 5.833333333333333,
"grad_norm": 2.5193114976640394,
"kl": 0.047607421875,
"learning_rate": 4.270833333333333e-07,
"loss": 0.002,
"reward": 1.9032373428344727,
"reward_std": 0.01688932441174984,
"rewards/accuracy_reward": 0.903237521648407,
"rewards/format_reward": 1.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 106.97135925292969,
"epoch": 5.854166666666667,
"grad_norm": 1.922113400060024,
"kl": 0.046875,
"learning_rate": 4.2499999999999995e-07,
"loss": 0.002,
"reward": 1.8878214359283447,
"reward_std": 0.023371964693069458,
"rewards/accuracy_reward": 0.8878213763237,
"rewards/format_reward": 1.0,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 107.08073425292969,
"epoch": 5.875,
"grad_norm": 1.786331439327784,
"kl": 0.04541015625,
"learning_rate": 4.2291666666666666e-07,
"loss": 0.0019,
"reward": 1.9193141460418701,
"reward_std": 0.02163059636950493,
"rewards/accuracy_reward": 0.9206160306930542,
"rewards/format_reward": 0.9986979365348816,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 106.82421875,
"epoch": 5.895833333333333,
"grad_norm": 4.285948048806076,
"kl": 0.0419921875,
"learning_rate": 4.208333333333333e-07,
"loss": 0.0017,
"reward": 1.915520429611206,
"reward_std": 0.021722108125686646,
"rewards/accuracy_reward": 0.9155203700065613,
"rewards/format_reward": 1.0,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 106.68229675292969,
"epoch": 5.916666666666667,
"grad_norm": 2.1862985744626102,
"kl": 0.045166015625,
"learning_rate": 4.1875e-07,
"loss": 0.0019,
"reward": 1.8867233991622925,
"reward_std": 0.022505465894937515,
"rewards/accuracy_reward": 0.8867233991622925,
"rewards/format_reward": 1.0,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 108.41536712646484,
"epoch": 5.9375,
"grad_norm": 2.6822943568035655,
"kl": 0.050537109375,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.0021,
"reward": 1.9038598537445068,
"reward_std": 0.02783789113163948,
"rewards/accuracy_reward": 0.9051617980003357,
"rewards/format_reward": 0.9986979365348816,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 110.13542175292969,
"epoch": 5.958333333333333,
"grad_norm": 1.8215228571783213,
"kl": 0.0498046875,
"learning_rate": 4.145833333333333e-07,
"loss": 0.0021,
"reward": 1.9209411144256592,
"reward_std": 0.020260518416762352,
"rewards/accuracy_reward": 0.9209408760070801,
"rewards/format_reward": 1.0,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 111.25901794433594,
"epoch": 5.979166666666667,
"grad_norm": 1.7303647275137022,
"kl": 0.04931640625,
"learning_rate": 4.1249999999999997e-07,
"loss": 0.0021,
"reward": 1.8940850496292114,
"reward_std": 0.02454877458512783,
"rewards/accuracy_reward": 0.8954200744628906,
"rewards/format_reward": 0.998664915561676,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 109.49870300292969,
"epoch": 6.020833333333333,
"grad_norm": 2.065626526970926,
"kl": 0.045654296875,
"learning_rate": 4.104166666666666e-07,
"loss": 0.0019,
"reward": 1.8828470706939697,
"reward_std": 0.023679915815591812,
"rewards/accuracy_reward": 0.8841490745544434,
"rewards/format_reward": 0.9986979365348816,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 108.63021087646484,
"epoch": 6.041666666666667,
"grad_norm": 2.407620241107693,
"kl": 0.044189453125,
"learning_rate": 4.083333333333333e-07,
"loss": 0.0018,
"reward": 1.922331690788269,
"reward_std": 0.02023524045944214,
"rewards/accuracy_reward": 0.9223315715789795,
"rewards/format_reward": 1.0,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 109.02734375,
"epoch": 6.0625,
"grad_norm": 2.583573023564423,
"kl": 0.043212890625,
"learning_rate": 4.0625e-07,
"loss": 0.0018,
"reward": 1.9238051176071167,
"reward_std": 0.01875409483909607,
"rewards/accuracy_reward": 0.9238051176071167,
"rewards/format_reward": 1.0,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 110.75130462646484,
"epoch": 6.083333333333333,
"grad_norm": 1.5921015904802127,
"kl": 0.04736328125,
"learning_rate": 4.041666666666667e-07,
"loss": 0.0019,
"reward": 1.90175199508667,
"reward_std": 0.01868622750043869,
"rewards/accuracy_reward": 0.9017519950866699,
"rewards/format_reward": 1.0,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 110.14453125,
"epoch": 6.104166666666667,
"grad_norm": 2.063415833551645,
"kl": 0.04638671875,
"learning_rate": 4.0208333333333333e-07,
"loss": 0.0019,
"reward": 1.9288432598114014,
"reward_std": 0.017594818025827408,
"rewards/accuracy_reward": 0.9288431406021118,
"rewards/format_reward": 1.0,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 109.52995300292969,
"epoch": 6.125,
"grad_norm": 2.6980955335354597,
"kl": 0.043212890625,
"learning_rate": 4e-07,
"loss": 0.0018,
"reward": 1.9106167554855347,
"reward_std": 0.020342741161584854,
"rewards/accuracy_reward": 0.9106166362762451,
"rewards/format_reward": 1.0,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 109.89583587646484,
"epoch": 6.145833333333333,
"grad_norm": 1.7057280834613109,
"kl": 0.041259765625,
"learning_rate": 3.9791666666666663e-07,
"loss": 0.0017,
"reward": 1.9230403900146484,
"reward_std": 0.018507663160562515,
"rewards/accuracy_reward": 0.9230403900146484,
"rewards/format_reward": 1.0,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 109.31640625,
"epoch": 6.166666666666667,
"grad_norm": 1.7481050352203042,
"kl": 0.04296875,
"learning_rate": 3.958333333333333e-07,
"loss": 0.0018,
"reward": 1.8892457485198975,
"reward_std": 0.025482675060629845,
"rewards/accuracy_reward": 0.8905477523803711,
"rewards/format_reward": 0.9986979365348816,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 109.57552337646484,
"epoch": 6.1875,
"grad_norm": 1.5790651747348239,
"kl": 0.045654296875,
"learning_rate": 3.9375e-07,
"loss": 0.0019,
"reward": 1.9172825813293457,
"reward_std": 0.01870821975171566,
"rewards/accuracy_reward": 0.9172827005386353,
"rewards/format_reward": 1.0,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 110.67708587646484,
"epoch": 6.208333333333333,
"grad_norm": 5.985924437288418,
"kl": 0.0458984375,
"learning_rate": 3.9166666666666664e-07,
"loss": 0.0019,
"reward": 1.9062340259552002,
"reward_std": 0.017811615020036697,
"rewards/accuracy_reward": 0.9062339663505554,
"rewards/format_reward": 1.0,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 108.33854675292969,
"epoch": 6.229166666666667,
"grad_norm": 29.636849516000904,
"kl": 0.046630859375,
"learning_rate": 3.8958333333333334e-07,
"loss": 0.0019,
"reward": 1.9097627401351929,
"reward_std": 0.025265123695135117,
"rewards/accuracy_reward": 0.9110648036003113,
"rewards/format_reward": 0.9986979365348816,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 109.21745300292969,
"epoch": 6.25,
"grad_norm": 2.499873249076194,
"kl": 0.052001953125,
"learning_rate": 3.875e-07,
"loss": 0.0021,
"reward": 1.908249855041504,
"reward_std": 0.021753787994384766,
"rewards/accuracy_reward": 0.9082497358322144,
"rewards/format_reward": 1.0,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 107.96875,
"epoch": 6.270833333333333,
"grad_norm": 1.43250838119596,
"kl": 0.0478515625,
"learning_rate": 3.8541666666666665e-07,
"loss": 0.002,
"reward": 1.9135513305664062,
"reward_std": 0.02083742991089821,
"rewards/accuracy_reward": 0.9148534536361694,
"rewards/format_reward": 0.9986979365348816,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 109.49479675292969,
"epoch": 6.291666666666667,
"grad_norm": 1.9939986843684863,
"kl": 0.046630859375,
"learning_rate": 3.8333333333333335e-07,
"loss": 0.002,
"reward": 1.9211857318878174,
"reward_std": 0.019976306706666946,
"rewards/accuracy_reward": 0.9211856722831726,
"rewards/format_reward": 1.0,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 108.37239837646484,
"epoch": 6.3125,
"grad_norm": 1.8430355626574486,
"kl": 0.047607421875,
"learning_rate": 3.8124999999999995e-07,
"loss": 0.002,
"reward": 1.9055722951889038,
"reward_std": 0.020207837224006653,
"rewards/accuracy_reward": 0.9055722951889038,
"rewards/format_reward": 1.0,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 109.99870300292969,
"epoch": 6.333333333333333,
"grad_norm": 4.534254461924897,
"kl": 0.044921875,
"learning_rate": 3.7916666666666665e-07,
"loss": 0.0019,
"reward": 1.8959176540374756,
"reward_std": 0.018412087112665176,
"rewards/accuracy_reward": 0.8959175944328308,
"rewards/format_reward": 1.0,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 109.14714050292969,
"epoch": 6.354166666666667,
"grad_norm": 1.5417912395172437,
"kl": 0.04638671875,
"learning_rate": 3.770833333333333e-07,
"loss": 0.002,
"reward": 1.9178366661071777,
"reward_std": 0.01834617182612419,
"rewards/accuracy_reward": 0.9178365468978882,
"rewards/format_reward": 1.0,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 107.67448425292969,
"epoch": 6.375,
"grad_norm": 10.381006533833029,
"kl": 0.043701171875,
"learning_rate": 3.75e-07,
"loss": 0.0019,
"reward": 1.9170053005218506,
"reward_std": 0.020167209208011627,
"rewards/accuracy_reward": 0.9183073043823242,
"rewards/format_reward": 0.9986979365348816,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 108.34635925292969,
"epoch": 6.395833333333333,
"grad_norm": 3.2103701478375366,
"kl": 0.046630859375,
"learning_rate": 3.7291666666666666e-07,
"loss": 0.0019,
"reward": 1.9029948711395264,
"reward_std": 0.019157804548740387,
"rewards/accuracy_reward": 0.9029948711395264,
"rewards/format_reward": 1.0,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 108.34114837646484,
"epoch": 6.416666666666667,
"grad_norm": 2.1215276638392253,
"kl": 0.051025390625,
"learning_rate": 3.708333333333333e-07,
"loss": 0.0021,
"reward": 1.9299815893173218,
"reward_std": 0.01765059307217598,
"rewards/accuracy_reward": 0.9299815893173218,
"rewards/format_reward": 1.0,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 109.96354675292969,
"epoch": 6.4375,
"grad_norm": 2.0251198297391815,
"kl": 0.04736328125,
"learning_rate": 3.6875e-07,
"loss": 0.002,
"reward": 1.901715636253357,
"reward_std": 0.024447208270430565,
"rewards/accuracy_reward": 0.9030176401138306,
"rewards/format_reward": 0.9986979365348816,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 109.21224212646484,
"epoch": 6.458333333333333,
"grad_norm": 5.5850053870802645,
"kl": 0.04345703125,
"learning_rate": 3.666666666666666e-07,
"loss": 0.0018,
"reward": 1.9226205348968506,
"reward_std": 0.01778128370642662,
"rewards/accuracy_reward": 0.9226205348968506,
"rewards/format_reward": 1.0,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 108.30989837646484,
"epoch": 6.479166666666667,
"grad_norm": 2.1418335808540183,
"kl": 0.045654296875,
"learning_rate": 3.645833333333333e-07,
"loss": 0.0019,
"reward": 1.9199604988098145,
"reward_std": 0.018466424196958542,
"rewards/accuracy_reward": 0.919960618019104,
"rewards/format_reward": 1.0,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 108.81771087646484,
"epoch": 6.5,
"grad_norm": 1.5934526555915005,
"kl": 0.051513671875,
"learning_rate": 3.6249999999999997e-07,
"loss": 0.0021,
"reward": 1.9024890661239624,
"reward_std": 0.018778668716549873,
"rewards/accuracy_reward": 0.9024890661239624,
"rewards/format_reward": 1.0,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 108.77474212646484,
"epoch": 6.520833333333333,
"grad_norm": 9.90923642720643,
"kl": 0.04736328125,
"learning_rate": 3.604166666666666e-07,
"loss": 0.002,
"reward": 1.9020146131515503,
"reward_std": 0.021591586992144585,
"rewards/accuracy_reward": 0.9033166766166687,
"rewards/format_reward": 0.9986979365348816,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 109.80859375,
"epoch": 6.541666666666667,
"grad_norm": 2.2902946965667996,
"kl": 0.040771484375,
"learning_rate": 3.583333333333333e-07,
"loss": 0.0017,
"reward": 1.9083236455917358,
"reward_std": 0.020869677886366844,
"rewards/accuracy_reward": 0.9083236455917358,
"rewards/format_reward": 1.0,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 107.97135925292969,
"epoch": 6.5625,
"grad_norm": 2.141619909298115,
"kl": 0.04443359375,
"learning_rate": 3.5625e-07,
"loss": 0.0019,
"reward": 1.895308494567871,
"reward_std": 0.021585416048765182,
"rewards/accuracy_reward": 0.8966106176376343,
"rewards/format_reward": 0.9986979365348816,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 107.83984375,
"epoch": 6.583333333333333,
"grad_norm": 1.722945461090862,
"kl": 0.052490234375,
"learning_rate": 3.541666666666667e-07,
"loss": 0.0022,
"reward": 1.9142106771469116,
"reward_std": 0.022508492693305016,
"rewards/accuracy_reward": 0.91551274061203,
"rewards/format_reward": 0.9986979365348816,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 107.95703125,
"epoch": 6.604166666666667,
"grad_norm": 3.6778187227400396,
"kl": 0.0498046875,
"learning_rate": 3.5208333333333333e-07,
"loss": 0.0021,
"reward": 1.8938100337982178,
"reward_std": 0.024937432259321213,
"rewards/accuracy_reward": 0.8951120376586914,
"rewards/format_reward": 0.9986979365348816,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 107.52995300292969,
"epoch": 6.625,
"grad_norm": 2.535477979468292,
"kl": 0.0634765625,
"learning_rate": 3.5e-07,
"loss": 0.0026,
"reward": 1.900233268737793,
"reward_std": 0.01887938380241394,
"rewards/accuracy_reward": 0.9002333879470825,
"rewards/format_reward": 1.0,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 106.04817962646484,
"epoch": 6.645833333333333,
"grad_norm": 2.288272855129929,
"kl": 0.045654296875,
"learning_rate": 3.4791666666666664e-07,
"loss": 0.0019,
"reward": 1.912428855895996,
"reward_std": 0.020974930375814438,
"rewards/accuracy_reward": 0.9124290347099304,
"rewards/format_reward": 1.0,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 108.02864837646484,
"epoch": 6.666666666666667,
"grad_norm": 2.0183407375033413,
"kl": 0.04736328125,
"learning_rate": 3.458333333333333e-07,
"loss": 0.0019,
"reward": 1.9255565404891968,
"reward_std": 0.01740911416709423,
"rewards/accuracy_reward": 0.9255565404891968,
"rewards/format_reward": 1.0,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 105.26302337646484,
"epoch": 6.6875,
"grad_norm": 2.4818346839800007,
"kl": 0.04736328125,
"learning_rate": 3.4375e-07,
"loss": 0.0019,
"reward": 1.882767915725708,
"reward_std": 0.024225857108831406,
"rewards/accuracy_reward": 0.8827678561210632,
"rewards/format_reward": 1.0,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 107.06771087646484,
"epoch": 6.708333333333333,
"grad_norm": 1.4343972340076592,
"kl": 0.05322265625,
"learning_rate": 3.4166666666666664e-07,
"loss": 0.0022,
"reward": 1.9222266674041748,
"reward_std": 0.015834566205739975,
"rewards/accuracy_reward": 0.92222660779953,
"rewards/format_reward": 1.0,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 108.90234375,
"epoch": 6.729166666666667,
"grad_norm": 1.852771798485069,
"kl": 0.0517578125,
"learning_rate": 3.3958333333333335e-07,
"loss": 0.0022,
"reward": 1.9179143905639648,
"reward_std": 0.02144519053399563,
"rewards/accuracy_reward": 0.9192163944244385,
"rewards/format_reward": 0.9986979365348816,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 109.6328125,
"epoch": 6.75,
"grad_norm": 3.6471691567389057,
"kl": 0.054443359375,
"learning_rate": 3.375e-07,
"loss": 0.0022,
"reward": 1.909096598625183,
"reward_std": 0.01967495307326317,
"rewards/accuracy_reward": 0.9090965986251831,
"rewards/format_reward": 1.0,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 107.61458587646484,
"epoch": 6.770833333333333,
"grad_norm": 4.178729179494792,
"kl": 0.047607421875,
"learning_rate": 3.3541666666666665e-07,
"loss": 0.002,
"reward": 1.923593521118164,
"reward_std": 0.016256026923656464,
"rewards/accuracy_reward": 0.9235934019088745,
"rewards/format_reward": 1.0,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 111.46875,
"epoch": 6.791666666666667,
"grad_norm": 3.850110305328207,
"kl": 0.05224609375,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0022,
"reward": 1.8894399404525757,
"reward_std": 0.0215081088244915,
"rewards/accuracy_reward": 0.8894399404525757,
"rewards/format_reward": 1.0,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 108.98567962646484,
"epoch": 6.8125,
"grad_norm": 1.5295790546923704,
"kl": 0.049560546875,
"learning_rate": 3.3124999999999995e-07,
"loss": 0.0021,
"reward": 1.902695894241333,
"reward_std": 0.022552501410245895,
"rewards/accuracy_reward": 0.9039978384971619,
"rewards/format_reward": 0.9986979365348816,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 110.2109375,
"epoch": 6.833333333333333,
"grad_norm": 2.177500326990495,
"kl": 0.049560546875,
"learning_rate": 3.2916666666666666e-07,
"loss": 0.0021,
"reward": 1.9026316404342651,
"reward_std": 0.028222566470503807,
"rewards/accuracy_reward": 0.9039337038993835,
"rewards/format_reward": 0.9986979365348816,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 110.26823425292969,
"epoch": 6.854166666666667,
"grad_norm": 2.25656186299227,
"kl": 0.049072265625,
"learning_rate": 3.270833333333333e-07,
"loss": 0.002,
"reward": 1.9238814115524292,
"reward_std": 0.01956191472709179,
"rewards/accuracy_reward": 0.9238814115524292,
"rewards/format_reward": 1.0,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 109.48177337646484,
"epoch": 6.875,
"grad_norm": 1.962758450391304,
"kl": 0.044677734375,
"learning_rate": 3.25e-07,
"loss": 0.0019,
"reward": 1.902562141418457,
"reward_std": 0.016970310360193253,
"rewards/accuracy_reward": 0.902562141418457,
"rewards/format_reward": 1.0,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 111.42317962646484,
"epoch": 6.895833333333333,
"grad_norm": 1.932226827561351,
"kl": 0.052978515625,
"learning_rate": 3.2291666666666666e-07,
"loss": 0.0022,
"reward": 1.9169180393218994,
"reward_std": 0.01541107427328825,
"rewards/accuracy_reward": 0.9169179797172546,
"rewards/format_reward": 1.0,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 110.95442962646484,
"epoch": 6.916666666666667,
"grad_norm": 2.123672528629966,
"kl": 0.0546875,
"learning_rate": 3.2083333333333337e-07,
"loss": 0.0023,
"reward": 1.9134833812713623,
"reward_std": 0.018955400213599205,
"rewards/accuracy_reward": 0.9134833812713623,
"rewards/format_reward": 1.0,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 112.23177337646484,
"epoch": 6.9375,
"grad_norm": 2.7379081437953436,
"kl": 0.04736328125,
"learning_rate": 3.1874999999999997e-07,
"loss": 0.0019,
"reward": 1.8997161388397217,
"reward_std": 0.019093122333288193,
"rewards/accuracy_reward": 0.8997160792350769,
"rewards/format_reward": 1.0,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 109.70052337646484,
"epoch": 6.958333333333333,
"grad_norm": 2.033398718348136,
"kl": 0.051025390625,
"learning_rate": 3.166666666666666e-07,
"loss": 0.0021,
"reward": 1.9058278799057007,
"reward_std": 0.020390968769788742,
"rewards/accuracy_reward": 0.9058279991149902,
"rewards/format_reward": 1.0,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 110.6568832397461,
"epoch": 6.979166666666667,
"grad_norm": 4.2350616184619625,
"kl": 0.0478515625,
"learning_rate": 3.145833333333333e-07,
"loss": 0.002,
"reward": 1.9187196493148804,
"reward_std": 0.019620845094323158,
"rewards/accuracy_reward": 0.9187195301055908,
"rewards/format_reward": 1.0,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 107.58984375,
"epoch": 7.020833333333333,
"grad_norm": 1.3225122948151316,
"kl": 0.04833984375,
"learning_rate": 3.1249999999999997e-07,
"loss": 0.002,
"reward": 1.904463768005371,
"reward_std": 0.016730796545743942,
"rewards/accuracy_reward": 0.9044637680053711,
"rewards/format_reward": 1.0,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 109.29036712646484,
"epoch": 7.041666666666667,
"grad_norm": 2.0001558004119135,
"kl": 0.04931640625,
"learning_rate": 3.104166666666667e-07,
"loss": 0.002,
"reward": 1.9282145500183105,
"reward_std": 0.020631009712815285,
"rewards/accuracy_reward": 0.9282145500183105,
"rewards/format_reward": 1.0,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 107.10026550292969,
"epoch": 7.0625,
"grad_norm": 1.7677360413399623,
"kl": 0.046142578125,
"learning_rate": 3.0833333333333333e-07,
"loss": 0.0019,
"reward": 1.913468360900879,
"reward_std": 0.01975328102707863,
"rewards/accuracy_reward": 0.9134685397148132,
"rewards/format_reward": 1.0,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 106.64583587646484,
"epoch": 7.083333333333333,
"grad_norm": 1.5863979383855535,
"kl": 0.044921875,
"learning_rate": 3.0625000000000003e-07,
"loss": 0.0019,
"reward": 1.9101800918579102,
"reward_std": 0.018298618495464325,
"rewards/accuracy_reward": 0.9101800918579102,
"rewards/format_reward": 1.0,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 106.95573425292969,
"epoch": 7.104166666666667,
"grad_norm": 1.7406147667004834,
"kl": 0.04296875,
"learning_rate": 3.0416666666666663e-07,
"loss": 0.0017,
"reward": 1.9128578901290894,
"reward_std": 0.025313010439276695,
"rewards/accuracy_reward": 0.9141599535942078,
"rewards/format_reward": 0.9986979365348816,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 107.0390625,
"epoch": 7.125,
"grad_norm": 6.234665754184476,
"kl": 0.046875,
"learning_rate": 3.020833333333333e-07,
"loss": 0.0019,
"reward": 1.8841427564620972,
"reward_std": 0.020634343847632408,
"rewards/accuracy_reward": 0.8841428160667419,
"rewards/format_reward": 1.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 106.65104675292969,
"epoch": 7.145833333333333,
"grad_norm": 2.942912834547579,
"kl": 0.053955078125,
"learning_rate": 3e-07,
"loss": 0.0022,
"reward": 1.9155168533325195,
"reward_std": 0.019924897700548172,
"rewards/accuracy_reward": 0.9155170321464539,
"rewards/format_reward": 1.0,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 106.85026550292969,
"epoch": 7.166666666666667,
"grad_norm": 1.658655338447552,
"kl": 0.048583984375,
"learning_rate": 2.9791666666666664e-07,
"loss": 0.002,
"reward": 1.9088094234466553,
"reward_std": 0.025105763226747513,
"rewards/accuracy_reward": 0.9101114273071289,
"rewards/format_reward": 0.9986979365348816,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 106.1953125,
"epoch": 7.1875,
"grad_norm": 2.4406244402139365,
"kl": 0.04833984375,
"learning_rate": 2.9583333333333334e-07,
"loss": 0.002,
"reward": 1.9100263118743896,
"reward_std": 0.02519826404750347,
"rewards/accuracy_reward": 0.9113283157348633,
"rewards/format_reward": 0.9986979365348816,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 105.19140625,
"epoch": 7.208333333333333,
"grad_norm": 4.271413410927959,
"kl": 0.048828125,
"learning_rate": 2.9375e-07,
"loss": 0.002,
"reward": 1.9093725681304932,
"reward_std": 0.01967555098235607,
"rewards/accuracy_reward": 0.9093725085258484,
"rewards/format_reward": 1.0,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 104.4140625,
"epoch": 7.229166666666667,
"grad_norm": 2.3763872048797414,
"kl": 0.0654296875,
"learning_rate": 2.916666666666667e-07,
"loss": 0.0027,
"reward": 1.90401029586792,
"reward_std": 0.023377878591418266,
"rewards/accuracy_reward": 0.9053124189376831,
"rewards/format_reward": 0.9986979365348816,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 105.7265625,
"epoch": 7.25,
"grad_norm": 2.806628637706787,
"kl": 0.04443359375,
"learning_rate": 2.8958333333333335e-07,
"loss": 0.0019,
"reward": 1.8880078792572021,
"reward_std": 0.023502841591835022,
"rewards/accuracy_reward": 0.8880078196525574,
"rewards/format_reward": 1.0,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 108.31510925292969,
"epoch": 7.270833333333333,
"grad_norm": 3.43433922412685,
"kl": 0.0888671875,
"learning_rate": 2.8749999999999995e-07,
"loss": 0.0036,
"reward": 1.9123291969299316,
"reward_std": 0.020550193265080452,
"rewards/accuracy_reward": 0.9123293161392212,
"rewards/format_reward": 1.0,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 103.36589050292969,
"epoch": 7.291666666666667,
"grad_norm": 2.842653921619938,
"kl": 0.046142578125,
"learning_rate": 2.8541666666666665e-07,
"loss": 0.0019,
"reward": 1.9058549404144287,
"reward_std": 0.019686056300997734,
"rewards/accuracy_reward": 0.9058548808097839,
"rewards/format_reward": 1.0,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 105.89974212646484,
"epoch": 7.3125,
"grad_norm": 2.0405865834971473,
"kl": 0.046630859375,
"learning_rate": 2.833333333333333e-07,
"loss": 0.0019,
"reward": 1.9217561483383179,
"reward_std": 0.018797121942043304,
"rewards/accuracy_reward": 0.9217562675476074,
"rewards/format_reward": 1.0,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 106.70182800292969,
"epoch": 7.333333333333333,
"grad_norm": 1.9486921558633905,
"kl": 0.0458984375,
"learning_rate": 2.8125e-07,
"loss": 0.0019,
"reward": 1.8922131061553955,
"reward_std": 0.02469293400645256,
"rewards/accuracy_reward": 0.892212986946106,
"rewards/format_reward": 1.0,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 107.19140625,
"epoch": 7.354166666666667,
"grad_norm": 2.5259874539949303,
"kl": 0.045654296875,
"learning_rate": 2.7916666666666666e-07,
"loss": 0.0019,
"reward": 1.905487060546875,
"reward_std": 0.023547440767288208,
"rewards/accuracy_reward": 0.905487060546875,
"rewards/format_reward": 1.0,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 104.25130462646484,
"epoch": 7.375,
"grad_norm": 6.590006203990273,
"kl": 0.055908203125,
"learning_rate": 2.770833333333333e-07,
"loss": 0.0023,
"reward": 1.931574821472168,
"reward_std": 0.019262373447418213,
"rewards/accuracy_reward": 0.9315750002861023,
"rewards/format_reward": 1.0,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 105.35677337646484,
"epoch": 7.395833333333333,
"grad_norm": 4.321196737881709,
"kl": 0.049560546875,
"learning_rate": 2.75e-07,
"loss": 0.0021,
"reward": 1.9387693405151367,
"reward_std": 0.017124010249972343,
"rewards/accuracy_reward": 0.9387692213058472,
"rewards/format_reward": 1.0,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 105.671875,
"epoch": 7.416666666666667,
"grad_norm": 2.8457346329200557,
"kl": 0.0458984375,
"learning_rate": 2.729166666666666e-07,
"loss": 0.0019,
"reward": 1.9095313549041748,
"reward_std": 0.021084271371364594,
"rewards/accuracy_reward": 0.90953129529953,
"rewards/format_reward": 1.0,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 105.71224212646484,
"epoch": 7.4375,
"grad_norm": 1.993210730812182,
"kl": 0.046875,
"learning_rate": 2.708333333333333e-07,
"loss": 0.0019,
"reward": 1.9209964275360107,
"reward_std": 0.020304953679442406,
"rewards/accuracy_reward": 0.9209963083267212,
"rewards/format_reward": 1.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 105.25521087646484,
"epoch": 7.458333333333333,
"grad_norm": 2.4373953511615856,
"kl": 0.0458984375,
"learning_rate": 2.6874999999999997e-07,
"loss": 0.0019,
"reward": 1.9061431884765625,
"reward_std": 0.01994149014353752,
"rewards/accuracy_reward": 0.906143069267273,
"rewards/format_reward": 1.0,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 106.44140625,
"epoch": 7.479166666666667,
"grad_norm": 3.7778106185692635,
"kl": 0.053466796875,
"learning_rate": 2.6666666666666667e-07,
"loss": 0.0022,
"reward": 1.9190186262130737,
"reward_std": 0.020440340042114258,
"rewards/accuracy_reward": 0.9190186262130737,
"rewards/format_reward": 1.0,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 106.86589050292969,
"epoch": 7.5,
"grad_norm": 2.376594210526858,
"kl": 0.047119140625,
"learning_rate": 2.645833333333333e-07,
"loss": 0.002,
"reward": 1.9218724966049194,
"reward_std": 0.02028195932507515,
"rewards/accuracy_reward": 0.9218723773956299,
"rewards/format_reward": 1.0,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 105.45052337646484,
"epoch": 7.520833333333333,
"grad_norm": 1.7750680835812318,
"kl": 0.048583984375,
"learning_rate": 2.625e-07,
"loss": 0.002,
"reward": 1.90578293800354,
"reward_std": 0.018642796203494072,
"rewards/accuracy_reward": 0.9057828187942505,
"rewards/format_reward": 1.0,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 107.1484375,
"epoch": 7.541666666666667,
"grad_norm": 2.022098717469963,
"kl": 0.04833984375,
"learning_rate": 2.604166666666667e-07,
"loss": 0.002,
"reward": 1.895311713218689,
"reward_std": 0.024539759382605553,
"rewards/accuracy_reward": 0.8966139554977417,
"rewards/format_reward": 0.9986979365348816,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 105.81380462646484,
"epoch": 7.5625,
"grad_norm": 4.701010132350987,
"kl": 0.053466796875,
"learning_rate": 2.5833333333333333e-07,
"loss": 0.0022,
"reward": 1.9116785526275635,
"reward_std": 0.01904495432972908,
"rewards/accuracy_reward": 0.9116784930229187,
"rewards/format_reward": 1.0,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 107.61328125,
"epoch": 7.583333333333333,
"grad_norm": 2.7971888222519197,
"kl": 0.04638671875,
"learning_rate": 2.5625e-07,
"loss": 0.0019,
"reward": 1.9103072881698608,
"reward_std": 0.019077036529779434,
"rewards/accuracy_reward": 0.9103072881698608,
"rewards/format_reward": 1.0,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 107.35677337646484,
"epoch": 7.604166666666667,
"grad_norm": 3.340602697847883,
"kl": 0.049560546875,
"learning_rate": 2.5416666666666663e-07,
"loss": 0.002,
"reward": 1.9107370376586914,
"reward_std": 0.01656663417816162,
"rewards/accuracy_reward": 0.9107369780540466,
"rewards/format_reward": 1.0,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 108.40755462646484,
"epoch": 7.625,
"grad_norm": 3.4757576150836527,
"kl": 0.052978515625,
"learning_rate": 2.5208333333333334e-07,
"loss": 0.0022,
"reward": 1.897055983543396,
"reward_std": 0.02028050646185875,
"rewards/accuracy_reward": 0.897055983543396,
"rewards/format_reward": 1.0,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 108.87109375,
"epoch": 7.645833333333333,
"grad_norm": 1.7537209554556075,
"kl": 0.052001953125,
"learning_rate": 2.5e-07,
"loss": 0.0021,
"reward": 1.9118655920028687,
"reward_std": 0.019177807494997978,
"rewards/accuracy_reward": 0.9118657112121582,
"rewards/format_reward": 1.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 108.87760925292969,
"epoch": 7.666666666666667,
"grad_norm": 1.985054115096998,
"kl": 0.053955078125,
"learning_rate": 2.4791666666666664e-07,
"loss": 0.0022,
"reward": 1.9363340139389038,
"reward_std": 0.01809048466384411,
"rewards/accuracy_reward": 0.9363340139389038,
"rewards/format_reward": 1.0,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 109.9375,
"epoch": 7.6875,
"grad_norm": 2.640046167255212,
"kl": 0.055908203125,
"learning_rate": 2.458333333333333e-07,
"loss": 0.0023,
"reward": 1.8969202041625977,
"reward_std": 0.02410067245364189,
"rewards/accuracy_reward": 0.8982224464416504,
"rewards/format_reward": 0.9986979365348816,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 108.78646087646484,
"epoch": 7.708333333333333,
"grad_norm": 2.0927423356674155,
"kl": 0.05322265625,
"learning_rate": 2.4375e-07,
"loss": 0.0023,
"reward": 1.934058427810669,
"reward_std": 0.020023031160235405,
"rewards/accuracy_reward": 0.9340583682060242,
"rewards/format_reward": 1.0,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 107.98177337646484,
"epoch": 7.729166666666667,
"grad_norm": 5.374648001232616,
"kl": 0.05517578125,
"learning_rate": 2.4166666666666665e-07,
"loss": 0.0023,
"reward": 1.9142836332321167,
"reward_std": 0.01773514598608017,
"rewards/accuracy_reward": 0.9142836928367615,
"rewards/format_reward": 1.0,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 110.72396087646484,
"epoch": 7.75,
"grad_norm": 1.5292733041237832,
"kl": 0.050537109375,
"learning_rate": 2.3958333333333335e-07,
"loss": 0.0021,
"reward": 1.9197324514389038,
"reward_std": 0.0220477432012558,
"rewards/accuracy_reward": 0.9210345149040222,
"rewards/format_reward": 0.9986979365348816,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 109.1875,
"epoch": 7.770833333333333,
"grad_norm": 9.487726177733885,
"kl": 0.045654296875,
"learning_rate": 2.3749999999999998e-07,
"loss": 0.0019,
"reward": 1.927847146987915,
"reward_std": 0.017860591411590576,
"rewards/accuracy_reward": 0.9278470873832703,
"rewards/format_reward": 1.0,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 112.3203125,
"epoch": 7.791666666666667,
"grad_norm": 2.667778768562944,
"kl": 0.0458984375,
"learning_rate": 2.3541666666666665e-07,
"loss": 0.0019,
"reward": 1.9139199256896973,
"reward_std": 0.02642824873328209,
"rewards/accuracy_reward": 0.9178261756896973,
"rewards/format_reward": 0.99609375,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 111.01302337646484,
"epoch": 7.8125,
"grad_norm": 2.86218482085828,
"kl": 0.051025390625,
"learning_rate": 2.3333333333333333e-07,
"loss": 0.0021,
"reward": 1.9041519165039062,
"reward_std": 0.02404339425265789,
"rewards/accuracy_reward": 0.9054540395736694,
"rewards/format_reward": 0.9986979365348816,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 109.94401550292969,
"epoch": 7.833333333333333,
"grad_norm": 1.5489939957891778,
"kl": 0.05322265625,
"learning_rate": 2.3125e-07,
"loss": 0.0022,
"reward": 1.9263066053390503,
"reward_std": 0.019394386559724808,
"rewards/accuracy_reward": 0.9263066649436951,
"rewards/format_reward": 1.0,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 110.15755462646484,
"epoch": 7.854166666666667,
"grad_norm": 2.64116020814343,
"kl": 0.052490234375,
"learning_rate": 2.2916666666666663e-07,
"loss": 0.0021,
"reward": 1.9046939611434937,
"reward_std": 0.023857450112700462,
"rewards/accuracy_reward": 0.9059960842132568,
"rewards/format_reward": 0.9986979365348816,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 110.32552337646484,
"epoch": 7.875,
"grad_norm": 1.80983212656234,
"kl": 0.044921875,
"learning_rate": 2.270833333333333e-07,
"loss": 0.0018,
"reward": 1.934501051902771,
"reward_std": 0.016163021326065063,
"rewards/accuracy_reward": 0.934501051902771,
"rewards/format_reward": 1.0,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 109.37890625,
"epoch": 7.895833333333333,
"grad_norm": 10.563112786998811,
"kl": 0.048583984375,
"learning_rate": 2.25e-07,
"loss": 0.002,
"reward": 1.8924764394760132,
"reward_std": 0.019610995426774025,
"rewards/accuracy_reward": 0.892476499080658,
"rewards/format_reward": 1.0,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 111.34896087646484,
"epoch": 7.916666666666667,
"grad_norm": 2.533818918571777,
"kl": 0.044921875,
"learning_rate": 2.2291666666666667e-07,
"loss": 0.0019,
"reward": 1.906684398651123,
"reward_std": 0.01782190427184105,
"rewards/accuracy_reward": 0.906684398651123,
"rewards/format_reward": 1.0,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 109.51823425292969,
"epoch": 7.9375,
"grad_norm": 2.2661373394338926,
"kl": 0.0546875,
"learning_rate": 2.2083333333333332e-07,
"loss": 0.0022,
"reward": 1.9244441986083984,
"reward_std": 0.01977790892124176,
"rewards/accuracy_reward": 0.9244440793991089,
"rewards/format_reward": 1.0,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 109.78385925292969,
"epoch": 7.958333333333333,
"grad_norm": 1.7111686099562637,
"kl": 0.051025390625,
"learning_rate": 2.1875e-07,
"loss": 0.0021,
"reward": 1.897660255432129,
"reward_std": 0.02316589280962944,
"rewards/accuracy_reward": 0.8976603746414185,
"rewards/format_reward": 1.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 109.38451385498047,
"epoch": 7.979166666666667,
"grad_norm": 3.8657178191371275,
"kl": 0.044921875,
"learning_rate": 2.1666666666666667e-07,
"loss": 0.0019,
"reward": 1.91867196559906,
"reward_std": 0.018634842708706856,
"rewards/accuracy_reward": 0.9186719655990601,
"rewards/format_reward": 1.0,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 110.22917175292969,
"epoch": 8.020833333333334,
"grad_norm": 1.9741658039132675,
"kl": 0.04638671875,
"learning_rate": 2.145833333333333e-07,
"loss": 0.0019,
"reward": 1.9112396240234375,
"reward_std": 0.01786581240594387,
"rewards/accuracy_reward": 0.9112398028373718,
"rewards/format_reward": 1.0,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 109.52604675292969,
"epoch": 8.041666666666666,
"grad_norm": 4.239952555901759,
"kl": 0.04931640625,
"learning_rate": 2.1249999999999998e-07,
"loss": 0.002,
"reward": 1.9145668745040894,
"reward_std": 0.03050382435321808,
"rewards/accuracy_reward": 0.9171710014343262,
"rewards/format_reward": 0.9973958730697632,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 109.51171875,
"epoch": 8.0625,
"grad_norm": 3.1873677902649264,
"kl": 0.04638671875,
"learning_rate": 2.1041666666666665e-07,
"loss": 0.0019,
"reward": 1.9099664688110352,
"reward_std": 0.021909143775701523,
"rewards/accuracy_reward": 0.9099664688110352,
"rewards/format_reward": 1.0,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 110.73046875,
"epoch": 8.083333333333334,
"grad_norm": 1.4761981134253073,
"kl": 0.043212890625,
"learning_rate": 2.0833333333333333e-07,
"loss": 0.0018,
"reward": 1.908822774887085,
"reward_std": 0.026725394651293755,
"rewards/accuracy_reward": 0.9114267826080322,
"rewards/format_reward": 0.9973958730697632,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 106.9765625,
"epoch": 8.104166666666666,
"grad_norm": 2.5298325743515413,
"kl": 0.055908203125,
"learning_rate": 2.0624999999999998e-07,
"loss": 0.0023,
"reward": 1.919610857963562,
"reward_std": 0.022303760051727295,
"rewards/accuracy_reward": 0.919610857963562,
"rewards/format_reward": 1.0,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 109.39453125,
"epoch": 8.125,
"grad_norm": 1.6701037952221194,
"kl": 0.04736328125,
"learning_rate": 2.0416666666666666e-07,
"loss": 0.002,
"reward": 1.8952322006225586,
"reward_std": 0.02632908523082733,
"rewards/accuracy_reward": 0.8978363275527954,
"rewards/format_reward": 0.9973958730697632,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 109.75911712646484,
"epoch": 8.145833333333334,
"grad_norm": 1.8105877210751262,
"kl": 0.049560546875,
"learning_rate": 2.0208333333333334e-07,
"loss": 0.002,
"reward": 1.9192695617675781,
"reward_std": 0.02205723151564598,
"rewards/accuracy_reward": 0.9192695617675781,
"rewards/format_reward": 1.0,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 110.484375,
"epoch": 8.166666666666666,
"grad_norm": 3.6256736252714084,
"kl": 0.052001953125,
"learning_rate": 2e-07,
"loss": 0.0022,
"reward": 1.899370551109314,
"reward_std": 0.02411123923957348,
"rewards/accuracy_reward": 0.9006726145744324,
"rewards/format_reward": 0.9986979365348816,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 109.23698425292969,
"epoch": 8.1875,
"grad_norm": 2.8889796981288867,
"kl": 0.05517578125,
"learning_rate": 1.9791666666666664e-07,
"loss": 0.0023,
"reward": 1.9239459037780762,
"reward_std": 0.020000584423542023,
"rewards/accuracy_reward": 0.9239459037780762,
"rewards/format_reward": 1.0,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 108.01953125,
"epoch": 8.208333333333334,
"grad_norm": 4.9653820626533385,
"kl": 0.052734375,
"learning_rate": 1.9583333333333332e-07,
"loss": 0.0022,
"reward": 1.9158563613891602,
"reward_std": 0.025936102494597435,
"rewards/accuracy_reward": 0.9171584844589233,
"rewards/format_reward": 0.9986979365348816,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 108.20964050292969,
"epoch": 8.229166666666666,
"grad_norm": 2.290735973233346,
"kl": 0.055908203125,
"learning_rate": 1.9375e-07,
"loss": 0.0023,
"reward": 1.920623540878296,
"reward_std": 0.016118617728352547,
"rewards/accuracy_reward": 0.9206234216690063,
"rewards/format_reward": 1.0,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 106.79817962646484,
"epoch": 8.25,
"grad_norm": 2.14149502952453,
"kl": 0.05078125,
"learning_rate": 1.9166666666666668e-07,
"loss": 0.0021,
"reward": 1.8982393741607666,
"reward_std": 0.02737743966281414,
"rewards/accuracy_reward": 0.8995413780212402,
"rewards/format_reward": 0.9986979365348816,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 109.703125,
"epoch": 8.270833333333334,
"grad_norm": 1.7520909830046585,
"kl": 0.049560546875,
"learning_rate": 1.8958333333333333e-07,
"loss": 0.0021,
"reward": 1.9203259944915771,
"reward_std": 0.017365001142024994,
"rewards/accuracy_reward": 0.9203259348869324,
"rewards/format_reward": 1.0,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 108.1953125,
"epoch": 8.291666666666666,
"grad_norm": 3.8682316676739044,
"kl": 0.05322265625,
"learning_rate": 1.875e-07,
"loss": 0.0023,
"reward": 1.90675950050354,
"reward_std": 0.020597189664840698,
"rewards/accuracy_reward": 0.9067594408988953,
"rewards/format_reward": 1.0,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 109.93489837646484,
"epoch": 8.3125,
"grad_norm": 2.4384512365064483,
"kl": 0.047607421875,
"learning_rate": 1.8541666666666666e-07,
"loss": 0.002,
"reward": 1.923602819442749,
"reward_std": 0.01742154359817505,
"rewards/accuracy_reward": 0.9249049425125122,
"rewards/format_reward": 0.9986979365348816,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 108.72005462646484,
"epoch": 8.333333333333334,
"grad_norm": 2.234427870328764,
"kl": 0.051513671875,
"learning_rate": 1.833333333333333e-07,
"loss": 0.0021,
"reward": 1.9096324443817139,
"reward_std": 0.021227438002824783,
"rewards/accuracy_reward": 0.9096323847770691,
"rewards/format_reward": 1.0,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 109.46745300292969,
"epoch": 8.354166666666666,
"grad_norm": 1.9160175504578056,
"kl": 0.05078125,
"learning_rate": 1.8124999999999999e-07,
"loss": 0.0022,
"reward": 1.9217007160186768,
"reward_std": 0.01802412047982216,
"rewards/accuracy_reward": 0.9230027198791504,
"rewards/format_reward": 0.9986979365348816,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 109.35807800292969,
"epoch": 8.375,
"grad_norm": 5.1139313564913955,
"kl": 0.044921875,
"learning_rate": 1.7916666666666666e-07,
"loss": 0.0019,
"reward": 1.9158090353012085,
"reward_std": 0.021614177152514458,
"rewards/accuracy_reward": 0.9171112775802612,
"rewards/format_reward": 0.9986979365348816,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 108.94271087646484,
"epoch": 8.395833333333334,
"grad_norm": 1.8921820475793412,
"kl": 0.046875,
"learning_rate": 1.7708333333333334e-07,
"loss": 0.002,
"reward": 1.9103114604949951,
"reward_std": 0.01914086937904358,
"rewards/accuracy_reward": 0.9103114604949951,
"rewards/format_reward": 1.0,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 108.96614837646484,
"epoch": 8.416666666666666,
"grad_norm": 2.8573255442328405,
"kl": 0.050537109375,
"learning_rate": 1.75e-07,
"loss": 0.0021,
"reward": 1.9096262454986572,
"reward_std": 0.01799336075782776,
"rewards/accuracy_reward": 0.9096261262893677,
"rewards/format_reward": 1.0,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 109.53646087646484,
"epoch": 8.4375,
"grad_norm": 2.533540112681752,
"kl": 0.044677734375,
"learning_rate": 1.7291666666666664e-07,
"loss": 0.0019,
"reward": 1.9281089305877686,
"reward_std": 0.01919987052679062,
"rewards/accuracy_reward": 0.928108811378479,
"rewards/format_reward": 1.0,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 109.0234375,
"epoch": 8.458333333333334,
"grad_norm": 2.2470205832432666,
"kl": 0.04443359375,
"learning_rate": 1.7083333333333332e-07,
"loss": 0.0018,
"reward": 1.9159855842590332,
"reward_std": 0.021353445947170258,
"rewards/accuracy_reward": 0.9159855842590332,
"rewards/format_reward": 1.0,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 108.69792175292969,
"epoch": 8.479166666666666,
"grad_norm": 2.60464995508122,
"kl": 0.046875,
"learning_rate": 1.6875e-07,
"loss": 0.002,
"reward": 1.9079928398132324,
"reward_std": 0.022192446514964104,
"rewards/accuracy_reward": 0.909294843673706,
"rewards/format_reward": 0.9986979365348816,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 108.828125,
"epoch": 8.5,
"grad_norm": 2.421415014993462,
"kl": 0.053466796875,
"learning_rate": 1.6666666666666665e-07,
"loss": 0.0022,
"reward": 1.9306447505950928,
"reward_std": 0.01638518087565899,
"rewards/accuracy_reward": 0.9306447505950928,
"rewards/format_reward": 1.0,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 109.39714050292969,
"epoch": 8.520833333333334,
"grad_norm": 1.7082957831764727,
"kl": 0.056640625,
"learning_rate": 1.6458333333333333e-07,
"loss": 0.0024,
"reward": 1.9131463766098022,
"reward_std": 0.023682190105319023,
"rewards/accuracy_reward": 0.9144482612609863,
"rewards/format_reward": 0.9986979365348816,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 110.69010925292969,
"epoch": 8.541666666666666,
"grad_norm": 2.2438919940917197,
"kl": 0.04638671875,
"learning_rate": 1.625e-07,
"loss": 0.002,
"reward": 1.9307001829147339,
"reward_std": 0.016804661601781845,
"rewards/accuracy_reward": 0.9307002425193787,
"rewards/format_reward": 1.0,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 110.14192962646484,
"epoch": 8.5625,
"grad_norm": 3.264026183321021,
"kl": 0.04638671875,
"learning_rate": 1.6041666666666668e-07,
"loss": 0.0019,
"reward": 1.8919553756713867,
"reward_std": 0.02204691618680954,
"rewards/accuracy_reward": 0.8932574987411499,
"rewards/format_reward": 0.9986979365348816,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 111.13542175292969,
"epoch": 8.583333333333334,
"grad_norm": 2.309073599276998,
"kl": 0.04443359375,
"learning_rate": 1.583333333333333e-07,
"loss": 0.0019,
"reward": 1.9184863567352295,
"reward_std": 0.016750024631619453,
"rewards/accuracy_reward": 0.918486475944519,
"rewards/format_reward": 1.0,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 110.02604675292969,
"epoch": 8.604166666666666,
"grad_norm": 1.920449334945207,
"kl": 0.046875,
"learning_rate": 1.5624999999999999e-07,
"loss": 0.0019,
"reward": 1.8996871709823608,
"reward_std": 0.019097616896033287,
"rewards/accuracy_reward": 0.8996869921684265,
"rewards/format_reward": 1.0,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 109.38671875,
"epoch": 8.625,
"grad_norm": 6.108556482978297,
"kl": 0.047607421875,
"learning_rate": 1.5416666666666666e-07,
"loss": 0.002,
"reward": 1.8946789503097534,
"reward_std": 0.021756049245595932,
"rewards/accuracy_reward": 0.8946789503097534,
"rewards/format_reward": 1.0,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 109.61589050292969,
"epoch": 8.645833333333334,
"grad_norm": 1.4312273043036572,
"kl": 0.04833984375,
"learning_rate": 1.5208333333333332e-07,
"loss": 0.002,
"reward": 1.930631399154663,
"reward_std": 0.019629666581749916,
"rewards/accuracy_reward": 0.9319334030151367,
"rewards/format_reward": 0.9986979365348816,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 111.0546875,
"epoch": 8.666666666666666,
"grad_norm": 2.0186934249559645,
"kl": 0.046142578125,
"learning_rate": 1.5e-07,
"loss": 0.0019,
"reward": 1.910029411315918,
"reward_std": 0.01880134642124176,
"rewards/accuracy_reward": 0.9100292921066284,
"rewards/format_reward": 1.0,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 109.23958587646484,
"epoch": 8.6875,
"grad_norm": 2.388178370024915,
"kl": 0.046630859375,
"learning_rate": 1.4791666666666667e-07,
"loss": 0.0019,
"reward": 1.9040935039520264,
"reward_std": 0.023898255079984665,
"rewards/accuracy_reward": 0.9040936231613159,
"rewards/format_reward": 1.0,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 111.42317962646484,
"epoch": 8.708333333333334,
"grad_norm": 3.09323894478763,
"kl": 0.045166015625,
"learning_rate": 1.4583333333333335e-07,
"loss": 0.0019,
"reward": 1.9278287887573242,
"reward_std": 0.01652323268353939,
"rewards/accuracy_reward": 0.9278289675712585,
"rewards/format_reward": 1.0,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 111.14714050292969,
"epoch": 8.729166666666666,
"grad_norm": 8.95075477806658,
"kl": 0.046875,
"learning_rate": 1.4374999999999997e-07,
"loss": 0.002,
"reward": 1.9176443815231323,
"reward_std": 0.020662346854805946,
"rewards/accuracy_reward": 0.9202485084533691,
"rewards/format_reward": 0.9973958730697632,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 109.69140625,
"epoch": 8.75,
"grad_norm": 14.290195535948639,
"kl": 0.04638671875,
"learning_rate": 1.4166666666666665e-07,
"loss": 0.0019,
"reward": 1.9256818294525146,
"reward_std": 0.01680697686970234,
"rewards/accuracy_reward": 0.9256815910339355,
"rewards/format_reward": 1.0,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 108.74089050292969,
"epoch": 8.770833333333334,
"grad_norm": 3.382470297102888,
"kl": 0.04833984375,
"learning_rate": 1.3958333333333333e-07,
"loss": 0.002,
"reward": 1.9325473308563232,
"reward_std": 0.015486609190702438,
"rewards/accuracy_reward": 0.9325472116470337,
"rewards/format_reward": 1.0,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 107.37890625,
"epoch": 8.791666666666666,
"grad_norm": 1.5498505485516252,
"kl": 0.04638671875,
"learning_rate": 1.375e-07,
"loss": 0.002,
"reward": 1.9133939743041992,
"reward_std": 0.019088715314865112,
"rewards/accuracy_reward": 0.9133939743041992,
"rewards/format_reward": 1.0,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 109.13151550292969,
"epoch": 8.8125,
"grad_norm": 2.1982092482674496,
"kl": 0.047119140625,
"learning_rate": 1.3541666666666666e-07,
"loss": 0.0019,
"reward": 1.9054036140441895,
"reward_std": 0.023469921201467514,
"rewards/accuracy_reward": 0.9067057371139526,
"rewards/format_reward": 0.9986979365348816,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 108.67578125,
"epoch": 8.833333333333334,
"grad_norm": 2.5337968426277158,
"kl": 0.052490234375,
"learning_rate": 1.3333333333333334e-07,
"loss": 0.0022,
"reward": 1.9113550186157227,
"reward_std": 0.017122842371463776,
"rewards/accuracy_reward": 0.9113550186157227,
"rewards/format_reward": 1.0,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 108.7421875,
"epoch": 8.854166666666666,
"grad_norm": 2.957658114504722,
"kl": 0.049072265625,
"learning_rate": 1.3125e-07,
"loss": 0.002,
"reward": 1.9356334209442139,
"reward_std": 0.01688789203763008,
"rewards/accuracy_reward": 0.9356333613395691,
"rewards/format_reward": 1.0,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 107.45442962646484,
"epoch": 8.875,
"grad_norm": 3.414030918508783,
"kl": 0.048095703125,
"learning_rate": 1.2916666666666667e-07,
"loss": 0.002,
"reward": 1.9134725332260132,
"reward_std": 0.01754312589764595,
"rewards/accuracy_reward": 0.9134725332260132,
"rewards/format_reward": 1.0,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 108.20833587646484,
"epoch": 8.895833333333334,
"grad_norm": 3.3858814535237918,
"kl": 0.04931640625,
"learning_rate": 1.2708333333333332e-07,
"loss": 0.002,
"reward": 1.8799127340316772,
"reward_std": 0.0196706410497427,
"rewards/accuracy_reward": 0.8799127340316772,
"rewards/format_reward": 1.0,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 106.65885925292969,
"epoch": 8.916666666666666,
"grad_norm": 4.177315577481865,
"kl": 0.048583984375,
"learning_rate": 1.25e-07,
"loss": 0.002,
"reward": 1.9059925079345703,
"reward_std": 0.017467858269810677,
"rewards/accuracy_reward": 0.9059926867485046,
"rewards/format_reward": 1.0,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 107.44010925292969,
"epoch": 8.9375,
"grad_norm": 1.8879259079304014,
"kl": 0.045654296875,
"learning_rate": 1.2291666666666665e-07,
"loss": 0.0019,
"reward": 1.9416245222091675,
"reward_std": 0.014588016085326672,
"rewards/accuracy_reward": 0.9416245818138123,
"rewards/format_reward": 1.0,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 107.62370300292969,
"epoch": 8.958333333333334,
"grad_norm": 2.993225929254523,
"kl": 0.053955078125,
"learning_rate": 1.2083333333333332e-07,
"loss": 0.0023,
"reward": 1.9129104614257812,
"reward_std": 0.018522052094340324,
"rewards/accuracy_reward": 0.9129105806350708,
"rewards/format_reward": 1.0,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 107.63018035888672,
"epoch": 8.979166666666666,
"grad_norm": 1.868194038764531,
"kl": 0.05908203125,
"learning_rate": 1.1874999999999999e-07,
"loss": 0.0024,
"reward": 1.8931113481521606,
"reward_std": 0.019662605598568916,
"rewards/accuracy_reward": 0.8931112885475159,
"rewards/format_reward": 1.0,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 106.67708587646484,
"epoch": 9.020833333333334,
"grad_norm": 4.908308927568047,
"kl": 0.053955078125,
"learning_rate": 1.1666666666666667e-07,
"loss": 0.0022,
"reward": 1.9184527397155762,
"reward_std": 0.022821567952632904,
"rewards/accuracy_reward": 0.9197548627853394,
"rewards/format_reward": 0.9986979365348816,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 105.43229675292969,
"epoch": 9.041666666666666,
"grad_norm": 2.0304888444214884,
"kl": 0.047119140625,
"learning_rate": 1.1458333333333332e-07,
"loss": 0.002,
"reward": 1.9061381816864014,
"reward_std": 0.017391815781593323,
"rewards/accuracy_reward": 0.9061381220817566,
"rewards/format_reward": 1.0,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 106.78385925292969,
"epoch": 9.0625,
"grad_norm": 4.941122982616481,
"kl": 0.05029296875,
"learning_rate": 1.125e-07,
"loss": 0.0021,
"reward": 1.9194796085357666,
"reward_std": 0.01858523301780224,
"rewards/accuracy_reward": 0.9194795489311218,
"rewards/format_reward": 1.0,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 106.11198425292969,
"epoch": 9.083333333333334,
"grad_norm": 1.5041501011347416,
"kl": 0.055419921875,
"learning_rate": 1.1041666666666666e-07,
"loss": 0.0023,
"reward": 1.926844596862793,
"reward_std": 0.017708610743284225,
"rewards/accuracy_reward": 0.926844596862793,
"rewards/format_reward": 1.0,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 104.85546875,
"epoch": 9.104166666666666,
"grad_norm": 5.314885280931172,
"kl": 0.0498046875,
"learning_rate": 1.0833333333333334e-07,
"loss": 0.0021,
"reward": 1.9273128509521484,
"reward_std": 0.01629455015063286,
"rewards/accuracy_reward": 0.927312970161438,
"rewards/format_reward": 1.0,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 106.00651550292969,
"epoch": 9.125,
"grad_norm": 1.6763167113014938,
"kl": 0.059326171875,
"learning_rate": 1.0624999999999999e-07,
"loss": 0.0024,
"reward": 1.9027307033538818,
"reward_std": 0.023923706263303757,
"rewards/accuracy_reward": 0.9040327072143555,
"rewards/format_reward": 0.9986979365348816,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 106.30729675292969,
"epoch": 9.145833333333334,
"grad_norm": 2.331935816678969,
"kl": 0.046875,
"learning_rate": 1.0416666666666667e-07,
"loss": 0.002,
"reward": 1.914282202720642,
"reward_std": 0.01946648769080639,
"rewards/accuracy_reward": 0.9142822027206421,
"rewards/format_reward": 1.0,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 106.20833587646484,
"epoch": 9.166666666666666,
"grad_norm": 2.7235761953830515,
"kl": 0.046875,
"learning_rate": 1.0208333333333333e-07,
"loss": 0.0019,
"reward": 1.9132049083709717,
"reward_std": 0.018830081447958946,
"rewards/accuracy_reward": 0.9132048487663269,
"rewards/format_reward": 1.0,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 105.61328125,
"epoch": 9.1875,
"grad_norm": 5.239801858902832,
"kl": 0.045166015625,
"learning_rate": 1e-07,
"loss": 0.0019,
"reward": 1.9167590141296387,
"reward_std": 0.01652991585433483,
"rewards/accuracy_reward": 0.9167590141296387,
"rewards/format_reward": 1.0,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 106.30989837646484,
"epoch": 9.208333333333334,
"grad_norm": 2.0958692476606453,
"kl": 0.047607421875,
"learning_rate": 9.791666666666666e-08,
"loss": 0.002,
"reward": 1.9000282287597656,
"reward_std": 0.017917610704898834,
"rewards/accuracy_reward": 0.9000282287597656,
"rewards/format_reward": 1.0,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 107.13021087646484,
"epoch": 9.229166666666666,
"grad_norm": 3.688730357890573,
"kl": 0.047119140625,
"learning_rate": 9.583333333333334e-08,
"loss": 0.002,
"reward": 1.915367841720581,
"reward_std": 0.01575218327343464,
"rewards/accuracy_reward": 0.915367841720581,
"rewards/format_reward": 1.0,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 107.29817962646484,
"epoch": 9.25,
"grad_norm": 4.777726769533225,
"kl": 0.0498046875,
"learning_rate": 9.375e-08,
"loss": 0.002,
"reward": 1.9215753078460693,
"reward_std": 0.018969135358929634,
"rewards/accuracy_reward": 0.9215752482414246,
"rewards/format_reward": 1.0,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 105.67578125,
"epoch": 9.270833333333334,
"grad_norm": 3.166832895506651,
"kl": 0.1455078125,
"learning_rate": 9.166666666666665e-08,
"loss": 0.0059,
"reward": 1.8940961360931396,
"reward_std": 0.022732451558113098,
"rewards/accuracy_reward": 0.8940961360931396,
"rewards/format_reward": 1.0,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 106.04427337646484,
"epoch": 9.291666666666666,
"grad_norm": 7.685144828516677,
"kl": 0.053466796875,
"learning_rate": 8.958333333333333e-08,
"loss": 0.0022,
"reward": 1.9240249395370483,
"reward_std": 0.020782217383384705,
"rewards/accuracy_reward": 0.9253270030021667,
"rewards/format_reward": 0.9986979365348816,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 105.99609375,
"epoch": 9.3125,
"grad_norm": 2.378060228855226,
"kl": 0.047119140625,
"learning_rate": 8.75e-08,
"loss": 0.0019,
"reward": 1.9038584232330322,
"reward_std": 0.018619615584611893,
"rewards/accuracy_reward": 0.9038585424423218,
"rewards/format_reward": 1.0,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 107.53515625,
"epoch": 9.333333333333334,
"grad_norm": 1.7692084618793151,
"kl": 0.04150390625,
"learning_rate": 8.541666666666666e-08,
"loss": 0.0018,
"reward": 1.9091638326644897,
"reward_std": 0.022435273975133896,
"rewards/accuracy_reward": 0.9104660749435425,
"rewards/format_reward": 0.9986979365348816,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 107.88542175292969,
"epoch": 9.354166666666666,
"grad_norm": 1.9024254121008084,
"kl": 0.05615234375,
"learning_rate": 8.333333333333333e-08,
"loss": 0.0023,
"reward": 1.9238141775131226,
"reward_std": 0.022633202373981476,
"rewards/accuracy_reward": 0.9251161813735962,
"rewards/format_reward": 0.9986979365348816,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 108.61589050292969,
"epoch": 9.375,
"grad_norm": 1.6956193954619285,
"kl": 0.047119140625,
"learning_rate": 8.125e-08,
"loss": 0.002,
"reward": 1.9281830787658691,
"reward_std": 0.016372021287679672,
"rewards/accuracy_reward": 0.9281830787658691,
"rewards/format_reward": 1.0,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 105.97786712646484,
"epoch": 9.395833333333334,
"grad_norm": 1.7595623551426245,
"kl": 0.046142578125,
"learning_rate": 7.916666666666665e-08,
"loss": 0.0019,
"reward": 1.8898437023162842,
"reward_std": 0.020021602511405945,
"rewards/accuracy_reward": 0.8898436427116394,
"rewards/format_reward": 1.0,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 107.08464050292969,
"epoch": 9.416666666666666,
"grad_norm": 2.5345231096237626,
"kl": 0.045654296875,
"learning_rate": 7.708333333333333e-08,
"loss": 0.0019,
"reward": 1.9146665334701538,
"reward_std": 0.018744416534900665,
"rewards/accuracy_reward": 0.9146665334701538,
"rewards/format_reward": 1.0,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 107.6875,
"epoch": 9.4375,
"grad_norm": 3.953395223073277,
"kl": 0.05224609375,
"learning_rate": 7.5e-08,
"loss": 0.0022,
"reward": 1.901958703994751,
"reward_std": 0.02426784299314022,
"rewards/accuracy_reward": 0.9032607078552246,
"rewards/format_reward": 0.9986979365348816,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 107.35026550292969,
"epoch": 9.458333333333334,
"grad_norm": 2.9548245945269294,
"kl": 0.046875,
"learning_rate": 7.291666666666667e-08,
"loss": 0.0019,
"reward": 1.9318149089813232,
"reward_std": 0.01830216310918331,
"rewards/accuracy_reward": 0.9318150281906128,
"rewards/format_reward": 1.0,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 107.83203125,
"epoch": 9.479166666666666,
"grad_norm": 2.485294511659845,
"kl": 0.050048828125,
"learning_rate": 7.083333333333333e-08,
"loss": 0.0021,
"reward": 1.94561767578125,
"reward_std": 0.014377261511981487,
"rewards/accuracy_reward": 0.9456178545951843,
"rewards/format_reward": 1.0,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 108.19921875,
"epoch": 9.5,
"grad_norm": 3.3111964897684425,
"kl": 0.072265625,
"learning_rate": 6.875e-08,
"loss": 0.003,
"reward": 1.932363510131836,
"reward_std": 0.02322392538189888,
"rewards/accuracy_reward": 0.9336656332015991,
"rewards/format_reward": 0.9986979365348816,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 108.52214050292969,
"epoch": 9.520833333333334,
"grad_norm": 1.8991735989887744,
"kl": 0.04541015625,
"learning_rate": 6.666666666666667e-08,
"loss": 0.002,
"reward": 1.928739070892334,
"reward_std": 0.017306815832853317,
"rewards/accuracy_reward": 0.928739070892334,
"rewards/format_reward": 1.0,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 107.90104675292969,
"epoch": 9.541666666666666,
"grad_norm": 2.0677038589369747,
"kl": 0.0478515625,
"learning_rate": 6.458333333333333e-08,
"loss": 0.002,
"reward": 1.9004323482513428,
"reward_std": 0.018483035266399384,
"rewards/accuracy_reward": 0.900432288646698,
"rewards/format_reward": 1.0,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 109.74479675292969,
"epoch": 9.5625,
"grad_norm": 2.4393059130616748,
"kl": 0.04736328125,
"learning_rate": 6.25e-08,
"loss": 0.002,
"reward": 1.9109472036361694,
"reward_std": 0.01986522786319256,
"rewards/accuracy_reward": 0.9109472036361694,
"rewards/format_reward": 1.0,
"step": 450
}
],
"logging_steps": 1.0,
"max_steps": 480,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 48,
"trial_name": null,
"trial_params": null
}