| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 9.5625, | |
| "eval_steps": 500, | |
| "global_step": 450, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.36198425292969, | |
| "epoch": 0.020833333333333332, | |
| "grad_norm": 1.639632839083605, | |
| "kl": 0.0, | |
| "learning_rate": 9.979166666666667e-07, | |
| "loss": 0.0, | |
| "reward": 1.7404149770736694, | |
| "reward_std": 0.0779535174369812, | |
| "rewards/accuracy_reward": 0.7482273578643799, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.453125, | |
| "epoch": 0.041666666666666664, | |
| "grad_norm": 1.6143054455733772, | |
| "kl": 0.000759124755859375, | |
| "learning_rate": 9.958333333333333e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7971906661987305, | |
| "reward_std": 0.07512722909450531, | |
| "rewards/accuracy_reward": 0.8076074123382568, | |
| "rewards/format_reward": 0.9895833730697632, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.90885925292969, | |
| "epoch": 0.0625, | |
| "grad_norm": 2.4515981685409955, | |
| "kl": 0.000972747802734375, | |
| "learning_rate": 9.9375e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7853158712387085, | |
| "reward_std": 0.056390173733234406, | |
| "rewards/accuracy_reward": 0.7866179347038269, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.05989837646484, | |
| "epoch": 0.08333333333333333, | |
| "grad_norm": 1.6201269048159341, | |
| "kl": 0.00115203857421875, | |
| "learning_rate": 9.916666666666666e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7735958099365234, | |
| "reward_std": 0.0706130862236023, | |
| "rewards/accuracy_reward": 0.7761998176574707, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.74089050292969, | |
| "epoch": 0.10416666666666667, | |
| "grad_norm": 1.5772248618980282, | |
| "kl": 0.0014801025390625, | |
| "learning_rate": 9.895833333333333e-07, | |
| "loss": 0.0001, | |
| "reward": 1.775779128074646, | |
| "reward_std": 0.0632912740111351, | |
| "rewards/accuracy_reward": 0.7783832550048828, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.27214050292969, | |
| "epoch": 0.125, | |
| "grad_norm": 1.6430473395229306, | |
| "kl": 0.0023345947265625, | |
| "learning_rate": 9.875e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7499568462371826, | |
| "reward_std": 0.06891956180334091, | |
| "rewards/accuracy_reward": 0.7525607943534851, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.0390625, | |
| "epoch": 0.14583333333333334, | |
| "grad_norm": 2.1082448590555587, | |
| "kl": 0.0029754638671875, | |
| "learning_rate": 9.854166666666666e-07, | |
| "loss": 0.0002, | |
| "reward": 1.7655704021453857, | |
| "reward_std": 0.06707193702459335, | |
| "rewards/accuracy_reward": 0.7707786560058594, | |
| "rewards/format_reward": 0.9947916865348816, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.79948425292969, | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 1.2774316678996838, | |
| "kl": 0.003570556640625, | |
| "learning_rate": 9.833333333333332e-07, | |
| "loss": 0.0002, | |
| "reward": 1.7789148092269897, | |
| "reward_std": 0.06289087980985641, | |
| "rewards/accuracy_reward": 0.7828210592269897, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.25390625, | |
| "epoch": 0.1875, | |
| "grad_norm": 2.9892643389862297, | |
| "kl": 0.00396728515625, | |
| "learning_rate": 9.8125e-07, | |
| "loss": 0.0002, | |
| "reward": 1.7818154096603394, | |
| "reward_std": 0.06365714222192764, | |
| "rewards/accuracy_reward": 0.7844195365905762, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.72005462646484, | |
| "epoch": 0.20833333333333334, | |
| "grad_norm": 2.067866094157864, | |
| "kl": 0.00555419921875, | |
| "learning_rate": 9.791666666666667e-07, | |
| "loss": 0.0003, | |
| "reward": 1.768758773803711, | |
| "reward_std": 0.061586372554302216, | |
| "rewards/accuracy_reward": 0.7713630199432373, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.7109375, | |
| "epoch": 0.22916666666666666, | |
| "grad_norm": 1.9143306648652931, | |
| "kl": 0.007568359375, | |
| "learning_rate": 9.770833333333332e-07, | |
| "loss": 0.0004, | |
| "reward": 1.7840107679367065, | |
| "reward_std": 0.06136108189821243, | |
| "rewards/accuracy_reward": 0.7866148948669434, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.51171875, | |
| "epoch": 0.25, | |
| "grad_norm": 1.3340893635394109, | |
| "kl": 0.0089111328125, | |
| "learning_rate": 9.75e-07, | |
| "loss": 0.0005, | |
| "reward": 1.7980231046676636, | |
| "reward_std": 0.05285460874438286, | |
| "rewards/accuracy_reward": 0.799325168132782, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.94921875, | |
| "epoch": 0.2708333333333333, | |
| "grad_norm": 1.5039333862682631, | |
| "kl": 0.01055908203125, | |
| "learning_rate": 9.729166666666665e-07, | |
| "loss": 0.0005, | |
| "reward": 1.781097412109375, | |
| "reward_std": 0.054250504821538925, | |
| "rewards/accuracy_reward": 0.7810973525047302, | |
| "rewards/format_reward": 1.0, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.20442962646484, | |
| "epoch": 0.2916666666666667, | |
| "grad_norm": 1.320078780832244, | |
| "kl": 0.01153564453125, | |
| "learning_rate": 9.708333333333333e-07, | |
| "loss": 0.0006, | |
| "reward": 1.7951558828353882, | |
| "reward_std": 0.05504516512155533, | |
| "rewards/accuracy_reward": 0.7964579463005066, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.59635925292969, | |
| "epoch": 0.3125, | |
| "grad_norm": 1.2979058457846995, | |
| "kl": 0.0120849609375, | |
| "learning_rate": 9.6875e-07, | |
| "loss": 0.0006, | |
| "reward": 1.7878097295761108, | |
| "reward_std": 0.05526263639330864, | |
| "rewards/accuracy_reward": 0.7891117930412292, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.14974212646484, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 1.326162224402637, | |
| "kl": 0.01275634765625, | |
| "learning_rate": 9.666666666666666e-07, | |
| "loss": 0.0006, | |
| "reward": 1.7894906997680664, | |
| "reward_std": 0.04853988438844681, | |
| "rewards/accuracy_reward": 0.7907928824424744, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.43489837646484, | |
| "epoch": 0.3541666666666667, | |
| "grad_norm": 2.7897636177227594, | |
| "kl": 0.01263427734375, | |
| "learning_rate": 9.645833333333333e-07, | |
| "loss": 0.0005, | |
| "reward": 1.8176600933074951, | |
| "reward_std": 0.04950461909174919, | |
| "rewards/accuracy_reward": 0.8176599740982056, | |
| "rewards/format_reward": 1.0, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.42708587646484, | |
| "epoch": 0.375, | |
| "grad_norm": 1.1152706578469112, | |
| "kl": 0.01409912109375, | |
| "learning_rate": 9.624999999999999e-07, | |
| "loss": 0.0006, | |
| "reward": 1.7780466079711914, | |
| "reward_std": 0.0461871400475502, | |
| "rewards/accuracy_reward": 0.7780466079711914, | |
| "rewards/format_reward": 1.0, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.89583587646484, | |
| "epoch": 0.3958333333333333, | |
| "grad_norm": 5.136433449809359, | |
| "kl": 0.01300048828125, | |
| "learning_rate": 9.604166666666666e-07, | |
| "loss": 0.0006, | |
| "reward": 1.7988059520721436, | |
| "reward_std": 0.04341081529855728, | |
| "rewards/accuracy_reward": 0.7988060712814331, | |
| "rewards/format_reward": 1.0, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.54296875, | |
| "epoch": 0.4166666666666667, | |
| "grad_norm": 1.285848750333943, | |
| "kl": 0.01422119140625, | |
| "learning_rate": 9.583333333333334e-07, | |
| "loss": 0.0007, | |
| "reward": 1.795292854309082, | |
| "reward_std": 0.045156918466091156, | |
| "rewards/accuracy_reward": 0.7952930331230164, | |
| "rewards/format_reward": 1.0, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.421875, | |
| "epoch": 0.4375, | |
| "grad_norm": 1.309348147777355, | |
| "kl": 0.0162353515625, | |
| "learning_rate": 9.5625e-07, | |
| "loss": 0.0007, | |
| "reward": 1.793769121170044, | |
| "reward_std": 0.050124406814575195, | |
| "rewards/accuracy_reward": 0.7963732481002808, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.89192962646484, | |
| "epoch": 0.4583333333333333, | |
| "grad_norm": 1.2889966905867243, | |
| "kl": 0.017578125, | |
| "learning_rate": 9.541666666666667e-07, | |
| "loss": 0.0007, | |
| "reward": 1.8341398239135742, | |
| "reward_std": 0.04439392685890198, | |
| "rewards/accuracy_reward": 0.8341398239135742, | |
| "rewards/format_reward": 1.0, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.41796875, | |
| "epoch": 0.4791666666666667, | |
| "grad_norm": 1.4675439511923518, | |
| "kl": 0.0167236328125, | |
| "learning_rate": 9.520833333333333e-07, | |
| "loss": 0.0007, | |
| "reward": 1.836176872253418, | |
| "reward_std": 0.046172261238098145, | |
| "rewards/accuracy_reward": 0.836176872253418, | |
| "rewards/format_reward": 1.0, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.67448425292969, | |
| "epoch": 0.5, | |
| "grad_norm": 2.1035799373033917, | |
| "kl": 0.0203857421875, | |
| "learning_rate": 9.499999999999999e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8129713535308838, | |
| "reward_std": 0.04518614709377289, | |
| "rewards/accuracy_reward": 0.812971293926239, | |
| "rewards/format_reward": 1.0, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.63151550292969, | |
| "epoch": 0.5208333333333334, | |
| "grad_norm": 1.4208098502006166, | |
| "kl": 0.0186767578125, | |
| "learning_rate": 9.479166666666666e-07, | |
| "loss": 0.0008, | |
| "reward": 1.8406684398651123, | |
| "reward_std": 0.041067786514759064, | |
| "rewards/accuracy_reward": 0.8406683802604675, | |
| "rewards/format_reward": 1.0, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.16927337646484, | |
| "epoch": 0.5416666666666666, | |
| "grad_norm": 1.7838214183915748, | |
| "kl": 0.0203857421875, | |
| "learning_rate": 9.458333333333333e-07, | |
| "loss": 0.0008, | |
| "reward": 1.803572177886963, | |
| "reward_std": 0.04902785271406174, | |
| "rewards/accuracy_reward": 0.8035721778869629, | |
| "rewards/format_reward": 1.0, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.23046875, | |
| "epoch": 0.5625, | |
| "grad_norm": 1.2204814881350547, | |
| "kl": 0.0211181640625, | |
| "learning_rate": 9.4375e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8056421279907227, | |
| "reward_std": 0.04466244578361511, | |
| "rewards/accuracy_reward": 0.8056421279907227, | |
| "rewards/format_reward": 1.0, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.625, | |
| "epoch": 0.5833333333333334, | |
| "grad_norm": 1.802147920265982, | |
| "kl": 0.0218505859375, | |
| "learning_rate": 9.416666666666666e-07, | |
| "loss": 0.0009, | |
| "reward": 1.828833818435669, | |
| "reward_std": 0.042848870158195496, | |
| "rewards/accuracy_reward": 0.8288335800170898, | |
| "rewards/format_reward": 1.0, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.18099212646484, | |
| "epoch": 0.6041666666666666, | |
| "grad_norm": 2.372355341881645, | |
| "kl": 0.021240234375, | |
| "learning_rate": 9.395833333333333e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8150265216827393, | |
| "reward_std": 0.04657554626464844, | |
| "rewards/accuracy_reward": 0.8150264620780945, | |
| "rewards/format_reward": 1.0, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.60807800292969, | |
| "epoch": 0.625, | |
| "grad_norm": 1.6452523523321965, | |
| "kl": 0.022216796875, | |
| "learning_rate": 9.374999999999999e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8578990697860718, | |
| "reward_std": 0.039906859397888184, | |
| "rewards/accuracy_reward": 0.8578989505767822, | |
| "rewards/format_reward": 1.0, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.49349212646484, | |
| "epoch": 0.6458333333333334, | |
| "grad_norm": 1.9283111710202978, | |
| "kl": 0.022705078125, | |
| "learning_rate": 9.354166666666667e-07, | |
| "loss": 0.001, | |
| "reward": 1.7941240072250366, | |
| "reward_std": 0.046397458761930466, | |
| "rewards/accuracy_reward": 0.7941240072250366, | |
| "rewards/format_reward": 1.0, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 117.61589050292969, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 1.8161381749377306, | |
| "kl": 0.02490234375, | |
| "learning_rate": 9.333333333333333e-07, | |
| "loss": 0.001, | |
| "reward": 1.807027816772461, | |
| "reward_std": 0.047066252678632736, | |
| "rewards/accuracy_reward": 0.8070278167724609, | |
| "rewards/format_reward": 1.0, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.55599212646484, | |
| "epoch": 0.6875, | |
| "grad_norm": 1.4095225702209093, | |
| "kl": 0.028076171875, | |
| "learning_rate": 9.3125e-07, | |
| "loss": 0.0011, | |
| "reward": 1.816405177116394, | |
| "reward_std": 0.0405634380877018, | |
| "rewards/accuracy_reward": 0.816405177116394, | |
| "rewards/format_reward": 1.0, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.86589050292969, | |
| "epoch": 0.7083333333333334, | |
| "grad_norm": 1.9475471383587142, | |
| "kl": 0.0264892578125, | |
| "learning_rate": 9.291666666666666e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8178232908248901, | |
| "reward_std": 0.048409104347229004, | |
| "rewards/accuracy_reward": 0.8191253542900085, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.81771087646484, | |
| "epoch": 0.7291666666666666, | |
| "grad_norm": 1.816929377906017, | |
| "kl": 0.0250244140625, | |
| "learning_rate": 9.270833333333333e-07, | |
| "loss": 0.001, | |
| "reward": 1.8112201690673828, | |
| "reward_std": 0.04231969267129898, | |
| "rewards/accuracy_reward": 0.811220109462738, | |
| "rewards/format_reward": 1.0, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.69271087646484, | |
| "epoch": 0.75, | |
| "grad_norm": 1.9245265212018168, | |
| "kl": 0.0260009765625, | |
| "learning_rate": 9.25e-07, | |
| "loss": 0.0011, | |
| "reward": 1.800992727279663, | |
| "reward_std": 0.04579651355743408, | |
| "rewards/accuracy_reward": 0.8022947311401367, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.50911712646484, | |
| "epoch": 0.7708333333333334, | |
| "grad_norm": 2.3374928077264565, | |
| "kl": 0.0274658203125, | |
| "learning_rate": 9.229166666666667e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8183115720748901, | |
| "reward_std": 0.047783225774765015, | |
| "rewards/accuracy_reward": 0.820915699005127, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.3515625, | |
| "epoch": 0.7916666666666666, | |
| "grad_norm": 1.959104932697817, | |
| "kl": 0.0269775390625, | |
| "learning_rate": 9.208333333333332e-07, | |
| "loss": 0.0011, | |
| "reward": 1.7997591495513916, | |
| "reward_std": 0.04710128903388977, | |
| "rewards/accuracy_reward": 0.8010611534118652, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.93489837646484, | |
| "epoch": 0.8125, | |
| "grad_norm": 5.405686552072785, | |
| "kl": 0.023193359375, | |
| "learning_rate": 9.187499999999999e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8103388547897339, | |
| "reward_std": 0.048775218427181244, | |
| "rewards/accuracy_reward": 0.8129429817199707, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.42578125, | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 2.5109857220624874, | |
| "kl": 0.02587890625, | |
| "learning_rate": 9.166666666666665e-07, | |
| "loss": 0.001, | |
| "reward": 1.8131688833236694, | |
| "reward_std": 0.045695092529058456, | |
| "rewards/accuracy_reward": 0.8157729506492615, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.1015625, | |
| "epoch": 0.8541666666666666, | |
| "grad_norm": 1.3500957332806702, | |
| "kl": 0.0267333984375, | |
| "learning_rate": 9.145833333333333e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8379626274108887, | |
| "reward_std": 0.03906077891588211, | |
| "rewards/accuracy_reward": 0.8392646312713623, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.87109375, | |
| "epoch": 0.875, | |
| "grad_norm": 1.9553905487315055, | |
| "kl": 0.02734375, | |
| "learning_rate": 9.124999999999999e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8075612783432007, | |
| "reward_std": 0.046475451439619064, | |
| "rewards/accuracy_reward": 0.8075612783432007, | |
| "rewards/format_reward": 1.0, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.93359375, | |
| "epoch": 0.8958333333333334, | |
| "grad_norm": 2.687075325377198, | |
| "kl": 0.0262451171875, | |
| "learning_rate": 9.104166666666666e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8184566497802734, | |
| "reward_std": 0.04643276333808899, | |
| "rewards/accuracy_reward": 0.8197587728500366, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.43489837646484, | |
| "epoch": 0.9166666666666666, | |
| "grad_norm": 1.2273175474621774, | |
| "kl": 0.026611328125, | |
| "learning_rate": 9.083333333333332e-07, | |
| "loss": 0.0011, | |
| "reward": 1.7925729751586914, | |
| "reward_std": 0.04880434274673462, | |
| "rewards/accuracy_reward": 0.7938751578330994, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.70052337646484, | |
| "epoch": 0.9375, | |
| "grad_norm": 3.4979750979513184, | |
| "kl": 0.0242919921875, | |
| "learning_rate": 9.0625e-07, | |
| "loss": 0.001, | |
| "reward": 1.8129024505615234, | |
| "reward_std": 0.04281633347272873, | |
| "rewards/accuracy_reward": 0.8142046332359314, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.12239837646484, | |
| "epoch": 0.9583333333333334, | |
| "grad_norm": 1.581390275171433, | |
| "kl": 0.024658203125, | |
| "learning_rate": 9.041666666666667e-07, | |
| "loss": 0.001, | |
| "reward": 1.8339308500289917, | |
| "reward_std": 0.0428236648440361, | |
| "rewards/accuracy_reward": 0.8339308500289917, | |
| "rewards/format_reward": 1.0, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.57410430908203, | |
| "epoch": 0.9791666666666666, | |
| "grad_norm": 1.2859639310341215, | |
| "kl": 0.031982421875, | |
| "learning_rate": 9.020833333333333e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8121892213821411, | |
| "reward_std": 0.043297141790390015, | |
| "rewards/accuracy_reward": 0.8135243654251099, | |
| "rewards/format_reward": 0.998664915561676, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.44271087646484, | |
| "epoch": 1.0208333333333333, | |
| "grad_norm": 2.3284880809656916, | |
| "kl": 0.0240478515625, | |
| "learning_rate": 9e-07, | |
| "loss": 0.001, | |
| "reward": 1.842546820640564, | |
| "reward_std": 0.03681856021285057, | |
| "rewards/accuracy_reward": 0.842546820640564, | |
| "rewards/format_reward": 1.0, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.63932800292969, | |
| "epoch": 1.0416666666666667, | |
| "grad_norm": 11.066834298993447, | |
| "kl": 0.0245361328125, | |
| "learning_rate": 8.979166666666666e-07, | |
| "loss": 0.001, | |
| "reward": 1.810459852218628, | |
| "reward_std": 0.04241730272769928, | |
| "rewards/accuracy_reward": 0.8117618560791016, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.625, | |
| "epoch": 1.0625, | |
| "grad_norm": 3.9999069520932107, | |
| "kl": 0.0284423828125, | |
| "learning_rate": 8.958333333333334e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8403337001800537, | |
| "reward_std": 0.035806819796562195, | |
| "rewards/accuracy_reward": 0.8403337001800537, | |
| "rewards/format_reward": 1.0, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.18620300292969, | |
| "epoch": 1.0833333333333333, | |
| "grad_norm": 1.9543615318552672, | |
| "kl": 0.0263671875, | |
| "learning_rate": 8.9375e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8314783573150635, | |
| "reward_std": 0.03615511581301689, | |
| "rewards/accuracy_reward": 0.8314781188964844, | |
| "rewards/format_reward": 1.0, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.74349212646484, | |
| "epoch": 1.1041666666666667, | |
| "grad_norm": 1.7271064117720305, | |
| "kl": 0.0284423828125, | |
| "learning_rate": 8.916666666666667e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8100008964538574, | |
| "reward_std": 0.04172190651297569, | |
| "rewards/accuracy_reward": 0.810001015663147, | |
| "rewards/format_reward": 1.0, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.9609375, | |
| "epoch": 1.125, | |
| "grad_norm": 3.0099066254538225, | |
| "kl": 0.0263671875, | |
| "learning_rate": 8.895833333333332e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8225668668746948, | |
| "reward_std": 0.03787129372358322, | |
| "rewards/accuracy_reward": 0.8225669860839844, | |
| "rewards/format_reward": 1.0, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.16667175292969, | |
| "epoch": 1.1458333333333333, | |
| "grad_norm": 2.1966545517434746, | |
| "kl": 0.02783203125, | |
| "learning_rate": 8.874999999999999e-07, | |
| "loss": 0.0012, | |
| "reward": 1.825528860092163, | |
| "reward_std": 0.0393807552754879, | |
| "rewards/accuracy_reward": 0.8255288004875183, | |
| "rewards/format_reward": 1.0, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.93880462646484, | |
| "epoch": 1.1666666666666667, | |
| "grad_norm": 2.2377689992101195, | |
| "kl": 0.0289306640625, | |
| "learning_rate": 8.854166666666666e-07, | |
| "loss": 0.0012, | |
| "reward": 1.84073007106781, | |
| "reward_std": 0.036864347755908966, | |
| "rewards/accuracy_reward": 0.8407299518585205, | |
| "rewards/format_reward": 1.0, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.11198425292969, | |
| "epoch": 1.1875, | |
| "grad_norm": 1.4566342977805207, | |
| "kl": 0.0250244140625, | |
| "learning_rate": 8.833333333333333e-07, | |
| "loss": 0.001, | |
| "reward": 1.8056623935699463, | |
| "reward_std": 0.04149676859378815, | |
| "rewards/accuracy_reward": 0.8056623339653015, | |
| "rewards/format_reward": 1.0, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.09765625, | |
| "epoch": 1.2083333333333333, | |
| "grad_norm": 6.235839920646965, | |
| "kl": 0.0255126953125, | |
| "learning_rate": 8.812499999999999e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8528995513916016, | |
| "reward_std": 0.033966001123189926, | |
| "rewards/accuracy_reward": 0.8528995513916016, | |
| "rewards/format_reward": 1.0, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.87630462646484, | |
| "epoch": 1.2291666666666667, | |
| "grad_norm": 1.5034574110083547, | |
| "kl": 0.02978515625, | |
| "learning_rate": 8.791666666666666e-07, | |
| "loss": 0.0012, | |
| "reward": 1.835959792137146, | |
| "reward_std": 0.02824896201491356, | |
| "rewards/accuracy_reward": 0.8359596729278564, | |
| "rewards/format_reward": 1.0, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.05599212646484, | |
| "epoch": 1.25, | |
| "grad_norm": 1.6494502522344965, | |
| "kl": 0.0301513671875, | |
| "learning_rate": 8.770833333333333e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8419498205184937, | |
| "reward_std": 0.03271109610795975, | |
| "rewards/accuracy_reward": 0.8419498205184937, | |
| "rewards/format_reward": 1.0, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.515625, | |
| "epoch": 1.2708333333333333, | |
| "grad_norm": 2.5168432356014954, | |
| "kl": 0.02587890625, | |
| "learning_rate": 8.75e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8278319835662842, | |
| "reward_std": 0.04185899719595909, | |
| "rewards/accuracy_reward": 0.8278318643569946, | |
| "rewards/format_reward": 1.0, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.421875, | |
| "epoch": 1.2916666666666667, | |
| "grad_norm": 1.3151086988925837, | |
| "kl": 0.0242919921875, | |
| "learning_rate": 8.729166666666666e-07, | |
| "loss": 0.001, | |
| "reward": 1.8187057971954346, | |
| "reward_std": 0.03452453017234802, | |
| "rewards/accuracy_reward": 0.8187057375907898, | |
| "rewards/format_reward": 1.0, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.04948425292969, | |
| "epoch": 1.3125, | |
| "grad_norm": 2.810138538550519, | |
| "kl": 0.02783203125, | |
| "learning_rate": 8.708333333333333e-07, | |
| "loss": 0.0012, | |
| "reward": 1.846640944480896, | |
| "reward_std": 0.04408061131834984, | |
| "rewards/accuracy_reward": 0.8492451906204224, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.23698425292969, | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 1.563896261562582, | |
| "kl": 0.03076171875, | |
| "learning_rate": 8.687499999999999e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8380591869354248, | |
| "reward_std": 0.03534460812807083, | |
| "rewards/accuracy_reward": 0.8380589485168457, | |
| "rewards/format_reward": 1.0, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.85286712646484, | |
| "epoch": 1.3541666666666667, | |
| "grad_norm": 3.020754504226419, | |
| "kl": 0.0255126953125, | |
| "learning_rate": 8.666666666666667e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8550291061401367, | |
| "reward_std": 0.037260740995407104, | |
| "rewards/accuracy_reward": 0.8563313484191895, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.10026550292969, | |
| "epoch": 1.375, | |
| "grad_norm": 3.306677048985413, | |
| "kl": 0.026123046875, | |
| "learning_rate": 8.645833333333333e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8508269786834717, | |
| "reward_std": 0.0380985364317894, | |
| "rewards/accuracy_reward": 0.8508269190788269, | |
| "rewards/format_reward": 1.0, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.24870300292969, | |
| "epoch": 1.3958333333333333, | |
| "grad_norm": 1.6070004388732007, | |
| "kl": 0.0263671875, | |
| "learning_rate": 8.625e-07, | |
| "loss": 0.0011, | |
| "reward": 1.857025146484375, | |
| "reward_std": 0.03701246529817581, | |
| "rewards/accuracy_reward": 0.857025146484375, | |
| "rewards/format_reward": 1.0, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.89323425292969, | |
| "epoch": 1.4166666666666667, | |
| "grad_norm": 2.3548695186042186, | |
| "kl": 0.02783203125, | |
| "learning_rate": 8.604166666666667e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8029731512069702, | |
| "reward_std": 0.042077165096998215, | |
| "rewards/accuracy_reward": 0.8055772185325623, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.38671875, | |
| "epoch": 1.4375, | |
| "grad_norm": 1.278138762095491, | |
| "kl": 0.0228271484375, | |
| "learning_rate": 8.583333333333332e-07, | |
| "loss": 0.0009, | |
| "reward": 1.8338496685028076, | |
| "reward_std": 0.039083532989025116, | |
| "rewards/accuracy_reward": 0.8338495492935181, | |
| "rewards/format_reward": 1.0, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.65495300292969, | |
| "epoch": 1.4583333333333333, | |
| "grad_norm": 2.070944061321321, | |
| "kl": 0.0284423828125, | |
| "learning_rate": 8.5625e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8258914947509766, | |
| "reward_std": 0.03814253211021423, | |
| "rewards/accuracy_reward": 0.8271937370300293, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.45442962646484, | |
| "epoch": 1.4791666666666667, | |
| "grad_norm": 1.3471433182606216, | |
| "kl": 0.0281982421875, | |
| "learning_rate": 8.541666666666666e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8398675918579102, | |
| "reward_std": 0.0354573093354702, | |
| "rewards/accuracy_reward": 0.8398677706718445, | |
| "rewards/format_reward": 1.0, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.04427337646484, | |
| "epoch": 1.5, | |
| "grad_norm": 1.6026685707149524, | |
| "kl": 0.02880859375, | |
| "learning_rate": 8.520833333333333e-07, | |
| "loss": 0.0012, | |
| "reward": 1.853353500366211, | |
| "reward_std": 0.03885906934738159, | |
| "rewards/accuracy_reward": 0.8546554446220398, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.84635925292969, | |
| "epoch": 1.5208333333333335, | |
| "grad_norm": 2.1398544036186014, | |
| "kl": 0.02783203125, | |
| "learning_rate": 8.499999999999999e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8261979818344116, | |
| "reward_std": 0.03391870856285095, | |
| "rewards/accuracy_reward": 0.8261978626251221, | |
| "rewards/format_reward": 1.0, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.69271087646484, | |
| "epoch": 1.5416666666666665, | |
| "grad_norm": 4.020748588139163, | |
| "kl": 0.025634765625, | |
| "learning_rate": 8.479166666666667e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8563817739486694, | |
| "reward_std": 0.031248420476913452, | |
| "rewards/accuracy_reward": 0.8563817739486694, | |
| "rewards/format_reward": 1.0, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.48046875, | |
| "epoch": 1.5625, | |
| "grad_norm": 1.3740118540705408, | |
| "kl": 0.0291748046875, | |
| "learning_rate": 8.458333333333333e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8323755264282227, | |
| "reward_std": 0.03359724208712578, | |
| "rewards/accuracy_reward": 0.8323755264282227, | |
| "rewards/format_reward": 1.0, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.44792175292969, | |
| "epoch": 1.5833333333333335, | |
| "grad_norm": 2.2940140494468553, | |
| "kl": 0.026611328125, | |
| "learning_rate": 8.4375e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8279385566711426, | |
| "reward_std": 0.03183002024888992, | |
| "rewards/accuracy_reward": 0.8279385566711426, | |
| "rewards/format_reward": 1.0, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.87890625, | |
| "epoch": 1.6041666666666665, | |
| "grad_norm": 1.9014935653138212, | |
| "kl": 0.0299072265625, | |
| "learning_rate": 8.416666666666666e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8360404968261719, | |
| "reward_std": 0.02742108330130577, | |
| "rewards/accuracy_reward": 0.8360404372215271, | |
| "rewards/format_reward": 1.0, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.87239837646484, | |
| "epoch": 1.625, | |
| "grad_norm": 1.7025904896492243, | |
| "kl": 0.02587890625, | |
| "learning_rate": 8.395833333333333e-07, | |
| "loss": 0.0012, | |
| "reward": 1.7988513708114624, | |
| "reward_std": 0.035461340099573135, | |
| "rewards/accuracy_reward": 0.7988513708114624, | |
| "rewards/format_reward": 1.0, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.25651550292969, | |
| "epoch": 1.6458333333333335, | |
| "grad_norm": 2.0737766177718564, | |
| "kl": 0.0262451171875, | |
| "learning_rate": 8.375e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8587430715560913, | |
| "reward_std": 0.031236987560987473, | |
| "rewards/accuracy_reward": 0.8600451946258545, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.46875, | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 1.8776151359500948, | |
| "kl": 0.0283203125, | |
| "learning_rate": 8.354166666666667e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8535531759262085, | |
| "reward_std": 0.03250068426132202, | |
| "rewards/accuracy_reward": 0.853553056716919, | |
| "rewards/format_reward": 1.0, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.76692962646484, | |
| "epoch": 1.6875, | |
| "grad_norm": 1.4792182225675727, | |
| "kl": 0.0274658203125, | |
| "learning_rate": 8.333333333333333e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8359715938568115, | |
| "reward_std": 0.027499686926603317, | |
| "rewards/accuracy_reward": 0.8359713554382324, | |
| "rewards/format_reward": 1.0, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.09114837646484, | |
| "epoch": 1.7083333333333335, | |
| "grad_norm": 1.429067785258255, | |
| "kl": 0.034423828125, | |
| "learning_rate": 8.3125e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8376563787460327, | |
| "reward_std": 0.03089229390025139, | |
| "rewards/accuracy_reward": 0.8389585018157959, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.81901550292969, | |
| "epoch": 1.7291666666666665, | |
| "grad_norm": 1.7947286530222653, | |
| "kl": 0.0269775390625, | |
| "learning_rate": 8.291666666666666e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8310506343841553, | |
| "reward_std": 0.0325641892850399, | |
| "rewards/accuracy_reward": 0.8323527574539185, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.32292175292969, | |
| "epoch": 1.75, | |
| "grad_norm": 1.840921164440233, | |
| "kl": 0.028076171875, | |
| "learning_rate": 8.270833333333333e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8312219381332397, | |
| "reward_std": 0.029772888869047165, | |
| "rewards/accuracy_reward": 0.8312219381332397, | |
| "rewards/format_reward": 1.0, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.17839050292969, | |
| "epoch": 1.7708333333333335, | |
| "grad_norm": 1.8438240192624837, | |
| "kl": 0.02685546875, | |
| "learning_rate": 8.249999999999999e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8248231410980225, | |
| "reward_std": 0.03228841722011566, | |
| "rewards/accuracy_reward": 0.8248231410980225, | |
| "rewards/format_reward": 1.0, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.45703125, | |
| "epoch": 1.7916666666666665, | |
| "grad_norm": 1.529693340603501, | |
| "kl": 0.028076171875, | |
| "learning_rate": 8.229166666666666e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8117046356201172, | |
| "reward_std": 0.03222049027681351, | |
| "rewards/accuracy_reward": 0.8117045760154724, | |
| "rewards/format_reward": 1.0, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.87630462646484, | |
| "epoch": 1.8125, | |
| "grad_norm": 3.9505026027468504, | |
| "kl": 0.031494140625, | |
| "learning_rate": 8.208333333333332e-07, | |
| "loss": 0.0013, | |
| "reward": 1.814018726348877, | |
| "reward_std": 0.031958386301994324, | |
| "rewards/accuracy_reward": 0.814018726348877, | |
| "rewards/format_reward": 1.0, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.60807800292969, | |
| "epoch": 1.8333333333333335, | |
| "grad_norm": 9.21528924853295, | |
| "kl": 0.0274658203125, | |
| "learning_rate": 8.187499999999999e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8301608562469482, | |
| "reward_std": 0.02933676168322563, | |
| "rewards/accuracy_reward": 0.8301607370376587, | |
| "rewards/format_reward": 1.0, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.74089050292969, | |
| "epoch": 1.8541666666666665, | |
| "grad_norm": 1.329936418308406, | |
| "kl": 0.03173828125, | |
| "learning_rate": 8.166666666666666e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8375142812728882, | |
| "reward_std": 0.03457921743392944, | |
| "rewards/accuracy_reward": 0.8375141024589539, | |
| "rewards/format_reward": 1.0, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.92578125, | |
| "epoch": 1.875, | |
| "grad_norm": 1.3218491918211248, | |
| "kl": 0.029541015625, | |
| "learning_rate": 8.145833333333333e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8522485494613647, | |
| "reward_std": 0.028189565986394882, | |
| "rewards/accuracy_reward": 0.8522485494613647, | |
| "rewards/format_reward": 1.0, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.46354675292969, | |
| "epoch": 1.8958333333333335, | |
| "grad_norm": 2.094729152422118, | |
| "kl": 0.0283203125, | |
| "learning_rate": 8.125e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8424299955368042, | |
| "reward_std": 0.034867409616708755, | |
| "rewards/accuracy_reward": 0.8424299955368042, | |
| "rewards/format_reward": 1.0, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.65364837646484, | |
| "epoch": 1.9166666666666665, | |
| "grad_norm": 7.368992349088265, | |
| "kl": 0.0283203125, | |
| "learning_rate": 8.104166666666666e-07, | |
| "loss": 0.0012, | |
| "reward": 1.855029582977295, | |
| "reward_std": 0.027775254100561142, | |
| "rewards/accuracy_reward": 0.8550295233726501, | |
| "rewards/format_reward": 1.0, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.18880462646484, | |
| "epoch": 1.9375, | |
| "grad_norm": 2.0045320605738333, | |
| "kl": 0.0341796875, | |
| "learning_rate": 8.083333333333334e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8531224727630615, | |
| "reward_std": 0.02575305663049221, | |
| "rewards/accuracy_reward": 0.853122353553772, | |
| "rewards/format_reward": 1.0, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.109375, | |
| "epoch": 1.9583333333333335, | |
| "grad_norm": 1.9199651103984947, | |
| "kl": 0.02587890625, | |
| "learning_rate": 8.0625e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8517922163009644, | |
| "reward_std": 0.02801516279578209, | |
| "rewards/accuracy_reward": 0.8517922163009644, | |
| "rewards/format_reward": 1.0, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.6648941040039, | |
| "epoch": 1.9791666666666665, | |
| "grad_norm": 1.2125015465439164, | |
| "kl": 0.025146484375, | |
| "learning_rate": 8.041666666666667e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8468000888824463, | |
| "reward_std": 0.027415748685598373, | |
| "rewards/accuracy_reward": 0.8467998504638672, | |
| "rewards/format_reward": 1.0, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.87109375, | |
| "epoch": 2.0208333333333335, | |
| "grad_norm": 2.893832337696663, | |
| "kl": 0.0240478515625, | |
| "learning_rate": 8.020833333333333e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8671659231185913, | |
| "reward_std": 0.03437124937772751, | |
| "rewards/accuracy_reward": 0.8684679865837097, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.80729675292969, | |
| "epoch": 2.0416666666666665, | |
| "grad_norm": 2.4656073464269124, | |
| "kl": 0.028564453125, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8920382261276245, | |
| "reward_std": 0.024690520018339157, | |
| "rewards/accuracy_reward": 0.8933402895927429, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.49739837646484, | |
| "epoch": 2.0625, | |
| "grad_norm": 2.8323749391253124, | |
| "kl": 0.025390625, | |
| "learning_rate": 7.979166666666667e-07, | |
| "loss": 0.001, | |
| "reward": 1.8716254234313965, | |
| "reward_std": 0.028798673301935196, | |
| "rewards/accuracy_reward": 0.8716254234313965, | |
| "rewards/format_reward": 1.0, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.86979675292969, | |
| "epoch": 2.0833333333333335, | |
| "grad_norm": 1.842256938186257, | |
| "kl": 0.0303955078125, | |
| "learning_rate": 7.958333333333333e-07, | |
| "loss": 0.0013, | |
| "reward": 1.849453330039978, | |
| "reward_std": 0.033137306571006775, | |
| "rewards/accuracy_reward": 0.849453330039978, | |
| "rewards/format_reward": 1.0, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.98828125, | |
| "epoch": 2.1041666666666665, | |
| "grad_norm": 1.625206074387942, | |
| "kl": 0.02783203125, | |
| "learning_rate": 7.937499999999999e-07, | |
| "loss": 0.0012, | |
| "reward": 1.864532232284546, | |
| "reward_std": 0.02849118784070015, | |
| "rewards/accuracy_reward": 0.8645319938659668, | |
| "rewards/format_reward": 1.0, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.125, | |
| "epoch": 2.125, | |
| "grad_norm": 1.8687654433880987, | |
| "kl": 0.02880859375, | |
| "learning_rate": 7.916666666666666e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8906430006027222, | |
| "reward_std": 0.02808341383934021, | |
| "rewards/accuracy_reward": 0.8906428813934326, | |
| "rewards/format_reward": 1.0, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.30859375, | |
| "epoch": 2.1458333333333335, | |
| "grad_norm": 1.812765837401351, | |
| "kl": 0.0341796875, | |
| "learning_rate": 7.895833333333332e-07, | |
| "loss": 0.0015, | |
| "reward": 1.84462308883667, | |
| "reward_std": 0.02905876934528351, | |
| "rewards/accuracy_reward": 0.8446230292320251, | |
| "rewards/format_reward": 1.0, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.15625, | |
| "epoch": 2.1666666666666665, | |
| "grad_norm": 1.3199461131282353, | |
| "kl": 0.04736328125, | |
| "learning_rate": 7.875e-07, | |
| "loss": 0.002, | |
| "reward": 1.881453275680542, | |
| "reward_std": 0.025718865916132927, | |
| "rewards/accuracy_reward": 0.8814532160758972, | |
| "rewards/format_reward": 1.0, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.74739837646484, | |
| "epoch": 2.1875, | |
| "grad_norm": 1.4379737344223804, | |
| "kl": 0.0257568359375, | |
| "learning_rate": 7.854166666666666e-07, | |
| "loss": 0.0011, | |
| "reward": 1.852853775024414, | |
| "reward_std": 0.03441212326288223, | |
| "rewards/accuracy_reward": 0.8541558980941772, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.17839050292969, | |
| "epoch": 2.2083333333333335, | |
| "grad_norm": 1.4747961451326568, | |
| "kl": 0.02587890625, | |
| "learning_rate": 7.833333333333333e-07, | |
| "loss": 0.0011, | |
| "reward": 1.8564555644989014, | |
| "reward_std": 0.03334889933466911, | |
| "rewards/accuracy_reward": 0.8564555048942566, | |
| "rewards/format_reward": 1.0, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.84114837646484, | |
| "epoch": 2.2291666666666665, | |
| "grad_norm": 1.595566840318838, | |
| "kl": 0.0277099609375, | |
| "learning_rate": 7.812499999999999e-07, | |
| "loss": 0.0012, | |
| "reward": 1.847129225730896, | |
| "reward_std": 0.034906916320323944, | |
| "rewards/accuracy_reward": 0.847129225730896, | |
| "rewards/format_reward": 1.0, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.56640625, | |
| "epoch": 2.25, | |
| "grad_norm": 1.7252491389020819, | |
| "kl": 0.029296875, | |
| "learning_rate": 7.791666666666667e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8392118215560913, | |
| "reward_std": 0.032303862273693085, | |
| "rewards/accuracy_reward": 0.8392118811607361, | |
| "rewards/format_reward": 1.0, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.73046875, | |
| "epoch": 2.2708333333333335, | |
| "grad_norm": 1.3233610087560426, | |
| "kl": 0.027587890625, | |
| "learning_rate": 7.770833333333333e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8880364894866943, | |
| "reward_std": 0.0294729545712471, | |
| "rewards/accuracy_reward": 0.8880362510681152, | |
| "rewards/format_reward": 1.0, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.72135925292969, | |
| "epoch": 2.2916666666666665, | |
| "grad_norm": 1.3388400280888855, | |
| "kl": 0.0308837890625, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0013, | |
| "reward": 1.841947078704834, | |
| "reward_std": 0.03206552192568779, | |
| "rewards/accuracy_reward": 0.8419471979141235, | |
| "rewards/format_reward": 1.0, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.37239837646484, | |
| "epoch": 2.3125, | |
| "grad_norm": 1.6808004103234182, | |
| "kl": 0.0301513671875, | |
| "learning_rate": 7.729166666666666e-07, | |
| "loss": 0.0012, | |
| "reward": 1.872849464416504, | |
| "reward_std": 0.028602521866559982, | |
| "rewards/accuracy_reward": 0.8728495836257935, | |
| "rewards/format_reward": 1.0, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.41536712646484, | |
| "epoch": 2.3333333333333335, | |
| "grad_norm": 1.1449720839601671, | |
| "kl": 0.0284423828125, | |
| "learning_rate": 7.708333333333333e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8464049100875854, | |
| "reward_std": 0.027897782623767853, | |
| "rewards/accuracy_reward": 0.8464047908782959, | |
| "rewards/format_reward": 1.0, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.18620300292969, | |
| "epoch": 2.3541666666666665, | |
| "grad_norm": 1.651982521267855, | |
| "kl": 0.0279541015625, | |
| "learning_rate": 7.6875e-07, | |
| "loss": 0.0012, | |
| "reward": 1.859885573387146, | |
| "reward_std": 0.03053418919444084, | |
| "rewards/accuracy_reward": 0.8611876368522644, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.31901550292969, | |
| "epoch": 2.375, | |
| "grad_norm": 1.273847905587726, | |
| "kl": 0.03125, | |
| "learning_rate": 7.666666666666667e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8361539840698242, | |
| "reward_std": 0.03006243333220482, | |
| "rewards/accuracy_reward": 0.8374561667442322, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.92578125, | |
| "epoch": 2.3958333333333335, | |
| "grad_norm": 1.1387571840491482, | |
| "kl": 0.0301513671875, | |
| "learning_rate": 7.645833333333332e-07, | |
| "loss": 0.0013, | |
| "reward": 1.866465449333191, | |
| "reward_std": 0.023312915116548538, | |
| "rewards/accuracy_reward": 0.8664655089378357, | |
| "rewards/format_reward": 1.0, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.92708587646484, | |
| "epoch": 2.4166666666666665, | |
| "grad_norm": 1.3560724890382403, | |
| "kl": 0.029541015625, | |
| "learning_rate": 7.624999999999999e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8626346588134766, | |
| "reward_std": 0.025532353669404984, | |
| "rewards/accuracy_reward": 0.8626348376274109, | |
| "rewards/format_reward": 1.0, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.17448425292969, | |
| "epoch": 2.4375, | |
| "grad_norm": 1.3842406282593351, | |
| "kl": 0.03076171875, | |
| "learning_rate": 7.604166666666666e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8882970809936523, | |
| "reward_std": 0.022818906232714653, | |
| "rewards/accuracy_reward": 0.8882970809936523, | |
| "rewards/format_reward": 1.0, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.09375, | |
| "epoch": 2.4583333333333335, | |
| "grad_norm": 1.4789503320621176, | |
| "kl": 0.03271484375, | |
| "learning_rate": 7.583333333333333e-07, | |
| "loss": 0.0014, | |
| "reward": 1.86592435836792, | |
| "reward_std": 0.025339588522911072, | |
| "rewards/accuracy_reward": 0.8659243583679199, | |
| "rewards/format_reward": 1.0, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.37890625, | |
| "epoch": 2.4791666666666665, | |
| "grad_norm": 2.1260312341423484, | |
| "kl": 0.0281982421875, | |
| "learning_rate": 7.5625e-07, | |
| "loss": 0.0012, | |
| "reward": 1.8531742095947266, | |
| "reward_std": 0.025770537555217743, | |
| "rewards/accuracy_reward": 0.8531742095947266, | |
| "rewards/format_reward": 1.0, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.8828125, | |
| "epoch": 2.5, | |
| "grad_norm": 8.068389238759387, | |
| "kl": 0.0341796875, | |
| "learning_rate": 7.541666666666666e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8539113998413086, | |
| "reward_std": 0.026992302387952805, | |
| "rewards/accuracy_reward": 0.8552135825157166, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.63932800292969, | |
| "epoch": 2.5208333333333335, | |
| "grad_norm": 4.211053430432126, | |
| "kl": 0.03662109375, | |
| "learning_rate": 7.520833333333333e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8711378574371338, | |
| "reward_std": 0.033812545239925385, | |
| "rewards/accuracy_reward": 0.8724400401115417, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.11979675292969, | |
| "epoch": 2.5416666666666665, | |
| "grad_norm": 1.9127642747673794, | |
| "kl": 0.034423828125, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8767274618148804, | |
| "reward_std": 0.03441750630736351, | |
| "rewards/accuracy_reward": 0.8793315887451172, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.91796875, | |
| "epoch": 2.5625, | |
| "grad_norm": 2.7351925655744354, | |
| "kl": 0.03515625, | |
| "learning_rate": 7.479166666666667e-07, | |
| "loss": 0.0015, | |
| "reward": 1.848404884338379, | |
| "reward_std": 0.02676878124475479, | |
| "rewards/accuracy_reward": 0.8484048843383789, | |
| "rewards/format_reward": 1.0, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.83203125, | |
| "epoch": 2.5833333333333335, | |
| "grad_norm": 1.9431827631537368, | |
| "kl": 0.037841796875, | |
| "learning_rate": 7.458333333333333e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8470710515975952, | |
| "reward_std": 0.03329627588391304, | |
| "rewards/accuracy_reward": 0.8496752977371216, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 116.77474212646484, | |
| "epoch": 2.6041666666666665, | |
| "grad_norm": 1.5150916718262497, | |
| "kl": 0.04296875, | |
| "learning_rate": 7.4375e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8823201656341553, | |
| "reward_std": 0.028970589861273766, | |
| "rewards/accuracy_reward": 0.8823199272155762, | |
| "rewards/format_reward": 1.0, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.453125, | |
| "epoch": 2.625, | |
| "grad_norm": 1.349104952050238, | |
| "kl": 0.037841796875, | |
| "learning_rate": 7.416666666666666e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8449368476867676, | |
| "reward_std": 0.040684543550014496, | |
| "rewards/accuracy_reward": 0.8462389707565308, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.18620300292969, | |
| "epoch": 2.6458333333333335, | |
| "grad_norm": 1.3525153623022323, | |
| "kl": 0.041748046875, | |
| "learning_rate": 7.395833333333334e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8602406978607178, | |
| "reward_std": 0.028682291507720947, | |
| "rewards/accuracy_reward": 0.8628449440002441, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.31771087646484, | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 3.3377952313667607, | |
| "kl": 0.03515625, | |
| "learning_rate": 7.375e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8633064031600952, | |
| "reward_std": 0.029187675565481186, | |
| "rewards/accuracy_reward": 0.8633064031600952, | |
| "rewards/format_reward": 1.0, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.09765625, | |
| "epoch": 2.6875, | |
| "grad_norm": 3.295416813891869, | |
| "kl": 0.034912109375, | |
| "learning_rate": 7.354166666666667e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8418749570846558, | |
| "reward_std": 0.030345208942890167, | |
| "rewards/accuracy_reward": 0.8418749570846558, | |
| "rewards/format_reward": 1.0, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.78646087646484, | |
| "epoch": 2.7083333333333335, | |
| "grad_norm": 1.8029248151985298, | |
| "kl": 0.038330078125, | |
| "learning_rate": 7.333333333333332e-07, | |
| "loss": 0.0016, | |
| "reward": 1.878383994102478, | |
| "reward_std": 0.030093541368842125, | |
| "rewards/accuracy_reward": 0.8796859979629517, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.72526550292969, | |
| "epoch": 2.7291666666666665, | |
| "grad_norm": 4.026175515300724, | |
| "kl": 0.037841796875, | |
| "learning_rate": 7.312499999999999e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8830840587615967, | |
| "reward_std": 0.03116484545171261, | |
| "rewards/accuracy_reward": 0.8830841779708862, | |
| "rewards/format_reward": 1.0, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.31510925292969, | |
| "epoch": 2.75, | |
| "grad_norm": 3.247277440713756, | |
| "kl": 0.036865234375, | |
| "learning_rate": 7.291666666666666e-07, | |
| "loss": 0.0015, | |
| "reward": 1.88235604763031, | |
| "reward_std": 0.03274049982428551, | |
| "rewards/accuracy_reward": 0.8836580514907837, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.67708587646484, | |
| "epoch": 2.7708333333333335, | |
| "grad_norm": 2.3589645345270296, | |
| "kl": 0.036376953125, | |
| "learning_rate": 7.270833333333333e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8647716045379639, | |
| "reward_std": 0.037114016711711884, | |
| "rewards/accuracy_reward": 0.8673758506774902, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.82292175292969, | |
| "epoch": 2.7916666666666665, | |
| "grad_norm": 1.3075748353830943, | |
| "kl": 0.038330078125, | |
| "learning_rate": 7.249999999999999e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8723247051239014, | |
| "reward_std": 0.042349301278591156, | |
| "rewards/accuracy_reward": 0.874928891658783, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.65885925292969, | |
| "epoch": 2.8125, | |
| "grad_norm": 1.7497378920781035, | |
| "kl": 0.041259765625, | |
| "learning_rate": 7.229166666666666e-07, | |
| "loss": 0.0017, | |
| "reward": 1.870335578918457, | |
| "reward_std": 0.03280794620513916, | |
| "rewards/accuracy_reward": 0.8703355193138123, | |
| "rewards/format_reward": 1.0, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.25651550292969, | |
| "epoch": 2.8333333333333335, | |
| "grad_norm": 1.8805437427920557, | |
| "kl": 0.037841796875, | |
| "learning_rate": 7.208333333333332e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8854488134384155, | |
| "reward_std": 0.030766207724809647, | |
| "rewards/accuracy_reward": 0.8867508769035339, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.58333587646484, | |
| "epoch": 2.8541666666666665, | |
| "grad_norm": 1.9097670028819336, | |
| "kl": 0.034423828125, | |
| "learning_rate": 7.1875e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8947999477386475, | |
| "reward_std": 0.02784878760576248, | |
| "rewards/accuracy_reward": 0.8961019515991211, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.38802337646484, | |
| "epoch": 2.875, | |
| "grad_norm": 2.148757546537433, | |
| "kl": 0.035888671875, | |
| "learning_rate": 7.166666666666667e-07, | |
| "loss": 0.0015, | |
| "reward": 1.867633581161499, | |
| "reward_std": 0.027851156890392303, | |
| "rewards/accuracy_reward": 0.867633581161499, | |
| "rewards/format_reward": 1.0, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.4921875, | |
| "epoch": 2.8958333333333335, | |
| "grad_norm": 1.68972106618665, | |
| "kl": 0.031982421875, | |
| "learning_rate": 7.145833333333333e-07, | |
| "loss": 0.0013, | |
| "reward": 1.8540122509002686, | |
| "reward_std": 0.03191521018743515, | |
| "rewards/accuracy_reward": 0.854012131690979, | |
| "rewards/format_reward": 1.0, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.47135925292969, | |
| "epoch": 2.9166666666666665, | |
| "grad_norm": 4.753459469453496, | |
| "kl": 0.037353515625, | |
| "learning_rate": 7.125e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8603503704071045, | |
| "reward_std": 0.03011004999279976, | |
| "rewards/accuracy_reward": 0.8603503704071045, | |
| "rewards/format_reward": 1.0, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.7109375, | |
| "epoch": 2.9375, | |
| "grad_norm": 3.309769561279882, | |
| "kl": 0.03369140625, | |
| "learning_rate": 7.104166666666667e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8809528350830078, | |
| "reward_std": 0.029512833803892136, | |
| "rewards/accuracy_reward": 0.8809528350830078, | |
| "rewards/format_reward": 1.0, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.109375, | |
| "epoch": 2.9583333333333335, | |
| "grad_norm": 1.5596309697590072, | |
| "kl": 0.034912109375, | |
| "learning_rate": 7.083333333333334e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8926336765289307, | |
| "reward_std": 0.029566586017608643, | |
| "rewards/accuracy_reward": 0.8926336169242859, | |
| "rewards/format_reward": 1.0, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.12683868408203, | |
| "epoch": 2.9791666666666665, | |
| "grad_norm": 2.3377448744574503, | |
| "kl": 0.035400390625, | |
| "learning_rate": 7.0625e-07, | |
| "loss": 0.0014, | |
| "reward": 1.870600938796997, | |
| "reward_std": 0.028039831668138504, | |
| "rewards/accuracy_reward": 0.8706008791923523, | |
| "rewards/format_reward": 1.0, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.96745300292969, | |
| "epoch": 3.0208333333333335, | |
| "grad_norm": 1.2246745731883282, | |
| "kl": 0.031494140625, | |
| "learning_rate": 7.041666666666667e-07, | |
| "loss": 0.0013, | |
| "reward": 1.9021186828613281, | |
| "reward_std": 0.023153727874159813, | |
| "rewards/accuracy_reward": 0.9021186828613281, | |
| "rewards/format_reward": 1.0, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 117.39974212646484, | |
| "epoch": 3.0416666666666665, | |
| "grad_norm": 2.052509803568965, | |
| "kl": 0.03564453125, | |
| "learning_rate": 7.020833333333332e-07, | |
| "loss": 0.0015, | |
| "reward": 1.875624656677246, | |
| "reward_std": 0.026206960901618004, | |
| "rewards/accuracy_reward": 0.876926839351654, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.80208587646484, | |
| "epoch": 3.0625, | |
| "grad_norm": 2.354461875696106, | |
| "kl": 0.033203125, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8924367427825928, | |
| "reward_std": 0.027437550947070122, | |
| "rewards/accuracy_reward": 0.8924366235733032, | |
| "rewards/format_reward": 1.0, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 116.94792175292969, | |
| "epoch": 3.0833333333333335, | |
| "grad_norm": 2.0470904135861314, | |
| "kl": 0.03759765625, | |
| "learning_rate": 6.979166666666666e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8674253225326538, | |
| "reward_std": 0.03383837640285492, | |
| "rewards/accuracy_reward": 0.8687273263931274, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 116.73177337646484, | |
| "epoch": 3.1041666666666665, | |
| "grad_norm": 2.4829007525918714, | |
| "kl": 0.035400390625, | |
| "learning_rate": 6.958333333333333e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8920822143554688, | |
| "reward_std": 0.027063176035881042, | |
| "rewards/accuracy_reward": 0.8933842182159424, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 116.31380462646484, | |
| "epoch": 3.125, | |
| "grad_norm": 1.8187259240411606, | |
| "kl": 0.03564453125, | |
| "learning_rate": 6.937499999999999e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8620394468307495, | |
| "reward_std": 0.026920508593320847, | |
| "rewards/accuracy_reward": 0.86203932762146, | |
| "rewards/format_reward": 1.0, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 116.4765625, | |
| "epoch": 3.1458333333333335, | |
| "grad_norm": 2.130554923785352, | |
| "kl": 0.038818359375, | |
| "learning_rate": 6.916666666666666e-07, | |
| "loss": 0.0016, | |
| "reward": 1.881149172782898, | |
| "reward_std": 0.03235545754432678, | |
| "rewards/accuracy_reward": 0.8837532997131348, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.44271087646484, | |
| "epoch": 3.1666666666666665, | |
| "grad_norm": 1.3415308591991317, | |
| "kl": 0.041015625, | |
| "learning_rate": 6.895833333333333e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8772742748260498, | |
| "reward_std": 0.03051435947418213, | |
| "rewards/accuracy_reward": 0.8785762190818787, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.02604675292969, | |
| "epoch": 3.1875, | |
| "grad_norm": 3.61409411357304, | |
| "kl": 0.035400390625, | |
| "learning_rate": 6.875e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8919541835784912, | |
| "reward_std": 0.025183459743857384, | |
| "rewards/accuracy_reward": 0.8919543027877808, | |
| "rewards/format_reward": 1.0, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.07161712646484, | |
| "epoch": 3.2083333333333335, | |
| "grad_norm": 1.6998976071316378, | |
| "kl": 0.03955078125, | |
| "learning_rate": 6.854166666666666e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8747581243515015, | |
| "reward_std": 0.0327390655875206, | |
| "rewards/accuracy_reward": 0.8760601878166199, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.97396087646484, | |
| "epoch": 3.2291666666666665, | |
| "grad_norm": 4.510995755729955, | |
| "kl": 0.034912109375, | |
| "learning_rate": 6.833333333333333e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8948159217834473, | |
| "reward_std": 0.024841880425810814, | |
| "rewards/accuracy_reward": 0.8948158621788025, | |
| "rewards/format_reward": 1.0, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.51432800292969, | |
| "epoch": 3.25, | |
| "grad_norm": 1.434499918763638, | |
| "kl": 0.03515625, | |
| "learning_rate": 6.8125e-07, | |
| "loss": 0.0015, | |
| "reward": 1.9077692031860352, | |
| "reward_std": 0.02048024721443653, | |
| "rewards/accuracy_reward": 0.9077692031860352, | |
| "rewards/format_reward": 1.0, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.29948425292969, | |
| "epoch": 3.2708333333333335, | |
| "grad_norm": 2.6480453809741307, | |
| "kl": 0.03662109375, | |
| "learning_rate": 6.791666666666667e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8736419677734375, | |
| "reward_std": 0.026421895250678062, | |
| "rewards/accuracy_reward": 0.8736419677734375, | |
| "rewards/format_reward": 1.0, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.30339050292969, | |
| "epoch": 3.2916666666666665, | |
| "grad_norm": 1.6157614309136974, | |
| "kl": 0.034423828125, | |
| "learning_rate": 6.770833333333333e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8868639469146729, | |
| "reward_std": 0.02565614879131317, | |
| "rewards/accuracy_reward": 0.8868638873100281, | |
| "rewards/format_reward": 1.0, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.47917175292969, | |
| "epoch": 3.3125, | |
| "grad_norm": 3.614042738523906, | |
| "kl": 0.03125, | |
| "learning_rate": 6.75e-07, | |
| "loss": 0.0014, | |
| "reward": 1.854689598083496, | |
| "reward_std": 0.026655998080968857, | |
| "rewards/accuracy_reward": 0.8546894788742065, | |
| "rewards/format_reward": 1.0, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.14714050292969, | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 3.6282010355717285, | |
| "kl": 0.035400390625, | |
| "learning_rate": 6.729166666666666e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8728668689727783, | |
| "reward_std": 0.02623789571225643, | |
| "rewards/accuracy_reward": 0.8728668093681335, | |
| "rewards/format_reward": 1.0, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.16146087646484, | |
| "epoch": 3.3541666666666665, | |
| "grad_norm": 3.096302314051656, | |
| "kl": 0.07373046875, | |
| "learning_rate": 6.708333333333333e-07, | |
| "loss": 0.003, | |
| "reward": 1.8780524730682373, | |
| "reward_std": 0.02697448432445526, | |
| "rewards/accuracy_reward": 0.8780522346496582, | |
| "rewards/format_reward": 1.0, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.66536712646484, | |
| "epoch": 3.375, | |
| "grad_norm": 1.8815482668618946, | |
| "kl": 0.034423828125, | |
| "learning_rate": 6.6875e-07, | |
| "loss": 0.0015, | |
| "reward": 1.869466781616211, | |
| "reward_std": 0.025683503597974777, | |
| "rewards/accuracy_reward": 0.8694667816162109, | |
| "rewards/format_reward": 1.0, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.4375, | |
| "epoch": 3.3958333333333335, | |
| "grad_norm": 3.6534900968467423, | |
| "kl": 0.036376953125, | |
| "learning_rate": 6.666666666666666e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8859853744506836, | |
| "reward_std": 0.025588493794202805, | |
| "rewards/accuracy_reward": 0.8859855532646179, | |
| "rewards/format_reward": 1.0, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.21875, | |
| "epoch": 3.4166666666666665, | |
| "grad_norm": 2.8670572231969214, | |
| "kl": 0.0361328125, | |
| "learning_rate": 6.645833333333333e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8808234930038452, | |
| "reward_std": 0.025275420397520065, | |
| "rewards/accuracy_reward": 0.8808236122131348, | |
| "rewards/format_reward": 1.0, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.73567962646484, | |
| "epoch": 3.4375, | |
| "grad_norm": 1.291526238821269, | |
| "kl": 0.033203125, | |
| "learning_rate": 6.624999999999999e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8656648397445679, | |
| "reward_std": 0.02325437031686306, | |
| "rewards/accuracy_reward": 0.8656649589538574, | |
| "rewards/format_reward": 1.0, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.76953125, | |
| "epoch": 3.4583333333333335, | |
| "grad_norm": 3.2481258355141716, | |
| "kl": 0.034423828125, | |
| "learning_rate": 6.604166666666667e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8807281255722046, | |
| "reward_std": 0.023684537038207054, | |
| "rewards/accuracy_reward": 0.8807281255722046, | |
| "rewards/format_reward": 1.0, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.98046875, | |
| "epoch": 3.4791666666666665, | |
| "grad_norm": 1.913896791712438, | |
| "kl": 0.034912109375, | |
| "learning_rate": 6.583333333333333e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8801133632659912, | |
| "reward_std": 0.020142659544944763, | |
| "rewards/accuracy_reward": 0.8801132440567017, | |
| "rewards/format_reward": 1.0, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.25651550292969, | |
| "epoch": 3.5, | |
| "grad_norm": 1.5822819853292256, | |
| "kl": 0.037109375, | |
| "learning_rate": 6.5625e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8631795644760132, | |
| "reward_std": 0.023415524512529373, | |
| "rewards/accuracy_reward": 0.8631795644760132, | |
| "rewards/format_reward": 1.0, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 102.02083587646484, | |
| "epoch": 3.5208333333333335, | |
| "grad_norm": 1.4328687070061792, | |
| "kl": 0.0341796875, | |
| "learning_rate": 6.541666666666666e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8415846824645996, | |
| "reward_std": 0.022014908492565155, | |
| "rewards/accuracy_reward": 0.8415846824645996, | |
| "rewards/format_reward": 1.0, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.24609375, | |
| "epoch": 3.5416666666666665, | |
| "grad_norm": 2.6519462272626697, | |
| "kl": 0.033447265625, | |
| "learning_rate": 6.520833333333333e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8888130187988281, | |
| "reward_std": 0.0215926356613636, | |
| "rewards/accuracy_reward": 0.8888130187988281, | |
| "rewards/format_reward": 1.0, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 101.84375, | |
| "epoch": 3.5625, | |
| "grad_norm": 1.7631289618422423, | |
| "kl": 0.032958984375, | |
| "learning_rate": 6.5e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8750677108764648, | |
| "reward_std": 0.023012561723589897, | |
| "rewards/accuracy_reward": 0.8750675916671753, | |
| "rewards/format_reward": 1.0, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.60026550292969, | |
| "epoch": 3.5833333333333335, | |
| "grad_norm": 2.0388719106728863, | |
| "kl": 0.0361328125, | |
| "learning_rate": 6.479166666666667e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8989547491073608, | |
| "reward_std": 0.024981368333101273, | |
| "rewards/accuracy_reward": 0.8989547491073608, | |
| "rewards/format_reward": 1.0, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.02864837646484, | |
| "epoch": 3.6041666666666665, | |
| "grad_norm": 3.0894511291299485, | |
| "kl": 0.03369140625, | |
| "learning_rate": 6.458333333333333e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8803495168685913, | |
| "reward_std": 0.024355322122573853, | |
| "rewards/accuracy_reward": 0.8803495168685913, | |
| "rewards/format_reward": 1.0, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.703125, | |
| "epoch": 3.625, | |
| "grad_norm": 1.6763141295980748, | |
| "kl": 0.03662109375, | |
| "learning_rate": 6.4375e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8981932401657104, | |
| "reward_std": 0.0216450747102499, | |
| "rewards/accuracy_reward": 0.898193359375, | |
| "rewards/format_reward": 1.0, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.81901550292969, | |
| "epoch": 3.6458333333333335, | |
| "grad_norm": 1.5197798990089235, | |
| "kl": 0.03369140625, | |
| "learning_rate": 6.416666666666667e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8727521896362305, | |
| "reward_std": 0.02708452008664608, | |
| "rewards/accuracy_reward": 0.87275230884552, | |
| "rewards/format_reward": 1.0, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.34245300292969, | |
| "epoch": 3.6666666666666665, | |
| "grad_norm": 4.145991173961336, | |
| "kl": 0.049072265625, | |
| "learning_rate": 6.395833333333333e-07, | |
| "loss": 0.002, | |
| "reward": 1.9041762351989746, | |
| "reward_std": 0.024535808712244034, | |
| "rewards/accuracy_reward": 0.9041762351989746, | |
| "rewards/format_reward": 1.0, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.91667175292969, | |
| "epoch": 3.6875, | |
| "grad_norm": 3.626404435362734, | |
| "kl": 0.038330078125, | |
| "learning_rate": 6.374999999999999e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8905611038208008, | |
| "reward_std": 0.02579084411263466, | |
| "rewards/accuracy_reward": 0.891863226890564, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.83464050292969, | |
| "epoch": 3.7083333333333335, | |
| "grad_norm": 1.4575692565981726, | |
| "kl": 0.037841796875, | |
| "learning_rate": 6.354166666666666e-07, | |
| "loss": 0.0015, | |
| "reward": 1.9227180480957031, | |
| "reward_std": 0.02236122451722622, | |
| "rewards/accuracy_reward": 0.9227181673049927, | |
| "rewards/format_reward": 1.0, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.88542175292969, | |
| "epoch": 3.7291666666666665, | |
| "grad_norm": 6.573675951265146, | |
| "kl": 0.040771484375, | |
| "learning_rate": 6.333333333333332e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8883719444274902, | |
| "reward_std": 0.023411914706230164, | |
| "rewards/accuracy_reward": 0.8883718848228455, | |
| "rewards/format_reward": 1.0, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.8828125, | |
| "epoch": 3.75, | |
| "grad_norm": 1.3628527356709195, | |
| "kl": 0.038818359375, | |
| "learning_rate": 6.3125e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8975229263305664, | |
| "reward_std": 0.022585680708289146, | |
| "rewards/accuracy_reward": 0.8975229263305664, | |
| "rewards/format_reward": 1.0, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.2890625, | |
| "epoch": 3.7708333333333335, | |
| "grad_norm": 1.7649706449204177, | |
| "kl": 0.04052734375, | |
| "learning_rate": 6.291666666666666e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8944144248962402, | |
| "reward_std": 0.02505827508866787, | |
| "rewards/accuracy_reward": 0.8944144248962402, | |
| "rewards/format_reward": 1.0, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.35546875, | |
| "epoch": 3.7916666666666665, | |
| "grad_norm": 2.198651075477608, | |
| "kl": 0.039306640625, | |
| "learning_rate": 6.270833333333333e-07, | |
| "loss": 0.0016, | |
| "reward": 1.891271948814392, | |
| "reward_std": 0.028914332389831543, | |
| "rewards/accuracy_reward": 0.8938760757446289, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.40885925292969, | |
| "epoch": 3.8125, | |
| "grad_norm": 10.660708725864556, | |
| "kl": 0.041748046875, | |
| "learning_rate": 6.249999999999999e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8875398635864258, | |
| "reward_std": 0.024141697213053703, | |
| "rewards/accuracy_reward": 0.887539803981781, | |
| "rewards/format_reward": 1.0, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 118.68229675292969, | |
| "epoch": 3.8333333333333335, | |
| "grad_norm": 2.2321500902799647, | |
| "kl": 0.04345703125, | |
| "learning_rate": 6.229166666666666e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8584728240966797, | |
| "reward_std": 0.03329972177743912, | |
| "rewards/accuracy_reward": 0.8623790740966797, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 117.95442962646484, | |
| "epoch": 3.8541666666666665, | |
| "grad_norm": 1.414758072615096, | |
| "kl": 0.038818359375, | |
| "learning_rate": 6.208333333333334e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8576724529266357, | |
| "reward_std": 0.03029349073767662, | |
| "rewards/accuracy_reward": 0.8602765798568726, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 116.26171875, | |
| "epoch": 3.875, | |
| "grad_norm": 2.767294776217013, | |
| "kl": 0.037353515625, | |
| "learning_rate": 6.1875e-07, | |
| "loss": 0.0015, | |
| "reward": 1.8842320442199707, | |
| "reward_std": 0.026229776442050934, | |
| "rewards/accuracy_reward": 0.8842320442199707, | |
| "rewards/format_reward": 1.0, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 116.26302337646484, | |
| "epoch": 3.8958333333333335, | |
| "grad_norm": 2.0069160206563423, | |
| "kl": 0.04541015625, | |
| "learning_rate": 6.166666666666667e-07, | |
| "loss": 0.0018, | |
| "reward": 1.906886100769043, | |
| "reward_std": 0.02242848090827465, | |
| "rewards/accuracy_reward": 0.9068862795829773, | |
| "rewards/format_reward": 1.0, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 118.28515625, | |
| "epoch": 3.9166666666666665, | |
| "grad_norm": 2.9167012710524416, | |
| "kl": 0.041015625, | |
| "learning_rate": 6.145833333333333e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8881382942199707, | |
| "reward_std": 0.027306437492370605, | |
| "rewards/accuracy_reward": 0.8881382942199707, | |
| "rewards/format_reward": 1.0, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 118.50260925292969, | |
| "epoch": 3.9375, | |
| "grad_norm": 2.5894313089017706, | |
| "kl": 0.03955078125, | |
| "learning_rate": 6.125000000000001e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8834333419799805, | |
| "reward_std": 0.02489401400089264, | |
| "rewards/accuracy_reward": 0.8834332227706909, | |
| "rewards/format_reward": 1.0, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 115.67578125, | |
| "epoch": 3.9583333333333335, | |
| "grad_norm": 2.8476363151522173, | |
| "kl": 0.039794921875, | |
| "learning_rate": 6.104166666666667e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8963178396224976, | |
| "reward_std": 0.02300976775586605, | |
| "rewards/accuracy_reward": 0.8963178992271423, | |
| "rewards/format_reward": 1.0, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.1749038696289, | |
| "epoch": 3.9791666666666665, | |
| "grad_norm": 102.50025138985052, | |
| "kl": 0.9296875, | |
| "learning_rate": 6.083333333333333e-07, | |
| "loss": 0.0374, | |
| "reward": 1.8949506282806396, | |
| "reward_std": 0.03087977133691311, | |
| "rewards/accuracy_reward": 0.8962857723236084, | |
| "rewards/format_reward": 0.998664915561676, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 114.61458587646484, | |
| "epoch": 4.020833333333333, | |
| "grad_norm": 2.4242741219191553, | |
| "kl": 0.03857421875, | |
| "learning_rate": 6.062499999999999e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8952863216400146, | |
| "reward_std": 0.025780394673347473, | |
| "rewards/accuracy_reward": 0.8952862620353699, | |
| "rewards/format_reward": 1.0, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.21745300292969, | |
| "epoch": 4.041666666666667, | |
| "grad_norm": 1.3513636521416783, | |
| "kl": 0.046630859375, | |
| "learning_rate": 6.041666666666666e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9090301990509033, | |
| "reward_std": 0.029132381081581116, | |
| "rewards/accuracy_reward": 0.9090301394462585, | |
| "rewards/format_reward": 1.0, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.36328125, | |
| "epoch": 4.0625, | |
| "grad_norm": 2.0726251594251166, | |
| "kl": 0.051513671875, | |
| "learning_rate": 6.020833333333333e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8756507635116577, | |
| "reward_std": 0.027445685118436813, | |
| "rewards/accuracy_reward": 0.8756507635116577, | |
| "rewards/format_reward": 1.0, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.05859375, | |
| "epoch": 4.083333333333333, | |
| "grad_norm": 1.5875132735828674, | |
| "kl": 0.056396484375, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0023, | |
| "reward": 1.8988969326019287, | |
| "reward_std": 0.03272823989391327, | |
| "rewards/accuracy_reward": 0.9015010595321655, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.86458587646484, | |
| "epoch": 4.104166666666667, | |
| "grad_norm": 4.44164388575917, | |
| "kl": 0.06201171875, | |
| "learning_rate": 5.979166666666666e-07, | |
| "loss": 0.0026, | |
| "reward": 1.8918952941894531, | |
| "reward_std": 0.028478458523750305, | |
| "rewards/accuracy_reward": 0.8918952941894531, | |
| "rewards/format_reward": 1.0, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.47135925292969, | |
| "epoch": 4.125, | |
| "grad_norm": 1.8931652214160692, | |
| "kl": 0.08251953125, | |
| "learning_rate": 5.958333333333333e-07, | |
| "loss": 0.0034, | |
| "reward": 1.877640724182129, | |
| "reward_std": 0.03283580765128136, | |
| "rewards/accuracy_reward": 0.8789429664611816, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.86458587646484, | |
| "epoch": 4.145833333333333, | |
| "grad_norm": 1.4398266478518003, | |
| "kl": 0.06884765625, | |
| "learning_rate": 5.937499999999999e-07, | |
| "loss": 0.0028, | |
| "reward": 1.893932580947876, | |
| "reward_std": 0.023818641901016235, | |
| "rewards/accuracy_reward": 0.8939325213432312, | |
| "rewards/format_reward": 1.0, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.59765625, | |
| "epoch": 4.166666666666667, | |
| "grad_norm": 1.9079500637571045, | |
| "kl": 0.057861328125, | |
| "learning_rate": 5.916666666666667e-07, | |
| "loss": 0.0024, | |
| "reward": 1.8989866971969604, | |
| "reward_std": 0.029820134863257408, | |
| "rewards/accuracy_reward": 0.9002887606620789, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.17317962646484, | |
| "epoch": 4.1875, | |
| "grad_norm": 1.6892092308161555, | |
| "kl": 0.05419921875, | |
| "learning_rate": 5.895833333333333e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9026626348495483, | |
| "reward_std": 0.023554224520921707, | |
| "rewards/accuracy_reward": 0.9026626348495483, | |
| "rewards/format_reward": 1.0, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.45182800292969, | |
| "epoch": 4.208333333333333, | |
| "grad_norm": 2.0094232681652513, | |
| "kl": 0.05126953125, | |
| "learning_rate": 5.875e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9009517431259155, | |
| "reward_std": 0.026670873165130615, | |
| "rewards/accuracy_reward": 0.9009518027305603, | |
| "rewards/format_reward": 1.0, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.99609375, | |
| "epoch": 4.229166666666667, | |
| "grad_norm": 1.4660353955474104, | |
| "kl": 0.045654296875, | |
| "learning_rate": 5.854166666666666e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8858369588851929, | |
| "reward_std": 0.02353881672024727, | |
| "rewards/accuracy_reward": 0.8858367800712585, | |
| "rewards/format_reward": 1.0, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.44792175292969, | |
| "epoch": 4.25, | |
| "grad_norm": 1.9832423612020598, | |
| "kl": 0.04833984375, | |
| "learning_rate": 5.833333333333334e-07, | |
| "loss": 0.002, | |
| "reward": 1.9034093618392944, | |
| "reward_std": 0.024380242452025414, | |
| "rewards/accuracy_reward": 0.9034093618392944, | |
| "rewards/format_reward": 1.0, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.12890625, | |
| "epoch": 4.270833333333333, | |
| "grad_norm": 1.9973956888778517, | |
| "kl": 0.04541015625, | |
| "learning_rate": 5.8125e-07, | |
| "loss": 0.0019, | |
| "reward": 1.918021559715271, | |
| "reward_std": 0.01975633203983307, | |
| "rewards/accuracy_reward": 0.9180216789245605, | |
| "rewards/format_reward": 1.0, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.20182800292969, | |
| "epoch": 4.291666666666667, | |
| "grad_norm": 1.9799525839497707, | |
| "kl": 0.046875, | |
| "learning_rate": 5.791666666666667e-07, | |
| "loss": 0.0019, | |
| "reward": 1.903671145439148, | |
| "reward_std": 0.02328427881002426, | |
| "rewards/accuracy_reward": 0.903671145439148, | |
| "rewards/format_reward": 1.0, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.953125, | |
| "epoch": 4.3125, | |
| "grad_norm": 2.1041072216648176, | |
| "kl": 0.0439453125, | |
| "learning_rate": 5.770833333333332e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8897308111190796, | |
| "reward_std": 0.02862635999917984, | |
| "rewards/accuracy_reward": 0.891032874584198, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.08203125, | |
| "epoch": 4.333333333333333, | |
| "grad_norm": 1.5204073340174282, | |
| "kl": 0.039794921875, | |
| "learning_rate": 5.749999999999999e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8759989738464355, | |
| "reward_std": 0.02895565889775753, | |
| "rewards/accuracy_reward": 0.8759989738464355, | |
| "rewards/format_reward": 1.0, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.44010925292969, | |
| "epoch": 4.354166666666667, | |
| "grad_norm": 2.001436237063012, | |
| "kl": 0.046142578125, | |
| "learning_rate": 5.729166666666667e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8900182247161865, | |
| "reward_std": 0.02687385492026806, | |
| "rewards/accuracy_reward": 0.8913201689720154, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.47396087646484, | |
| "epoch": 4.375, | |
| "grad_norm": 3.6969174457282366, | |
| "kl": 0.041015625, | |
| "learning_rate": 5.708333333333333e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8979003429412842, | |
| "reward_std": 0.0217414703220129, | |
| "rewards/accuracy_reward": 0.8979001045227051, | |
| "rewards/format_reward": 1.0, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.609375, | |
| "epoch": 4.395833333333333, | |
| "grad_norm": 2.0011010687642816, | |
| "kl": 0.0400390625, | |
| "learning_rate": 5.6875e-07, | |
| "loss": 0.0017, | |
| "reward": 1.890995740890503, | |
| "reward_std": 0.025938181206583977, | |
| "rewards/accuracy_reward": 0.8909956812858582, | |
| "rewards/format_reward": 1.0, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.12890625, | |
| "epoch": 4.416666666666667, | |
| "grad_norm": 1.2721197172111447, | |
| "kl": 0.041259765625, | |
| "learning_rate": 5.666666666666666e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8735730648040771, | |
| "reward_std": 0.02578229270875454, | |
| "rewards/accuracy_reward": 0.8735730051994324, | |
| "rewards/format_reward": 1.0, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.66667175292969, | |
| "epoch": 4.4375, | |
| "grad_norm": 2.5188213924760667, | |
| "kl": 0.07958984375, | |
| "learning_rate": 5.645833333333333e-07, | |
| "loss": 0.0032, | |
| "reward": 1.8987796306610107, | |
| "reward_std": 0.025997933000326157, | |
| "rewards/accuracy_reward": 0.898779571056366, | |
| "rewards/format_reward": 1.0, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.171875, | |
| "epoch": 4.458333333333333, | |
| "grad_norm": 2.1506836862902876, | |
| "kl": 0.03515625, | |
| "learning_rate": 5.625e-07, | |
| "loss": 0.0015, | |
| "reward": 1.891427993774414, | |
| "reward_std": 0.02087043598294258, | |
| "rewards/accuracy_reward": 0.8914279937744141, | |
| "rewards/format_reward": 1.0, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.43880462646484, | |
| "epoch": 4.479166666666667, | |
| "grad_norm": 1.589375389211378, | |
| "kl": 0.033203125, | |
| "learning_rate": 5.604166666666667e-07, | |
| "loss": 0.0014, | |
| "reward": 1.8716447353363037, | |
| "reward_std": 0.023239165544509888, | |
| "rewards/accuracy_reward": 0.8716444969177246, | |
| "rewards/format_reward": 1.0, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.87890625, | |
| "epoch": 4.5, | |
| "grad_norm": 2.9907672143516737, | |
| "kl": 0.038330078125, | |
| "learning_rate": 5.583333333333333e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8827204704284668, | |
| "reward_std": 0.028304576873779297, | |
| "rewards/accuracy_reward": 0.8827204704284668, | |
| "rewards/format_reward": 1.0, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.86849212646484, | |
| "epoch": 4.520833333333333, | |
| "grad_norm": 2.551740129842165, | |
| "kl": 0.0390625, | |
| "learning_rate": 5.5625e-07, | |
| "loss": 0.0016, | |
| "reward": 1.888314127922058, | |
| "reward_std": 0.02232741191983223, | |
| "rewards/accuracy_reward": 0.8883141279220581, | |
| "rewards/format_reward": 1.0, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.05989837646484, | |
| "epoch": 4.541666666666667, | |
| "grad_norm": 4.368554014866076, | |
| "kl": 0.044921875, | |
| "learning_rate": 5.541666666666666e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9319076538085938, | |
| "reward_std": 0.020289087668061256, | |
| "rewards/accuracy_reward": 0.9319076538085938, | |
| "rewards/format_reward": 1.0, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.23828125, | |
| "epoch": 4.5625, | |
| "grad_norm": 1.4440015070439494, | |
| "kl": 0.035888671875, | |
| "learning_rate": 5.520833333333334e-07, | |
| "loss": 0.0016, | |
| "reward": 1.852222204208374, | |
| "reward_std": 0.030050039291381836, | |
| "rewards/accuracy_reward": 0.8535243272781372, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.00911712646484, | |
| "epoch": 4.583333333333333, | |
| "grad_norm": 2.12920095214347, | |
| "kl": 0.038818359375, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8882272243499756, | |
| "reward_std": 0.024995621293783188, | |
| "rewards/accuracy_reward": 0.8882272243499756, | |
| "rewards/format_reward": 1.0, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.91146087646484, | |
| "epoch": 4.604166666666667, | |
| "grad_norm": 3.2550572641319016, | |
| "kl": 0.03955078125, | |
| "learning_rate": 5.479166666666667e-07, | |
| "loss": 0.0016, | |
| "reward": 1.9003304243087769, | |
| "reward_std": 0.023031365126371384, | |
| "rewards/accuracy_reward": 0.9003303050994873, | |
| "rewards/format_reward": 1.0, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.77474212646484, | |
| "epoch": 4.625, | |
| "grad_norm": 1.4248440110140324, | |
| "kl": 0.039794921875, | |
| "learning_rate": 5.458333333333332e-07, | |
| "loss": 0.0016, | |
| "reward": 1.873998999595642, | |
| "reward_std": 0.02907104603946209, | |
| "rewards/accuracy_reward": 0.8753010630607605, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.58984375, | |
| "epoch": 4.645833333333333, | |
| "grad_norm": 2.6245139424544983, | |
| "kl": 0.03857421875, | |
| "learning_rate": 5.4375e-07, | |
| "loss": 0.0016, | |
| "reward": 1.9035629034042358, | |
| "reward_std": 0.022365760058164597, | |
| "rewards/accuracy_reward": 0.9035629034042358, | |
| "rewards/format_reward": 1.0, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.59896087646484, | |
| "epoch": 4.666666666666667, | |
| "grad_norm": 1.2926097030348538, | |
| "kl": 0.03955078125, | |
| "learning_rate": 5.416666666666666e-07, | |
| "loss": 0.0016, | |
| "reward": 1.890451192855835, | |
| "reward_std": 0.025147125124931335, | |
| "rewards/accuracy_reward": 0.8904510736465454, | |
| "rewards/format_reward": 1.0, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.26432800292969, | |
| "epoch": 4.6875, | |
| "grad_norm": 1.2010688063099724, | |
| "kl": 0.03955078125, | |
| "learning_rate": 5.395833333333333e-07, | |
| "loss": 0.0016, | |
| "reward": 1.865971326828003, | |
| "reward_std": 0.030050549656152725, | |
| "rewards/accuracy_reward": 0.8672735095024109, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.88932800292969, | |
| "epoch": 4.708333333333333, | |
| "grad_norm": 1.4851010437040935, | |
| "kl": 0.0380859375, | |
| "learning_rate": 5.374999999999999e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8917864561080933, | |
| "reward_std": 0.021093130111694336, | |
| "rewards/accuracy_reward": 0.8917864561080933, | |
| "rewards/format_reward": 1.0, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.63932800292969, | |
| "epoch": 4.729166666666667, | |
| "grad_norm": 1.774445050351076, | |
| "kl": 0.041748046875, | |
| "learning_rate": 5.354166666666666e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8998801708221436, | |
| "reward_std": 0.025802936404943466, | |
| "rewards/accuracy_reward": 0.9011821150779724, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.80599212646484, | |
| "epoch": 4.75, | |
| "grad_norm": 1.9122212791313642, | |
| "kl": 0.04150390625, | |
| "learning_rate": 5.333333333333333e-07, | |
| "loss": 0.0017, | |
| "reward": 1.898808479309082, | |
| "reward_std": 0.02279862016439438, | |
| "rewards/accuracy_reward": 0.898808479309082, | |
| "rewards/format_reward": 1.0, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.49609375, | |
| "epoch": 4.770833333333333, | |
| "grad_norm": 1.8595252737006025, | |
| "kl": 0.0400390625, | |
| "learning_rate": 5.3125e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8906606435775757, | |
| "reward_std": 0.021603485569357872, | |
| "rewards/accuracy_reward": 0.8906607031822205, | |
| "rewards/format_reward": 1.0, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.00260925292969, | |
| "epoch": 4.791666666666667, | |
| "grad_norm": 3.9925980009816278, | |
| "kl": 0.04833984375, | |
| "learning_rate": 5.291666666666666e-07, | |
| "loss": 0.002, | |
| "reward": 1.896075963973999, | |
| "reward_std": 0.02797180414199829, | |
| "rewards/accuracy_reward": 0.8973779678344727, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 113.4609375, | |
| "epoch": 4.8125, | |
| "grad_norm": 3.455468575777609, | |
| "kl": 0.052001953125, | |
| "learning_rate": 5.270833333333333e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9048645496368408, | |
| "reward_std": 0.02417534589767456, | |
| "rewards/accuracy_reward": 0.9048646688461304, | |
| "rewards/format_reward": 1.0, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.59765625, | |
| "epoch": 4.833333333333333, | |
| "grad_norm": 1.5619005634713135, | |
| "kl": 0.042236328125, | |
| "learning_rate": 5.25e-07, | |
| "loss": 0.0017, | |
| "reward": 1.905022382736206, | |
| "reward_std": 0.021135296672582626, | |
| "rewards/accuracy_reward": 0.905022144317627, | |
| "rewards/format_reward": 1.0, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.34245300292969, | |
| "epoch": 4.854166666666667, | |
| "grad_norm": 1.8685437866312475, | |
| "kl": 0.048095703125, | |
| "learning_rate": 5.229166666666667e-07, | |
| "loss": 0.002, | |
| "reward": 1.89357590675354, | |
| "reward_std": 0.025921311229467392, | |
| "rewards/accuracy_reward": 0.8948779106140137, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.43620300292969, | |
| "epoch": 4.875, | |
| "grad_norm": 5.452442844003646, | |
| "kl": 0.043212890625, | |
| "learning_rate": 5.208333333333334e-07, | |
| "loss": 0.0018, | |
| "reward": 1.911612868309021, | |
| "reward_std": 0.024873455986380577, | |
| "rewards/accuracy_reward": 0.9129147529602051, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.40495300292969, | |
| "epoch": 4.895833333333333, | |
| "grad_norm": 2.179930419928341, | |
| "kl": 0.04296875, | |
| "learning_rate": 5.1875e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8822953701019287, | |
| "reward_std": 0.027776187285780907, | |
| "rewards/accuracy_reward": 0.8822951316833496, | |
| "rewards/format_reward": 1.0, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.78646087646484, | |
| "epoch": 4.916666666666667, | |
| "grad_norm": 5.555743499581958, | |
| "kl": 0.04931640625, | |
| "learning_rate": 5.166666666666667e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9162639379501343, | |
| "reward_std": 0.021101072430610657, | |
| "rewards/accuracy_reward": 0.9162638187408447, | |
| "rewards/format_reward": 1.0, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.9765625, | |
| "epoch": 4.9375, | |
| "grad_norm": 1.457940457573027, | |
| "kl": 0.03955078125, | |
| "learning_rate": 5.145833333333332e-07, | |
| "loss": 0.0016, | |
| "reward": 1.9340026378631592, | |
| "reward_std": 0.020655512809753418, | |
| "rewards/accuracy_reward": 0.9340025782585144, | |
| "rewards/format_reward": 1.0, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.7734375, | |
| "epoch": 4.958333333333333, | |
| "grad_norm": 1.4365694122319177, | |
| "kl": 0.04345703125, | |
| "learning_rate": 5.125e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8954408168792725, | |
| "reward_std": 0.02152765914797783, | |
| "rewards/accuracy_reward": 0.8954406976699829, | |
| "rewards/format_reward": 1.0, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.25367736816406, | |
| "epoch": 4.979166666666667, | |
| "grad_norm": 1.734243537392683, | |
| "kl": 0.04248046875, | |
| "learning_rate": 5.104166666666666e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9107515811920166, | |
| "reward_std": 0.018960019573569298, | |
| "rewards/accuracy_reward": 0.9107515811920166, | |
| "rewards/format_reward": 1.0, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.74739837646484, | |
| "epoch": 5.020833333333333, | |
| "grad_norm": 1.3870009697213672, | |
| "kl": 0.04833984375, | |
| "learning_rate": 5.083333333333333e-07, | |
| "loss": 0.002, | |
| "reward": 1.8969802856445312, | |
| "reward_std": 0.029202213510870934, | |
| "rewards/accuracy_reward": 0.8982824087142944, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.33464050292969, | |
| "epoch": 5.041666666666667, | |
| "grad_norm": 1.6927816472717638, | |
| "kl": 0.0439453125, | |
| "learning_rate": 5.062499999999999e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9044766426086426, | |
| "reward_std": 0.02422555536031723, | |
| "rewards/accuracy_reward": 0.9044766426086426, | |
| "rewards/format_reward": 1.0, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.39714050292969, | |
| "epoch": 5.0625, | |
| "grad_norm": 2.3914909546196594, | |
| "kl": 0.04248046875, | |
| "learning_rate": 5.041666666666667e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9420578479766846, | |
| "reward_std": 0.020115545019507408, | |
| "rewards/accuracy_reward": 0.9420577883720398, | |
| "rewards/format_reward": 1.0, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.46224212646484, | |
| "epoch": 5.083333333333333, | |
| "grad_norm": 2.834307847073615, | |
| "kl": 0.046630859375, | |
| "learning_rate": 5.020833333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9152748584747314, | |
| "reward_std": 0.02309068851172924, | |
| "rewards/accuracy_reward": 0.9152747392654419, | |
| "rewards/format_reward": 1.0, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.2265625, | |
| "epoch": 5.104166666666667, | |
| "grad_norm": 2.1094761559134816, | |
| "kl": 0.04052734375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8892682790756226, | |
| "reward_std": 0.023487474769353867, | |
| "rewards/accuracy_reward": 0.8892682790756226, | |
| "rewards/format_reward": 1.0, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.31380462646484, | |
| "epoch": 5.125, | |
| "grad_norm": 1.4100911159096023, | |
| "kl": 0.039306640625, | |
| "learning_rate": 4.979166666666666e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8915185928344727, | |
| "reward_std": 0.023427218198776245, | |
| "rewards/accuracy_reward": 0.891518771648407, | |
| "rewards/format_reward": 1.0, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.84114837646484, | |
| "epoch": 5.145833333333333, | |
| "grad_norm": 1.6579016766504264, | |
| "kl": 0.04052734375, | |
| "learning_rate": 4.958333333333333e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9029381275177002, | |
| "reward_std": 0.023404449224472046, | |
| "rewards/accuracy_reward": 0.9029380083084106, | |
| "rewards/format_reward": 1.0, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.58333587646484, | |
| "epoch": 5.166666666666667, | |
| "grad_norm": 2.4190316509927547, | |
| "kl": 0.0439453125, | |
| "learning_rate": 4.9375e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9102628231048584, | |
| "reward_std": 0.02647389844059944, | |
| "rewards/accuracy_reward": 0.9115647077560425, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.34245300292969, | |
| "epoch": 5.1875, | |
| "grad_norm": 1.6788038142728927, | |
| "kl": 0.0390625, | |
| "learning_rate": 4.916666666666666e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8926337957382202, | |
| "reward_std": 0.022663813084363937, | |
| "rewards/accuracy_reward": 0.8926336765289307, | |
| "rewards/format_reward": 1.0, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.6640625, | |
| "epoch": 5.208333333333333, | |
| "grad_norm": 2.1053897766649934, | |
| "kl": 0.041015625, | |
| "learning_rate": 4.895833333333333e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9033950567245483, | |
| "reward_std": 0.024107707664370537, | |
| "rewards/accuracy_reward": 0.9033951163291931, | |
| "rewards/format_reward": 1.0, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.30599212646484, | |
| "epoch": 5.229166666666667, | |
| "grad_norm": 8.173004175170574, | |
| "kl": 0.042236328125, | |
| "learning_rate": 4.875e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8888883590698242, | |
| "reward_std": 0.028628483414649963, | |
| "rewards/accuracy_reward": 0.8901904821395874, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.86328125, | |
| "epoch": 5.25, | |
| "grad_norm": 2.028487749549242, | |
| "kl": 0.044921875, | |
| "learning_rate": 4.854166666666666e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9012870788574219, | |
| "reward_std": 0.02197723090648651, | |
| "rewards/accuracy_reward": 0.9012872576713562, | |
| "rewards/format_reward": 1.0, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.8125, | |
| "epoch": 5.270833333333333, | |
| "grad_norm": 2.0303516292175523, | |
| "kl": 0.03955078125, | |
| "learning_rate": 4.833333333333333e-07, | |
| "loss": 0.0016, | |
| "reward": 1.8865455389022827, | |
| "reward_std": 0.025701235979795456, | |
| "rewards/accuracy_reward": 0.8865455985069275, | |
| "rewards/format_reward": 1.0, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.77604675292969, | |
| "epoch": 5.291666666666667, | |
| "grad_norm": 1.5986544250252577, | |
| "kl": 0.040771484375, | |
| "learning_rate": 4.812499999999999e-07, | |
| "loss": 0.0017, | |
| "reward": 1.8878264427185059, | |
| "reward_std": 0.023908209055662155, | |
| "rewards/accuracy_reward": 0.8878263831138611, | |
| "rewards/format_reward": 1.0, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.72005462646484, | |
| "epoch": 5.3125, | |
| "grad_norm": 2.0564085146093825, | |
| "kl": 0.04541015625, | |
| "learning_rate": 4.791666666666667e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9011871814727783, | |
| "reward_std": 0.026990963146090508, | |
| "rewards/accuracy_reward": 0.902489185333252, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.52604675292969, | |
| "epoch": 5.333333333333333, | |
| "grad_norm": 2.637539554946258, | |
| "kl": 0.046875, | |
| "learning_rate": 4.770833333333334e-07, | |
| "loss": 0.0019, | |
| "reward": 1.904599666595459, | |
| "reward_std": 0.02259230427443981, | |
| "rewards/accuracy_reward": 0.9059017896652222, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.51823425292969, | |
| "epoch": 5.354166666666667, | |
| "grad_norm": 2.0291093041211545, | |
| "kl": 0.043701171875, | |
| "learning_rate": 4.7499999999999995e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9127092361450195, | |
| "reward_std": 0.02183235064148903, | |
| "rewards/accuracy_reward": 0.9127092361450195, | |
| "rewards/format_reward": 1.0, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.5234375, | |
| "epoch": 5.375, | |
| "grad_norm": 4.505797454206583, | |
| "kl": 0.04345703125, | |
| "learning_rate": 4.7291666666666666e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9092210531234741, | |
| "reward_std": 0.021370170637965202, | |
| "rewards/accuracy_reward": 0.9092210531234741, | |
| "rewards/format_reward": 1.0, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.19661712646484, | |
| "epoch": 5.395833333333333, | |
| "grad_norm": 1.8902834610451218, | |
| "kl": 0.044677734375, | |
| "learning_rate": 4.708333333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.917715311050415, | |
| "reward_std": 0.018400993198156357, | |
| "rewards/accuracy_reward": 0.9177150726318359, | |
| "rewards/format_reward": 1.0, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.390625, | |
| "epoch": 5.416666666666667, | |
| "grad_norm": 2.563147653109146, | |
| "kl": 0.0458984375, | |
| "learning_rate": 4.6874999999999996e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8932042121887207, | |
| "reward_std": 0.021255169063806534, | |
| "rewards/accuracy_reward": 0.8932042121887207, | |
| "rewards/format_reward": 1.0, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.11067962646484, | |
| "epoch": 5.4375, | |
| "grad_norm": 4.520917096171478, | |
| "kl": 0.042236328125, | |
| "learning_rate": 4.6666666666666666e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9107736349105835, | |
| "reward_std": 0.020730838179588318, | |
| "rewards/accuracy_reward": 0.910773515701294, | |
| "rewards/format_reward": 1.0, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.9140625, | |
| "epoch": 5.458333333333333, | |
| "grad_norm": 2.9043788526253866, | |
| "kl": 0.046875, | |
| "learning_rate": 4.645833333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.932241439819336, | |
| "reward_std": 0.019916361197829247, | |
| "rewards/accuracy_reward": 0.932241678237915, | |
| "rewards/format_reward": 1.0, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.98698425292969, | |
| "epoch": 5.479166666666667, | |
| "grad_norm": 4.137371389224913, | |
| "kl": 0.04443359375, | |
| "learning_rate": 4.625e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8949460983276367, | |
| "reward_std": 0.021680889651179314, | |
| "rewards/accuracy_reward": 0.8949460983276367, | |
| "rewards/format_reward": 1.0, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.06120300292969, | |
| "epoch": 5.5, | |
| "grad_norm": 1.5956956245046428, | |
| "kl": 0.046875, | |
| "learning_rate": 4.604166666666666e-07, | |
| "loss": 0.002, | |
| "reward": 1.9305871725082397, | |
| "reward_std": 0.016989264637231827, | |
| "rewards/accuracy_reward": 0.9305871725082397, | |
| "rewards/format_reward": 1.0, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.46875, | |
| "epoch": 5.520833333333333, | |
| "grad_norm": 2.3801100164721527, | |
| "kl": 0.05126953125, | |
| "learning_rate": 4.5833333333333327e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9142370223999023, | |
| "reward_std": 0.020726464688777924, | |
| "rewards/accuracy_reward": 0.9142370223999023, | |
| "rewards/format_reward": 1.0, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.06380462646484, | |
| "epoch": 5.541666666666667, | |
| "grad_norm": 1.7313384895006823, | |
| "kl": 0.051025390625, | |
| "learning_rate": 4.5624999999999997e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8905744552612305, | |
| "reward_std": 0.02504381351172924, | |
| "rewards/accuracy_reward": 0.8905746340751648, | |
| "rewards/format_reward": 1.0, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.88151550292969, | |
| "epoch": 5.5625, | |
| "grad_norm": 1.5486364631929457, | |
| "kl": 0.046875, | |
| "learning_rate": 4.541666666666666e-07, | |
| "loss": 0.002, | |
| "reward": 1.8817743062973022, | |
| "reward_std": 0.02400146797299385, | |
| "rewards/accuracy_reward": 0.8817743062973022, | |
| "rewards/format_reward": 1.0, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.19921875, | |
| "epoch": 5.583333333333333, | |
| "grad_norm": 1.9468347826097554, | |
| "kl": 0.0498046875, | |
| "learning_rate": 4.5208333333333333e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9123249053955078, | |
| "reward_std": 0.02088339626789093, | |
| "rewards/accuracy_reward": 0.9123249053955078, | |
| "rewards/format_reward": 1.0, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.50651550292969, | |
| "epoch": 5.604166666666667, | |
| "grad_norm": 1.8134017920844003, | |
| "kl": 0.050537109375, | |
| "learning_rate": 4.5e-07, | |
| "loss": 0.002, | |
| "reward": 1.9056270122528076, | |
| "reward_std": 0.020590659230947495, | |
| "rewards/accuracy_reward": 0.9056269526481628, | |
| "rewards/format_reward": 1.0, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.43359375, | |
| "epoch": 5.625, | |
| "grad_norm": 2.097289435304608, | |
| "kl": 0.058837890625, | |
| "learning_rate": 4.479166666666667e-07, | |
| "loss": 0.0024, | |
| "reward": 1.8596689701080322, | |
| "reward_std": 0.026524469256401062, | |
| "rewards/accuracy_reward": 0.8609709143638611, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.8515625, | |
| "epoch": 5.645833333333333, | |
| "grad_norm": 2.9899940677112995, | |
| "kl": 0.0556640625, | |
| "learning_rate": 4.4583333333333334e-07, | |
| "loss": 0.0023, | |
| "reward": 1.891918420791626, | |
| "reward_std": 0.02338644489645958, | |
| "rewards/accuracy_reward": 0.8919183611869812, | |
| "rewards/format_reward": 1.0, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.14714050292969, | |
| "epoch": 5.666666666666667, | |
| "grad_norm": 1.5447722254023164, | |
| "kl": 0.052734375, | |
| "learning_rate": 4.4374999999999993e-07, | |
| "loss": 0.0022, | |
| "reward": 1.8943192958831787, | |
| "reward_std": 0.021093344315886497, | |
| "rewards/accuracy_reward": 0.8943192362785339, | |
| "rewards/format_reward": 1.0, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.02083587646484, | |
| "epoch": 5.6875, | |
| "grad_norm": 1.7252851771135564, | |
| "kl": 0.05419921875, | |
| "learning_rate": 4.4166666666666664e-07, | |
| "loss": 0.0023, | |
| "reward": 1.8994736671447754, | |
| "reward_std": 0.025407809764146805, | |
| "rewards/accuracy_reward": 0.9007757902145386, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.19792175292969, | |
| "epoch": 5.708333333333333, | |
| "grad_norm": 1.4821607784380106, | |
| "kl": 0.05810546875, | |
| "learning_rate": 4.395833333333333e-07, | |
| "loss": 0.0025, | |
| "reward": 1.9176604747772217, | |
| "reward_std": 0.019861234351992607, | |
| "rewards/accuracy_reward": 0.9176604151725769, | |
| "rewards/format_reward": 1.0, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.72265625, | |
| "epoch": 5.729166666666667, | |
| "grad_norm": 6.345280883170612, | |
| "kl": 0.052734375, | |
| "learning_rate": 4.375e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9127343893051147, | |
| "reward_std": 0.019745318219065666, | |
| "rewards/accuracy_reward": 0.9127345085144043, | |
| "rewards/format_reward": 1.0, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.1640625, | |
| "epoch": 5.75, | |
| "grad_norm": 1.8167510177139423, | |
| "kl": 0.052734375, | |
| "learning_rate": 4.3541666666666664e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9143961668014526, | |
| "reward_std": 0.021544938907027245, | |
| "rewards/accuracy_reward": 0.9143962264060974, | |
| "rewards/format_reward": 1.0, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.9296875, | |
| "epoch": 5.770833333333333, | |
| "grad_norm": 2.4532452869571566, | |
| "kl": 0.05322265625, | |
| "learning_rate": 4.3333333333333335e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8847317695617676, | |
| "reward_std": 0.02402741275727749, | |
| "rewards/accuracy_reward": 0.8847318887710571, | |
| "rewards/format_reward": 1.0, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.16146087646484, | |
| "epoch": 5.791666666666667, | |
| "grad_norm": 4.133102609868267, | |
| "kl": 0.04541015625, | |
| "learning_rate": 4.3125e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9220575094223022, | |
| "reward_std": 0.01720447652041912, | |
| "rewards/accuracy_reward": 0.9220575094223022, | |
| "rewards/format_reward": 1.0, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.95573425292969, | |
| "epoch": 5.8125, | |
| "grad_norm": 7.211708520736478, | |
| "kl": 0.04541015625, | |
| "learning_rate": 4.291666666666666e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9195680618286133, | |
| "reward_std": 0.019044464454054832, | |
| "rewards/accuracy_reward": 0.9195680618286133, | |
| "rewards/format_reward": 1.0, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.79427337646484, | |
| "epoch": 5.833333333333333, | |
| "grad_norm": 2.5193114976640394, | |
| "kl": 0.047607421875, | |
| "learning_rate": 4.270833333333333e-07, | |
| "loss": 0.002, | |
| "reward": 1.9032373428344727, | |
| "reward_std": 0.01688932441174984, | |
| "rewards/accuracy_reward": 0.903237521648407, | |
| "rewards/format_reward": 1.0, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.97135925292969, | |
| "epoch": 5.854166666666667, | |
| "grad_norm": 1.922113400060024, | |
| "kl": 0.046875, | |
| "learning_rate": 4.2499999999999995e-07, | |
| "loss": 0.002, | |
| "reward": 1.8878214359283447, | |
| "reward_std": 0.023371964693069458, | |
| "rewards/accuracy_reward": 0.8878213763237, | |
| "rewards/format_reward": 1.0, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.08073425292969, | |
| "epoch": 5.875, | |
| "grad_norm": 1.786331439327784, | |
| "kl": 0.04541015625, | |
| "learning_rate": 4.2291666666666666e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9193141460418701, | |
| "reward_std": 0.02163059636950493, | |
| "rewards/accuracy_reward": 0.9206160306930542, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.82421875, | |
| "epoch": 5.895833333333333, | |
| "grad_norm": 4.285948048806076, | |
| "kl": 0.0419921875, | |
| "learning_rate": 4.208333333333333e-07, | |
| "loss": 0.0017, | |
| "reward": 1.915520429611206, | |
| "reward_std": 0.021722108125686646, | |
| "rewards/accuracy_reward": 0.9155203700065613, | |
| "rewards/format_reward": 1.0, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.68229675292969, | |
| "epoch": 5.916666666666667, | |
| "grad_norm": 2.1862985744626102, | |
| "kl": 0.045166015625, | |
| "learning_rate": 4.1875e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8867233991622925, | |
| "reward_std": 0.022505465894937515, | |
| "rewards/accuracy_reward": 0.8867233991622925, | |
| "rewards/format_reward": 1.0, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.41536712646484, | |
| "epoch": 5.9375, | |
| "grad_norm": 2.6822943568035655, | |
| "kl": 0.050537109375, | |
| "learning_rate": 4.1666666666666667e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9038598537445068, | |
| "reward_std": 0.02783789113163948, | |
| "rewards/accuracy_reward": 0.9051617980003357, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.13542175292969, | |
| "epoch": 5.958333333333333, | |
| "grad_norm": 1.8215228571783213, | |
| "kl": 0.0498046875, | |
| "learning_rate": 4.145833333333333e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9209411144256592, | |
| "reward_std": 0.020260518416762352, | |
| "rewards/accuracy_reward": 0.9209408760070801, | |
| "rewards/format_reward": 1.0, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.25901794433594, | |
| "epoch": 5.979166666666667, | |
| "grad_norm": 1.7303647275137022, | |
| "kl": 0.04931640625, | |
| "learning_rate": 4.1249999999999997e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8940850496292114, | |
| "reward_std": 0.02454877458512783, | |
| "rewards/accuracy_reward": 0.8954200744628906, | |
| "rewards/format_reward": 0.998664915561676, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.49870300292969, | |
| "epoch": 6.020833333333333, | |
| "grad_norm": 2.065626526970926, | |
| "kl": 0.045654296875, | |
| "learning_rate": 4.104166666666666e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8828470706939697, | |
| "reward_std": 0.023679915815591812, | |
| "rewards/accuracy_reward": 0.8841490745544434, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.63021087646484, | |
| "epoch": 6.041666666666667, | |
| "grad_norm": 2.407620241107693, | |
| "kl": 0.044189453125, | |
| "learning_rate": 4.083333333333333e-07, | |
| "loss": 0.0018, | |
| "reward": 1.922331690788269, | |
| "reward_std": 0.02023524045944214, | |
| "rewards/accuracy_reward": 0.9223315715789795, | |
| "rewards/format_reward": 1.0, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.02734375, | |
| "epoch": 6.0625, | |
| "grad_norm": 2.583573023564423, | |
| "kl": 0.043212890625, | |
| "learning_rate": 4.0625e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9238051176071167, | |
| "reward_std": 0.01875409483909607, | |
| "rewards/accuracy_reward": 0.9238051176071167, | |
| "rewards/format_reward": 1.0, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.75130462646484, | |
| "epoch": 6.083333333333333, | |
| "grad_norm": 1.5921015904802127, | |
| "kl": 0.04736328125, | |
| "learning_rate": 4.041666666666667e-07, | |
| "loss": 0.0019, | |
| "reward": 1.90175199508667, | |
| "reward_std": 0.01868622750043869, | |
| "rewards/accuracy_reward": 0.9017519950866699, | |
| "rewards/format_reward": 1.0, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.14453125, | |
| "epoch": 6.104166666666667, | |
| "grad_norm": 2.063415833551645, | |
| "kl": 0.04638671875, | |
| "learning_rate": 4.0208333333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9288432598114014, | |
| "reward_std": 0.017594818025827408, | |
| "rewards/accuracy_reward": 0.9288431406021118, | |
| "rewards/format_reward": 1.0, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.52995300292969, | |
| "epoch": 6.125, | |
| "grad_norm": 2.6980955335354597, | |
| "kl": 0.043212890625, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9106167554855347, | |
| "reward_std": 0.020342741161584854, | |
| "rewards/accuracy_reward": 0.9106166362762451, | |
| "rewards/format_reward": 1.0, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.89583587646484, | |
| "epoch": 6.145833333333333, | |
| "grad_norm": 1.7057280834613109, | |
| "kl": 0.041259765625, | |
| "learning_rate": 3.9791666666666663e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9230403900146484, | |
| "reward_std": 0.018507663160562515, | |
| "rewards/accuracy_reward": 0.9230403900146484, | |
| "rewards/format_reward": 1.0, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.31640625, | |
| "epoch": 6.166666666666667, | |
| "grad_norm": 1.7481050352203042, | |
| "kl": 0.04296875, | |
| "learning_rate": 3.958333333333333e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8892457485198975, | |
| "reward_std": 0.025482675060629845, | |
| "rewards/accuracy_reward": 0.8905477523803711, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.57552337646484, | |
| "epoch": 6.1875, | |
| "grad_norm": 1.5790651747348239, | |
| "kl": 0.045654296875, | |
| "learning_rate": 3.9375e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9172825813293457, | |
| "reward_std": 0.01870821975171566, | |
| "rewards/accuracy_reward": 0.9172827005386353, | |
| "rewards/format_reward": 1.0, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.67708587646484, | |
| "epoch": 6.208333333333333, | |
| "grad_norm": 5.985924437288418, | |
| "kl": 0.0458984375, | |
| "learning_rate": 3.9166666666666664e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9062340259552002, | |
| "reward_std": 0.017811615020036697, | |
| "rewards/accuracy_reward": 0.9062339663505554, | |
| "rewards/format_reward": 1.0, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.33854675292969, | |
| "epoch": 6.229166666666667, | |
| "grad_norm": 29.636849516000904, | |
| "kl": 0.046630859375, | |
| "learning_rate": 3.8958333333333334e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9097627401351929, | |
| "reward_std": 0.025265123695135117, | |
| "rewards/accuracy_reward": 0.9110648036003113, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.21745300292969, | |
| "epoch": 6.25, | |
| "grad_norm": 2.499873249076194, | |
| "kl": 0.052001953125, | |
| "learning_rate": 3.875e-07, | |
| "loss": 0.0021, | |
| "reward": 1.908249855041504, | |
| "reward_std": 0.021753787994384766, | |
| "rewards/accuracy_reward": 0.9082497358322144, | |
| "rewards/format_reward": 1.0, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.96875, | |
| "epoch": 6.270833333333333, | |
| "grad_norm": 1.43250838119596, | |
| "kl": 0.0478515625, | |
| "learning_rate": 3.8541666666666665e-07, | |
| "loss": 0.002, | |
| "reward": 1.9135513305664062, | |
| "reward_std": 0.02083742991089821, | |
| "rewards/accuracy_reward": 0.9148534536361694, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.49479675292969, | |
| "epoch": 6.291666666666667, | |
| "grad_norm": 1.9939986843684863, | |
| "kl": 0.046630859375, | |
| "learning_rate": 3.8333333333333335e-07, | |
| "loss": 0.002, | |
| "reward": 1.9211857318878174, | |
| "reward_std": 0.019976306706666946, | |
| "rewards/accuracy_reward": 0.9211856722831726, | |
| "rewards/format_reward": 1.0, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.37239837646484, | |
| "epoch": 6.3125, | |
| "grad_norm": 1.8430355626574486, | |
| "kl": 0.047607421875, | |
| "learning_rate": 3.8124999999999995e-07, | |
| "loss": 0.002, | |
| "reward": 1.9055722951889038, | |
| "reward_std": 0.020207837224006653, | |
| "rewards/accuracy_reward": 0.9055722951889038, | |
| "rewards/format_reward": 1.0, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.99870300292969, | |
| "epoch": 6.333333333333333, | |
| "grad_norm": 4.534254461924897, | |
| "kl": 0.044921875, | |
| "learning_rate": 3.7916666666666665e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8959176540374756, | |
| "reward_std": 0.018412087112665176, | |
| "rewards/accuracy_reward": 0.8959175944328308, | |
| "rewards/format_reward": 1.0, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.14714050292969, | |
| "epoch": 6.354166666666667, | |
| "grad_norm": 1.5417912395172437, | |
| "kl": 0.04638671875, | |
| "learning_rate": 3.770833333333333e-07, | |
| "loss": 0.002, | |
| "reward": 1.9178366661071777, | |
| "reward_std": 0.01834617182612419, | |
| "rewards/accuracy_reward": 0.9178365468978882, | |
| "rewards/format_reward": 1.0, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.67448425292969, | |
| "epoch": 6.375, | |
| "grad_norm": 10.381006533833029, | |
| "kl": 0.043701171875, | |
| "learning_rate": 3.75e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9170053005218506, | |
| "reward_std": 0.020167209208011627, | |
| "rewards/accuracy_reward": 0.9183073043823242, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.34635925292969, | |
| "epoch": 6.395833333333333, | |
| "grad_norm": 3.2103701478375366, | |
| "kl": 0.046630859375, | |
| "learning_rate": 3.7291666666666666e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9029948711395264, | |
| "reward_std": 0.019157804548740387, | |
| "rewards/accuracy_reward": 0.9029948711395264, | |
| "rewards/format_reward": 1.0, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.34114837646484, | |
| "epoch": 6.416666666666667, | |
| "grad_norm": 2.1215276638392253, | |
| "kl": 0.051025390625, | |
| "learning_rate": 3.708333333333333e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9299815893173218, | |
| "reward_std": 0.01765059307217598, | |
| "rewards/accuracy_reward": 0.9299815893173218, | |
| "rewards/format_reward": 1.0, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.96354675292969, | |
| "epoch": 6.4375, | |
| "grad_norm": 2.0251198297391815, | |
| "kl": 0.04736328125, | |
| "learning_rate": 3.6875e-07, | |
| "loss": 0.002, | |
| "reward": 1.901715636253357, | |
| "reward_std": 0.024447208270430565, | |
| "rewards/accuracy_reward": 0.9030176401138306, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.21224212646484, | |
| "epoch": 6.458333333333333, | |
| "grad_norm": 5.5850053870802645, | |
| "kl": 0.04345703125, | |
| "learning_rate": 3.666666666666666e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9226205348968506, | |
| "reward_std": 0.01778128370642662, | |
| "rewards/accuracy_reward": 0.9226205348968506, | |
| "rewards/format_reward": 1.0, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.30989837646484, | |
| "epoch": 6.479166666666667, | |
| "grad_norm": 2.1418335808540183, | |
| "kl": 0.045654296875, | |
| "learning_rate": 3.645833333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9199604988098145, | |
| "reward_std": 0.018466424196958542, | |
| "rewards/accuracy_reward": 0.919960618019104, | |
| "rewards/format_reward": 1.0, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.81771087646484, | |
| "epoch": 6.5, | |
| "grad_norm": 1.5934526555915005, | |
| "kl": 0.051513671875, | |
| "learning_rate": 3.6249999999999997e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9024890661239624, | |
| "reward_std": 0.018778668716549873, | |
| "rewards/accuracy_reward": 0.9024890661239624, | |
| "rewards/format_reward": 1.0, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.77474212646484, | |
| "epoch": 6.520833333333333, | |
| "grad_norm": 9.90923642720643, | |
| "kl": 0.04736328125, | |
| "learning_rate": 3.604166666666666e-07, | |
| "loss": 0.002, | |
| "reward": 1.9020146131515503, | |
| "reward_std": 0.021591586992144585, | |
| "rewards/accuracy_reward": 0.9033166766166687, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.80859375, | |
| "epoch": 6.541666666666667, | |
| "grad_norm": 2.2902946965667996, | |
| "kl": 0.040771484375, | |
| "learning_rate": 3.583333333333333e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9083236455917358, | |
| "reward_std": 0.020869677886366844, | |
| "rewards/accuracy_reward": 0.9083236455917358, | |
| "rewards/format_reward": 1.0, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.97135925292969, | |
| "epoch": 6.5625, | |
| "grad_norm": 2.141619909298115, | |
| "kl": 0.04443359375, | |
| "learning_rate": 3.5625e-07, | |
| "loss": 0.0019, | |
| "reward": 1.895308494567871, | |
| "reward_std": 0.021585416048765182, | |
| "rewards/accuracy_reward": 0.8966106176376343, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.83984375, | |
| "epoch": 6.583333333333333, | |
| "grad_norm": 1.722945461090862, | |
| "kl": 0.052490234375, | |
| "learning_rate": 3.541666666666667e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9142106771469116, | |
| "reward_std": 0.022508492693305016, | |
| "rewards/accuracy_reward": 0.91551274061203, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.95703125, | |
| "epoch": 6.604166666666667, | |
| "grad_norm": 3.6778187227400396, | |
| "kl": 0.0498046875, | |
| "learning_rate": 3.5208333333333333e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8938100337982178, | |
| "reward_std": 0.024937432259321213, | |
| "rewards/accuracy_reward": 0.8951120376586914, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.52995300292969, | |
| "epoch": 6.625, | |
| "grad_norm": 2.535477979468292, | |
| "kl": 0.0634765625, | |
| "learning_rate": 3.5e-07, | |
| "loss": 0.0026, | |
| "reward": 1.900233268737793, | |
| "reward_std": 0.01887938380241394, | |
| "rewards/accuracy_reward": 0.9002333879470825, | |
| "rewards/format_reward": 1.0, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.04817962646484, | |
| "epoch": 6.645833333333333, | |
| "grad_norm": 2.288272855129929, | |
| "kl": 0.045654296875, | |
| "learning_rate": 3.4791666666666664e-07, | |
| "loss": 0.0019, | |
| "reward": 1.912428855895996, | |
| "reward_std": 0.020974930375814438, | |
| "rewards/accuracy_reward": 0.9124290347099304, | |
| "rewards/format_reward": 1.0, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.02864837646484, | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 2.0183407375033413, | |
| "kl": 0.04736328125, | |
| "learning_rate": 3.458333333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9255565404891968, | |
| "reward_std": 0.01740911416709423, | |
| "rewards/accuracy_reward": 0.9255565404891968, | |
| "rewards/format_reward": 1.0, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.26302337646484, | |
| "epoch": 6.6875, | |
| "grad_norm": 2.4818346839800007, | |
| "kl": 0.04736328125, | |
| "learning_rate": 3.4375e-07, | |
| "loss": 0.0019, | |
| "reward": 1.882767915725708, | |
| "reward_std": 0.024225857108831406, | |
| "rewards/accuracy_reward": 0.8827678561210632, | |
| "rewards/format_reward": 1.0, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.06771087646484, | |
| "epoch": 6.708333333333333, | |
| "grad_norm": 1.4343972340076592, | |
| "kl": 0.05322265625, | |
| "learning_rate": 3.4166666666666664e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9222266674041748, | |
| "reward_std": 0.015834566205739975, | |
| "rewards/accuracy_reward": 0.92222660779953, | |
| "rewards/format_reward": 1.0, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.90234375, | |
| "epoch": 6.729166666666667, | |
| "grad_norm": 1.852771798485069, | |
| "kl": 0.0517578125, | |
| "learning_rate": 3.3958333333333335e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9179143905639648, | |
| "reward_std": 0.02144519053399563, | |
| "rewards/accuracy_reward": 0.9192163944244385, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.6328125, | |
| "epoch": 6.75, | |
| "grad_norm": 3.6471691567389057, | |
| "kl": 0.054443359375, | |
| "learning_rate": 3.375e-07, | |
| "loss": 0.0022, | |
| "reward": 1.909096598625183, | |
| "reward_std": 0.01967495307326317, | |
| "rewards/accuracy_reward": 0.9090965986251831, | |
| "rewards/format_reward": 1.0, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.61458587646484, | |
| "epoch": 6.770833333333333, | |
| "grad_norm": 4.178729179494792, | |
| "kl": 0.047607421875, | |
| "learning_rate": 3.3541666666666665e-07, | |
| "loss": 0.002, | |
| "reward": 1.923593521118164, | |
| "reward_std": 0.016256026923656464, | |
| "rewards/accuracy_reward": 0.9235934019088745, | |
| "rewards/format_reward": 1.0, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.46875, | |
| "epoch": 6.791666666666667, | |
| "grad_norm": 3.850110305328207, | |
| "kl": 0.05224609375, | |
| "learning_rate": 3.333333333333333e-07, | |
| "loss": 0.0022, | |
| "reward": 1.8894399404525757, | |
| "reward_std": 0.0215081088244915, | |
| "rewards/accuracy_reward": 0.8894399404525757, | |
| "rewards/format_reward": 1.0, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.98567962646484, | |
| "epoch": 6.8125, | |
| "grad_norm": 1.5295790546923704, | |
| "kl": 0.049560546875, | |
| "learning_rate": 3.3124999999999995e-07, | |
| "loss": 0.0021, | |
| "reward": 1.902695894241333, | |
| "reward_std": 0.022552501410245895, | |
| "rewards/accuracy_reward": 0.9039978384971619, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.2109375, | |
| "epoch": 6.833333333333333, | |
| "grad_norm": 2.177500326990495, | |
| "kl": 0.049560546875, | |
| "learning_rate": 3.2916666666666666e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9026316404342651, | |
| "reward_std": 0.028222566470503807, | |
| "rewards/accuracy_reward": 0.9039337038993835, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.26823425292969, | |
| "epoch": 6.854166666666667, | |
| "grad_norm": 2.25656186299227, | |
| "kl": 0.049072265625, | |
| "learning_rate": 3.270833333333333e-07, | |
| "loss": 0.002, | |
| "reward": 1.9238814115524292, | |
| "reward_std": 0.01956191472709179, | |
| "rewards/accuracy_reward": 0.9238814115524292, | |
| "rewards/format_reward": 1.0, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.48177337646484, | |
| "epoch": 6.875, | |
| "grad_norm": 1.962758450391304, | |
| "kl": 0.044677734375, | |
| "learning_rate": 3.25e-07, | |
| "loss": 0.0019, | |
| "reward": 1.902562141418457, | |
| "reward_std": 0.016970310360193253, | |
| "rewards/accuracy_reward": 0.902562141418457, | |
| "rewards/format_reward": 1.0, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.42317962646484, | |
| "epoch": 6.895833333333333, | |
| "grad_norm": 1.932226827561351, | |
| "kl": 0.052978515625, | |
| "learning_rate": 3.2291666666666666e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9169180393218994, | |
| "reward_std": 0.01541107427328825, | |
| "rewards/accuracy_reward": 0.9169179797172546, | |
| "rewards/format_reward": 1.0, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.95442962646484, | |
| "epoch": 6.916666666666667, | |
| "grad_norm": 2.123672528629966, | |
| "kl": 0.0546875, | |
| "learning_rate": 3.2083333333333337e-07, | |
| "loss": 0.0023, | |
| "reward": 1.9134833812713623, | |
| "reward_std": 0.018955400213599205, | |
| "rewards/accuracy_reward": 0.9134833812713623, | |
| "rewards/format_reward": 1.0, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.23177337646484, | |
| "epoch": 6.9375, | |
| "grad_norm": 2.7379081437953436, | |
| "kl": 0.04736328125, | |
| "learning_rate": 3.1874999999999997e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8997161388397217, | |
| "reward_std": 0.019093122333288193, | |
| "rewards/accuracy_reward": 0.8997160792350769, | |
| "rewards/format_reward": 1.0, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.70052337646484, | |
| "epoch": 6.958333333333333, | |
| "grad_norm": 2.033398718348136, | |
| "kl": 0.051025390625, | |
| "learning_rate": 3.166666666666666e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9058278799057007, | |
| "reward_std": 0.020390968769788742, | |
| "rewards/accuracy_reward": 0.9058279991149902, | |
| "rewards/format_reward": 1.0, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.6568832397461, | |
| "epoch": 6.979166666666667, | |
| "grad_norm": 4.2350616184619625, | |
| "kl": 0.0478515625, | |
| "learning_rate": 3.145833333333333e-07, | |
| "loss": 0.002, | |
| "reward": 1.9187196493148804, | |
| "reward_std": 0.019620845094323158, | |
| "rewards/accuracy_reward": 0.9187195301055908, | |
| "rewards/format_reward": 1.0, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.58984375, | |
| "epoch": 7.020833333333333, | |
| "grad_norm": 1.3225122948151316, | |
| "kl": 0.04833984375, | |
| "learning_rate": 3.1249999999999997e-07, | |
| "loss": 0.002, | |
| "reward": 1.904463768005371, | |
| "reward_std": 0.016730796545743942, | |
| "rewards/accuracy_reward": 0.9044637680053711, | |
| "rewards/format_reward": 1.0, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.29036712646484, | |
| "epoch": 7.041666666666667, | |
| "grad_norm": 2.0001558004119135, | |
| "kl": 0.04931640625, | |
| "learning_rate": 3.104166666666667e-07, | |
| "loss": 0.002, | |
| "reward": 1.9282145500183105, | |
| "reward_std": 0.020631009712815285, | |
| "rewards/accuracy_reward": 0.9282145500183105, | |
| "rewards/format_reward": 1.0, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.10026550292969, | |
| "epoch": 7.0625, | |
| "grad_norm": 1.7677360413399623, | |
| "kl": 0.046142578125, | |
| "learning_rate": 3.0833333333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.913468360900879, | |
| "reward_std": 0.01975328102707863, | |
| "rewards/accuracy_reward": 0.9134685397148132, | |
| "rewards/format_reward": 1.0, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.64583587646484, | |
| "epoch": 7.083333333333333, | |
| "grad_norm": 1.5863979383855535, | |
| "kl": 0.044921875, | |
| "learning_rate": 3.0625000000000003e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9101800918579102, | |
| "reward_std": 0.018298618495464325, | |
| "rewards/accuracy_reward": 0.9101800918579102, | |
| "rewards/format_reward": 1.0, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.95573425292969, | |
| "epoch": 7.104166666666667, | |
| "grad_norm": 1.7406147667004834, | |
| "kl": 0.04296875, | |
| "learning_rate": 3.0416666666666663e-07, | |
| "loss": 0.0017, | |
| "reward": 1.9128578901290894, | |
| "reward_std": 0.025313010439276695, | |
| "rewards/accuracy_reward": 0.9141599535942078, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.0390625, | |
| "epoch": 7.125, | |
| "grad_norm": 6.234665754184476, | |
| "kl": 0.046875, | |
| "learning_rate": 3.020833333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8841427564620972, | |
| "reward_std": 0.020634343847632408, | |
| "rewards/accuracy_reward": 0.8841428160667419, | |
| "rewards/format_reward": 1.0, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.65104675292969, | |
| "epoch": 7.145833333333333, | |
| "grad_norm": 2.942912834547579, | |
| "kl": 0.053955078125, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9155168533325195, | |
| "reward_std": 0.019924897700548172, | |
| "rewards/accuracy_reward": 0.9155170321464539, | |
| "rewards/format_reward": 1.0, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.85026550292969, | |
| "epoch": 7.166666666666667, | |
| "grad_norm": 1.658655338447552, | |
| "kl": 0.048583984375, | |
| "learning_rate": 2.9791666666666664e-07, | |
| "loss": 0.002, | |
| "reward": 1.9088094234466553, | |
| "reward_std": 0.025105763226747513, | |
| "rewards/accuracy_reward": 0.9101114273071289, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.1953125, | |
| "epoch": 7.1875, | |
| "grad_norm": 2.4406244402139365, | |
| "kl": 0.04833984375, | |
| "learning_rate": 2.9583333333333334e-07, | |
| "loss": 0.002, | |
| "reward": 1.9100263118743896, | |
| "reward_std": 0.02519826404750347, | |
| "rewards/accuracy_reward": 0.9113283157348633, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.19140625, | |
| "epoch": 7.208333333333333, | |
| "grad_norm": 4.271413410927959, | |
| "kl": 0.048828125, | |
| "learning_rate": 2.9375e-07, | |
| "loss": 0.002, | |
| "reward": 1.9093725681304932, | |
| "reward_std": 0.01967555098235607, | |
| "rewards/accuracy_reward": 0.9093725085258484, | |
| "rewards/format_reward": 1.0, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.4140625, | |
| "epoch": 7.229166666666667, | |
| "grad_norm": 2.3763872048797414, | |
| "kl": 0.0654296875, | |
| "learning_rate": 2.916666666666667e-07, | |
| "loss": 0.0027, | |
| "reward": 1.90401029586792, | |
| "reward_std": 0.023377878591418266, | |
| "rewards/accuracy_reward": 0.9053124189376831, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.7265625, | |
| "epoch": 7.25, | |
| "grad_norm": 2.806628637706787, | |
| "kl": 0.04443359375, | |
| "learning_rate": 2.8958333333333335e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8880078792572021, | |
| "reward_std": 0.023502841591835022, | |
| "rewards/accuracy_reward": 0.8880078196525574, | |
| "rewards/format_reward": 1.0, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.31510925292969, | |
| "epoch": 7.270833333333333, | |
| "grad_norm": 3.43433922412685, | |
| "kl": 0.0888671875, | |
| "learning_rate": 2.8749999999999995e-07, | |
| "loss": 0.0036, | |
| "reward": 1.9123291969299316, | |
| "reward_std": 0.020550193265080452, | |
| "rewards/accuracy_reward": 0.9123293161392212, | |
| "rewards/format_reward": 1.0, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 103.36589050292969, | |
| "epoch": 7.291666666666667, | |
| "grad_norm": 2.842653921619938, | |
| "kl": 0.046142578125, | |
| "learning_rate": 2.8541666666666665e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9058549404144287, | |
| "reward_std": 0.019686056300997734, | |
| "rewards/accuracy_reward": 0.9058548808097839, | |
| "rewards/format_reward": 1.0, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.89974212646484, | |
| "epoch": 7.3125, | |
| "grad_norm": 2.0405865834971473, | |
| "kl": 0.046630859375, | |
| "learning_rate": 2.833333333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9217561483383179, | |
| "reward_std": 0.018797121942043304, | |
| "rewards/accuracy_reward": 0.9217562675476074, | |
| "rewards/format_reward": 1.0, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.70182800292969, | |
| "epoch": 7.333333333333333, | |
| "grad_norm": 1.9486921558633905, | |
| "kl": 0.0458984375, | |
| "learning_rate": 2.8125e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8922131061553955, | |
| "reward_std": 0.02469293400645256, | |
| "rewards/accuracy_reward": 0.892212986946106, | |
| "rewards/format_reward": 1.0, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.19140625, | |
| "epoch": 7.354166666666667, | |
| "grad_norm": 2.5259874539949303, | |
| "kl": 0.045654296875, | |
| "learning_rate": 2.7916666666666666e-07, | |
| "loss": 0.0019, | |
| "reward": 1.905487060546875, | |
| "reward_std": 0.023547440767288208, | |
| "rewards/accuracy_reward": 0.905487060546875, | |
| "rewards/format_reward": 1.0, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.25130462646484, | |
| "epoch": 7.375, | |
| "grad_norm": 6.590006203990273, | |
| "kl": 0.055908203125, | |
| "learning_rate": 2.770833333333333e-07, | |
| "loss": 0.0023, | |
| "reward": 1.931574821472168, | |
| "reward_std": 0.019262373447418213, | |
| "rewards/accuracy_reward": 0.9315750002861023, | |
| "rewards/format_reward": 1.0, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.35677337646484, | |
| "epoch": 7.395833333333333, | |
| "grad_norm": 4.321196737881709, | |
| "kl": 0.049560546875, | |
| "learning_rate": 2.75e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9387693405151367, | |
| "reward_std": 0.017124010249972343, | |
| "rewards/accuracy_reward": 0.9387692213058472, | |
| "rewards/format_reward": 1.0, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.671875, | |
| "epoch": 7.416666666666667, | |
| "grad_norm": 2.8457346329200557, | |
| "kl": 0.0458984375, | |
| "learning_rate": 2.729166666666666e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9095313549041748, | |
| "reward_std": 0.021084271371364594, | |
| "rewards/accuracy_reward": 0.90953129529953, | |
| "rewards/format_reward": 1.0, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.71224212646484, | |
| "epoch": 7.4375, | |
| "grad_norm": 1.993210730812182, | |
| "kl": 0.046875, | |
| "learning_rate": 2.708333333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9209964275360107, | |
| "reward_std": 0.020304953679442406, | |
| "rewards/accuracy_reward": 0.9209963083267212, | |
| "rewards/format_reward": 1.0, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.25521087646484, | |
| "epoch": 7.458333333333333, | |
| "grad_norm": 2.4373953511615856, | |
| "kl": 0.0458984375, | |
| "learning_rate": 2.6874999999999997e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9061431884765625, | |
| "reward_std": 0.01994149014353752, | |
| "rewards/accuracy_reward": 0.906143069267273, | |
| "rewards/format_reward": 1.0, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.44140625, | |
| "epoch": 7.479166666666667, | |
| "grad_norm": 3.7778106185692635, | |
| "kl": 0.053466796875, | |
| "learning_rate": 2.6666666666666667e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9190186262130737, | |
| "reward_std": 0.020440340042114258, | |
| "rewards/accuracy_reward": 0.9190186262130737, | |
| "rewards/format_reward": 1.0, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.86589050292969, | |
| "epoch": 7.5, | |
| "grad_norm": 2.376594210526858, | |
| "kl": 0.047119140625, | |
| "learning_rate": 2.645833333333333e-07, | |
| "loss": 0.002, | |
| "reward": 1.9218724966049194, | |
| "reward_std": 0.02028195932507515, | |
| "rewards/accuracy_reward": 0.9218723773956299, | |
| "rewards/format_reward": 1.0, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.45052337646484, | |
| "epoch": 7.520833333333333, | |
| "grad_norm": 1.7750680835812318, | |
| "kl": 0.048583984375, | |
| "learning_rate": 2.625e-07, | |
| "loss": 0.002, | |
| "reward": 1.90578293800354, | |
| "reward_std": 0.018642796203494072, | |
| "rewards/accuracy_reward": 0.9057828187942505, | |
| "rewards/format_reward": 1.0, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.1484375, | |
| "epoch": 7.541666666666667, | |
| "grad_norm": 2.022098717469963, | |
| "kl": 0.04833984375, | |
| "learning_rate": 2.604166666666667e-07, | |
| "loss": 0.002, | |
| "reward": 1.895311713218689, | |
| "reward_std": 0.024539759382605553, | |
| "rewards/accuracy_reward": 0.8966139554977417, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.81380462646484, | |
| "epoch": 7.5625, | |
| "grad_norm": 4.701010132350987, | |
| "kl": 0.053466796875, | |
| "learning_rate": 2.5833333333333333e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9116785526275635, | |
| "reward_std": 0.01904495432972908, | |
| "rewards/accuracy_reward": 0.9116784930229187, | |
| "rewards/format_reward": 1.0, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.61328125, | |
| "epoch": 7.583333333333333, | |
| "grad_norm": 2.7971888222519197, | |
| "kl": 0.04638671875, | |
| "learning_rate": 2.5625e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9103072881698608, | |
| "reward_std": 0.019077036529779434, | |
| "rewards/accuracy_reward": 0.9103072881698608, | |
| "rewards/format_reward": 1.0, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.35677337646484, | |
| "epoch": 7.604166666666667, | |
| "grad_norm": 3.340602697847883, | |
| "kl": 0.049560546875, | |
| "learning_rate": 2.5416666666666663e-07, | |
| "loss": 0.002, | |
| "reward": 1.9107370376586914, | |
| "reward_std": 0.01656663417816162, | |
| "rewards/accuracy_reward": 0.9107369780540466, | |
| "rewards/format_reward": 1.0, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.40755462646484, | |
| "epoch": 7.625, | |
| "grad_norm": 3.4757576150836527, | |
| "kl": 0.052978515625, | |
| "learning_rate": 2.5208333333333334e-07, | |
| "loss": 0.0022, | |
| "reward": 1.897055983543396, | |
| "reward_std": 0.02028050646185875, | |
| "rewards/accuracy_reward": 0.897055983543396, | |
| "rewards/format_reward": 1.0, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.87109375, | |
| "epoch": 7.645833333333333, | |
| "grad_norm": 1.7537209554556075, | |
| "kl": 0.052001953125, | |
| "learning_rate": 2.5e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9118655920028687, | |
| "reward_std": 0.019177807494997978, | |
| "rewards/accuracy_reward": 0.9118657112121582, | |
| "rewards/format_reward": 1.0, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.87760925292969, | |
| "epoch": 7.666666666666667, | |
| "grad_norm": 1.985054115096998, | |
| "kl": 0.053955078125, | |
| "learning_rate": 2.4791666666666664e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9363340139389038, | |
| "reward_std": 0.01809048466384411, | |
| "rewards/accuracy_reward": 0.9363340139389038, | |
| "rewards/format_reward": 1.0, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.9375, | |
| "epoch": 7.6875, | |
| "grad_norm": 2.640046167255212, | |
| "kl": 0.055908203125, | |
| "learning_rate": 2.458333333333333e-07, | |
| "loss": 0.0023, | |
| "reward": 1.8969202041625977, | |
| "reward_std": 0.02410067245364189, | |
| "rewards/accuracy_reward": 0.8982224464416504, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.78646087646484, | |
| "epoch": 7.708333333333333, | |
| "grad_norm": 2.0927423356674155, | |
| "kl": 0.05322265625, | |
| "learning_rate": 2.4375e-07, | |
| "loss": 0.0023, | |
| "reward": 1.934058427810669, | |
| "reward_std": 0.020023031160235405, | |
| "rewards/accuracy_reward": 0.9340583682060242, | |
| "rewards/format_reward": 1.0, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.98177337646484, | |
| "epoch": 7.729166666666667, | |
| "grad_norm": 5.374648001232616, | |
| "kl": 0.05517578125, | |
| "learning_rate": 2.4166666666666665e-07, | |
| "loss": 0.0023, | |
| "reward": 1.9142836332321167, | |
| "reward_std": 0.01773514598608017, | |
| "rewards/accuracy_reward": 0.9142836928367615, | |
| "rewards/format_reward": 1.0, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.72396087646484, | |
| "epoch": 7.75, | |
| "grad_norm": 1.5292733041237832, | |
| "kl": 0.050537109375, | |
| "learning_rate": 2.3958333333333335e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9197324514389038, | |
| "reward_std": 0.0220477432012558, | |
| "rewards/accuracy_reward": 0.9210345149040222, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.1875, | |
| "epoch": 7.770833333333333, | |
| "grad_norm": 9.487726177733885, | |
| "kl": 0.045654296875, | |
| "learning_rate": 2.3749999999999998e-07, | |
| "loss": 0.0019, | |
| "reward": 1.927847146987915, | |
| "reward_std": 0.017860591411590576, | |
| "rewards/accuracy_reward": 0.9278470873832703, | |
| "rewards/format_reward": 1.0, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 112.3203125, | |
| "epoch": 7.791666666666667, | |
| "grad_norm": 2.667778768562944, | |
| "kl": 0.0458984375, | |
| "learning_rate": 2.3541666666666665e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9139199256896973, | |
| "reward_std": 0.02642824873328209, | |
| "rewards/accuracy_reward": 0.9178261756896973, | |
| "rewards/format_reward": 0.99609375, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.01302337646484, | |
| "epoch": 7.8125, | |
| "grad_norm": 2.86218482085828, | |
| "kl": 0.051025390625, | |
| "learning_rate": 2.3333333333333333e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9041519165039062, | |
| "reward_std": 0.02404339425265789, | |
| "rewards/accuracy_reward": 0.9054540395736694, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.94401550292969, | |
| "epoch": 7.833333333333333, | |
| "grad_norm": 1.5489939957891778, | |
| "kl": 0.05322265625, | |
| "learning_rate": 2.3125e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9263066053390503, | |
| "reward_std": 0.019394386559724808, | |
| "rewards/accuracy_reward": 0.9263066649436951, | |
| "rewards/format_reward": 1.0, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.15755462646484, | |
| "epoch": 7.854166666666667, | |
| "grad_norm": 2.64116020814343, | |
| "kl": 0.052490234375, | |
| "learning_rate": 2.2916666666666663e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9046939611434937, | |
| "reward_std": 0.023857450112700462, | |
| "rewards/accuracy_reward": 0.9059960842132568, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.32552337646484, | |
| "epoch": 7.875, | |
| "grad_norm": 1.80983212656234, | |
| "kl": 0.044921875, | |
| "learning_rate": 2.270833333333333e-07, | |
| "loss": 0.0018, | |
| "reward": 1.934501051902771, | |
| "reward_std": 0.016163021326065063, | |
| "rewards/accuracy_reward": 0.934501051902771, | |
| "rewards/format_reward": 1.0, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.37890625, | |
| "epoch": 7.895833333333333, | |
| "grad_norm": 10.563112786998811, | |
| "kl": 0.048583984375, | |
| "learning_rate": 2.25e-07, | |
| "loss": 0.002, | |
| "reward": 1.8924764394760132, | |
| "reward_std": 0.019610995426774025, | |
| "rewards/accuracy_reward": 0.892476499080658, | |
| "rewards/format_reward": 1.0, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.34896087646484, | |
| "epoch": 7.916666666666667, | |
| "grad_norm": 2.533818918571777, | |
| "kl": 0.044921875, | |
| "learning_rate": 2.2291666666666667e-07, | |
| "loss": 0.0019, | |
| "reward": 1.906684398651123, | |
| "reward_std": 0.01782190427184105, | |
| "rewards/accuracy_reward": 0.906684398651123, | |
| "rewards/format_reward": 1.0, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.51823425292969, | |
| "epoch": 7.9375, | |
| "grad_norm": 2.2661373394338926, | |
| "kl": 0.0546875, | |
| "learning_rate": 2.2083333333333332e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9244441986083984, | |
| "reward_std": 0.01977790892124176, | |
| "rewards/accuracy_reward": 0.9244440793991089, | |
| "rewards/format_reward": 1.0, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.78385925292969, | |
| "epoch": 7.958333333333333, | |
| "grad_norm": 1.7111686099562637, | |
| "kl": 0.051025390625, | |
| "learning_rate": 2.1875e-07, | |
| "loss": 0.0021, | |
| "reward": 1.897660255432129, | |
| "reward_std": 0.02316589280962944, | |
| "rewards/accuracy_reward": 0.8976603746414185, | |
| "rewards/format_reward": 1.0, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.38451385498047, | |
| "epoch": 7.979166666666667, | |
| "grad_norm": 3.8657178191371275, | |
| "kl": 0.044921875, | |
| "learning_rate": 2.1666666666666667e-07, | |
| "loss": 0.0019, | |
| "reward": 1.91867196559906, | |
| "reward_std": 0.018634842708706856, | |
| "rewards/accuracy_reward": 0.9186719655990601, | |
| "rewards/format_reward": 1.0, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.22917175292969, | |
| "epoch": 8.020833333333334, | |
| "grad_norm": 1.9741658039132675, | |
| "kl": 0.04638671875, | |
| "learning_rate": 2.145833333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9112396240234375, | |
| "reward_std": 0.01786581240594387, | |
| "rewards/accuracy_reward": 0.9112398028373718, | |
| "rewards/format_reward": 1.0, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.52604675292969, | |
| "epoch": 8.041666666666666, | |
| "grad_norm": 4.239952555901759, | |
| "kl": 0.04931640625, | |
| "learning_rate": 2.1249999999999998e-07, | |
| "loss": 0.002, | |
| "reward": 1.9145668745040894, | |
| "reward_std": 0.03050382435321808, | |
| "rewards/accuracy_reward": 0.9171710014343262, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.51171875, | |
| "epoch": 8.0625, | |
| "grad_norm": 3.1873677902649264, | |
| "kl": 0.04638671875, | |
| "learning_rate": 2.1041666666666665e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9099664688110352, | |
| "reward_std": 0.021909143775701523, | |
| "rewards/accuracy_reward": 0.9099664688110352, | |
| "rewards/format_reward": 1.0, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.73046875, | |
| "epoch": 8.083333333333334, | |
| "grad_norm": 1.4761981134253073, | |
| "kl": 0.043212890625, | |
| "learning_rate": 2.0833333333333333e-07, | |
| "loss": 0.0018, | |
| "reward": 1.908822774887085, | |
| "reward_std": 0.026725394651293755, | |
| "rewards/accuracy_reward": 0.9114267826080322, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.9765625, | |
| "epoch": 8.104166666666666, | |
| "grad_norm": 2.5298325743515413, | |
| "kl": 0.055908203125, | |
| "learning_rate": 2.0624999999999998e-07, | |
| "loss": 0.0023, | |
| "reward": 1.919610857963562, | |
| "reward_std": 0.022303760051727295, | |
| "rewards/accuracy_reward": 0.919610857963562, | |
| "rewards/format_reward": 1.0, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.39453125, | |
| "epoch": 8.125, | |
| "grad_norm": 1.6701037952221194, | |
| "kl": 0.04736328125, | |
| "learning_rate": 2.0416666666666666e-07, | |
| "loss": 0.002, | |
| "reward": 1.8952322006225586, | |
| "reward_std": 0.02632908523082733, | |
| "rewards/accuracy_reward": 0.8978363275527954, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.75911712646484, | |
| "epoch": 8.145833333333334, | |
| "grad_norm": 1.8105877210751262, | |
| "kl": 0.049560546875, | |
| "learning_rate": 2.0208333333333334e-07, | |
| "loss": 0.002, | |
| "reward": 1.9192695617675781, | |
| "reward_std": 0.02205723151564598, | |
| "rewards/accuracy_reward": 0.9192695617675781, | |
| "rewards/format_reward": 1.0, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.484375, | |
| "epoch": 8.166666666666666, | |
| "grad_norm": 3.6256736252714084, | |
| "kl": 0.052001953125, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0022, | |
| "reward": 1.899370551109314, | |
| "reward_std": 0.02411123923957348, | |
| "rewards/accuracy_reward": 0.9006726145744324, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.23698425292969, | |
| "epoch": 8.1875, | |
| "grad_norm": 2.8889796981288867, | |
| "kl": 0.05517578125, | |
| "learning_rate": 1.9791666666666664e-07, | |
| "loss": 0.0023, | |
| "reward": 1.9239459037780762, | |
| "reward_std": 0.020000584423542023, | |
| "rewards/accuracy_reward": 0.9239459037780762, | |
| "rewards/format_reward": 1.0, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.01953125, | |
| "epoch": 8.208333333333334, | |
| "grad_norm": 4.9653820626533385, | |
| "kl": 0.052734375, | |
| "learning_rate": 1.9583333333333332e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9158563613891602, | |
| "reward_std": 0.025936102494597435, | |
| "rewards/accuracy_reward": 0.9171584844589233, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.20964050292969, | |
| "epoch": 8.229166666666666, | |
| "grad_norm": 2.290735973233346, | |
| "kl": 0.055908203125, | |
| "learning_rate": 1.9375e-07, | |
| "loss": 0.0023, | |
| "reward": 1.920623540878296, | |
| "reward_std": 0.016118617728352547, | |
| "rewards/accuracy_reward": 0.9206234216690063, | |
| "rewards/format_reward": 1.0, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.79817962646484, | |
| "epoch": 8.25, | |
| "grad_norm": 2.14149502952453, | |
| "kl": 0.05078125, | |
| "learning_rate": 1.9166666666666668e-07, | |
| "loss": 0.0021, | |
| "reward": 1.8982393741607666, | |
| "reward_std": 0.02737743966281414, | |
| "rewards/accuracy_reward": 0.8995413780212402, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.703125, | |
| "epoch": 8.270833333333334, | |
| "grad_norm": 1.7520909830046585, | |
| "kl": 0.049560546875, | |
| "learning_rate": 1.8958333333333333e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9203259944915771, | |
| "reward_std": 0.017365001142024994, | |
| "rewards/accuracy_reward": 0.9203259348869324, | |
| "rewards/format_reward": 1.0, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.1953125, | |
| "epoch": 8.291666666666666, | |
| "grad_norm": 3.8682316676739044, | |
| "kl": 0.05322265625, | |
| "learning_rate": 1.875e-07, | |
| "loss": 0.0023, | |
| "reward": 1.90675950050354, | |
| "reward_std": 0.020597189664840698, | |
| "rewards/accuracy_reward": 0.9067594408988953, | |
| "rewards/format_reward": 1.0, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.93489837646484, | |
| "epoch": 8.3125, | |
| "grad_norm": 2.4384512365064483, | |
| "kl": 0.047607421875, | |
| "learning_rate": 1.8541666666666666e-07, | |
| "loss": 0.002, | |
| "reward": 1.923602819442749, | |
| "reward_std": 0.01742154359817505, | |
| "rewards/accuracy_reward": 0.9249049425125122, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.72005462646484, | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 2.234427870328764, | |
| "kl": 0.051513671875, | |
| "learning_rate": 1.833333333333333e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9096324443817139, | |
| "reward_std": 0.021227438002824783, | |
| "rewards/accuracy_reward": 0.9096323847770691, | |
| "rewards/format_reward": 1.0, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.46745300292969, | |
| "epoch": 8.354166666666666, | |
| "grad_norm": 1.9160175504578056, | |
| "kl": 0.05078125, | |
| "learning_rate": 1.8124999999999999e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9217007160186768, | |
| "reward_std": 0.01802412047982216, | |
| "rewards/accuracy_reward": 0.9230027198791504, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.35807800292969, | |
| "epoch": 8.375, | |
| "grad_norm": 5.1139313564913955, | |
| "kl": 0.044921875, | |
| "learning_rate": 1.7916666666666666e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9158090353012085, | |
| "reward_std": 0.021614177152514458, | |
| "rewards/accuracy_reward": 0.9171112775802612, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.94271087646484, | |
| "epoch": 8.395833333333334, | |
| "grad_norm": 1.8921820475793412, | |
| "kl": 0.046875, | |
| "learning_rate": 1.7708333333333334e-07, | |
| "loss": 0.002, | |
| "reward": 1.9103114604949951, | |
| "reward_std": 0.01914086937904358, | |
| "rewards/accuracy_reward": 0.9103114604949951, | |
| "rewards/format_reward": 1.0, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.96614837646484, | |
| "epoch": 8.416666666666666, | |
| "grad_norm": 2.8573255442328405, | |
| "kl": 0.050537109375, | |
| "learning_rate": 1.75e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9096262454986572, | |
| "reward_std": 0.01799336075782776, | |
| "rewards/accuracy_reward": 0.9096261262893677, | |
| "rewards/format_reward": 1.0, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.53646087646484, | |
| "epoch": 8.4375, | |
| "grad_norm": 2.533540112681752, | |
| "kl": 0.044677734375, | |
| "learning_rate": 1.7291666666666664e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9281089305877686, | |
| "reward_std": 0.01919987052679062, | |
| "rewards/accuracy_reward": 0.928108811378479, | |
| "rewards/format_reward": 1.0, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.0234375, | |
| "epoch": 8.458333333333334, | |
| "grad_norm": 2.2470205832432666, | |
| "kl": 0.04443359375, | |
| "learning_rate": 1.7083333333333332e-07, | |
| "loss": 0.0018, | |
| "reward": 1.9159855842590332, | |
| "reward_std": 0.021353445947170258, | |
| "rewards/accuracy_reward": 0.9159855842590332, | |
| "rewards/format_reward": 1.0, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.69792175292969, | |
| "epoch": 8.479166666666666, | |
| "grad_norm": 2.60464995508122, | |
| "kl": 0.046875, | |
| "learning_rate": 1.6875e-07, | |
| "loss": 0.002, | |
| "reward": 1.9079928398132324, | |
| "reward_std": 0.022192446514964104, | |
| "rewards/accuracy_reward": 0.909294843673706, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.828125, | |
| "epoch": 8.5, | |
| "grad_norm": 2.421415014993462, | |
| "kl": 0.053466796875, | |
| "learning_rate": 1.6666666666666665e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9306447505950928, | |
| "reward_std": 0.01638518087565899, | |
| "rewards/accuracy_reward": 0.9306447505950928, | |
| "rewards/format_reward": 1.0, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.39714050292969, | |
| "epoch": 8.520833333333334, | |
| "grad_norm": 1.7082957831764727, | |
| "kl": 0.056640625, | |
| "learning_rate": 1.6458333333333333e-07, | |
| "loss": 0.0024, | |
| "reward": 1.9131463766098022, | |
| "reward_std": 0.023682190105319023, | |
| "rewards/accuracy_reward": 0.9144482612609863, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 401 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.69010925292969, | |
| "epoch": 8.541666666666666, | |
| "grad_norm": 2.2438919940917197, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1.625e-07, | |
| "loss": 0.002, | |
| "reward": 1.9307001829147339, | |
| "reward_std": 0.016804661601781845, | |
| "rewards/accuracy_reward": 0.9307002425193787, | |
| "rewards/format_reward": 1.0, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.14192962646484, | |
| "epoch": 8.5625, | |
| "grad_norm": 3.264026183321021, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1.6041666666666668e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8919553756713867, | |
| "reward_std": 0.02204691618680954, | |
| "rewards/accuracy_reward": 0.8932574987411499, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 403 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.13542175292969, | |
| "epoch": 8.583333333333334, | |
| "grad_norm": 2.309073599276998, | |
| "kl": 0.04443359375, | |
| "learning_rate": 1.583333333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9184863567352295, | |
| "reward_std": 0.016750024631619453, | |
| "rewards/accuracy_reward": 0.918486475944519, | |
| "rewards/format_reward": 1.0, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 110.02604675292969, | |
| "epoch": 8.604166666666666, | |
| "grad_norm": 1.920449334945207, | |
| "kl": 0.046875, | |
| "learning_rate": 1.5624999999999999e-07, | |
| "loss": 0.0019, | |
| "reward": 1.8996871709823608, | |
| "reward_std": 0.019097616896033287, | |
| "rewards/accuracy_reward": 0.8996869921684265, | |
| "rewards/format_reward": 1.0, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.38671875, | |
| "epoch": 8.625, | |
| "grad_norm": 6.108556482978297, | |
| "kl": 0.047607421875, | |
| "learning_rate": 1.5416666666666666e-07, | |
| "loss": 0.002, | |
| "reward": 1.8946789503097534, | |
| "reward_std": 0.021756049245595932, | |
| "rewards/accuracy_reward": 0.8946789503097534, | |
| "rewards/format_reward": 1.0, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.61589050292969, | |
| "epoch": 8.645833333333334, | |
| "grad_norm": 1.4312273043036572, | |
| "kl": 0.04833984375, | |
| "learning_rate": 1.5208333333333332e-07, | |
| "loss": 0.002, | |
| "reward": 1.930631399154663, | |
| "reward_std": 0.019629666581749916, | |
| "rewards/accuracy_reward": 0.9319334030151367, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 407 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.0546875, | |
| "epoch": 8.666666666666666, | |
| "grad_norm": 2.0186934249559645, | |
| "kl": 0.046142578125, | |
| "learning_rate": 1.5e-07, | |
| "loss": 0.0019, | |
| "reward": 1.910029411315918, | |
| "reward_std": 0.01880134642124176, | |
| "rewards/accuracy_reward": 0.9100292921066284, | |
| "rewards/format_reward": 1.0, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.23958587646484, | |
| "epoch": 8.6875, | |
| "grad_norm": 2.388178370024915, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1.4791666666666667e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9040935039520264, | |
| "reward_std": 0.023898255079984665, | |
| "rewards/accuracy_reward": 0.9040936231613159, | |
| "rewards/format_reward": 1.0, | |
| "step": 409 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.42317962646484, | |
| "epoch": 8.708333333333334, | |
| "grad_norm": 3.09323894478763, | |
| "kl": 0.045166015625, | |
| "learning_rate": 1.4583333333333335e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9278287887573242, | |
| "reward_std": 0.01652323268353939, | |
| "rewards/accuracy_reward": 0.9278289675712585, | |
| "rewards/format_reward": 1.0, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 111.14714050292969, | |
| "epoch": 8.729166666666666, | |
| "grad_norm": 8.95075477806658, | |
| "kl": 0.046875, | |
| "learning_rate": 1.4374999999999997e-07, | |
| "loss": 0.002, | |
| "reward": 1.9176443815231323, | |
| "reward_std": 0.020662346854805946, | |
| "rewards/accuracy_reward": 0.9202485084533691, | |
| "rewards/format_reward": 0.9973958730697632, | |
| "step": 411 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.69140625, | |
| "epoch": 8.75, | |
| "grad_norm": 14.290195535948639, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1.4166666666666665e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9256818294525146, | |
| "reward_std": 0.01680697686970234, | |
| "rewards/accuracy_reward": 0.9256815910339355, | |
| "rewards/format_reward": 1.0, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.74089050292969, | |
| "epoch": 8.770833333333334, | |
| "grad_norm": 3.382470297102888, | |
| "kl": 0.04833984375, | |
| "learning_rate": 1.3958333333333333e-07, | |
| "loss": 0.002, | |
| "reward": 1.9325473308563232, | |
| "reward_std": 0.015486609190702438, | |
| "rewards/accuracy_reward": 0.9325472116470337, | |
| "rewards/format_reward": 1.0, | |
| "step": 413 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.37890625, | |
| "epoch": 8.791666666666666, | |
| "grad_norm": 1.5498505485516252, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1.375e-07, | |
| "loss": 0.002, | |
| "reward": 1.9133939743041992, | |
| "reward_std": 0.019088715314865112, | |
| "rewards/accuracy_reward": 0.9133939743041992, | |
| "rewards/format_reward": 1.0, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.13151550292969, | |
| "epoch": 8.8125, | |
| "grad_norm": 2.1982092482674496, | |
| "kl": 0.047119140625, | |
| "learning_rate": 1.3541666666666666e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9054036140441895, | |
| "reward_std": 0.023469921201467514, | |
| "rewards/accuracy_reward": 0.9067057371139526, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.67578125, | |
| "epoch": 8.833333333333334, | |
| "grad_norm": 2.5337968426277158, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1.3333333333333334e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9113550186157227, | |
| "reward_std": 0.017122842371463776, | |
| "rewards/accuracy_reward": 0.9113550186157227, | |
| "rewards/format_reward": 1.0, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.7421875, | |
| "epoch": 8.854166666666666, | |
| "grad_norm": 2.957658114504722, | |
| "kl": 0.049072265625, | |
| "learning_rate": 1.3125e-07, | |
| "loss": 0.002, | |
| "reward": 1.9356334209442139, | |
| "reward_std": 0.01688789203763008, | |
| "rewards/accuracy_reward": 0.9356333613395691, | |
| "rewards/format_reward": 1.0, | |
| "step": 417 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.45442962646484, | |
| "epoch": 8.875, | |
| "grad_norm": 3.414030918508783, | |
| "kl": 0.048095703125, | |
| "learning_rate": 1.2916666666666667e-07, | |
| "loss": 0.002, | |
| "reward": 1.9134725332260132, | |
| "reward_std": 0.01754312589764595, | |
| "rewards/accuracy_reward": 0.9134725332260132, | |
| "rewards/format_reward": 1.0, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.20833587646484, | |
| "epoch": 8.895833333333334, | |
| "grad_norm": 3.3858814535237918, | |
| "kl": 0.04931640625, | |
| "learning_rate": 1.2708333333333332e-07, | |
| "loss": 0.002, | |
| "reward": 1.8799127340316772, | |
| "reward_std": 0.0196706410497427, | |
| "rewards/accuracy_reward": 0.8799127340316772, | |
| "rewards/format_reward": 1.0, | |
| "step": 419 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.65885925292969, | |
| "epoch": 8.916666666666666, | |
| "grad_norm": 4.177315577481865, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1.25e-07, | |
| "loss": 0.002, | |
| "reward": 1.9059925079345703, | |
| "reward_std": 0.017467858269810677, | |
| "rewards/accuracy_reward": 0.9059926867485046, | |
| "rewards/format_reward": 1.0, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.44010925292969, | |
| "epoch": 8.9375, | |
| "grad_norm": 1.8879259079304014, | |
| "kl": 0.045654296875, | |
| "learning_rate": 1.2291666666666665e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9416245222091675, | |
| "reward_std": 0.014588016085326672, | |
| "rewards/accuracy_reward": 0.9416245818138123, | |
| "rewards/format_reward": 1.0, | |
| "step": 421 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.62370300292969, | |
| "epoch": 8.958333333333334, | |
| "grad_norm": 2.993225929254523, | |
| "kl": 0.053955078125, | |
| "learning_rate": 1.2083333333333332e-07, | |
| "loss": 0.0023, | |
| "reward": 1.9129104614257812, | |
| "reward_std": 0.018522052094340324, | |
| "rewards/accuracy_reward": 0.9129105806350708, | |
| "rewards/format_reward": 1.0, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.63018035888672, | |
| "epoch": 8.979166666666666, | |
| "grad_norm": 1.868194038764531, | |
| "kl": 0.05908203125, | |
| "learning_rate": 1.1874999999999999e-07, | |
| "loss": 0.0024, | |
| "reward": 1.8931113481521606, | |
| "reward_std": 0.019662605598568916, | |
| "rewards/accuracy_reward": 0.8931112885475159, | |
| "rewards/format_reward": 1.0, | |
| "step": 423 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.67708587646484, | |
| "epoch": 9.020833333333334, | |
| "grad_norm": 4.908308927568047, | |
| "kl": 0.053955078125, | |
| "learning_rate": 1.1666666666666667e-07, | |
| "loss": 0.0022, | |
| "reward": 1.9184527397155762, | |
| "reward_std": 0.022821567952632904, | |
| "rewards/accuracy_reward": 0.9197548627853394, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.43229675292969, | |
| "epoch": 9.041666666666666, | |
| "grad_norm": 2.0304888444214884, | |
| "kl": 0.047119140625, | |
| "learning_rate": 1.1458333333333332e-07, | |
| "loss": 0.002, | |
| "reward": 1.9061381816864014, | |
| "reward_std": 0.017391815781593323, | |
| "rewards/accuracy_reward": 0.9061381220817566, | |
| "rewards/format_reward": 1.0, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.78385925292969, | |
| "epoch": 9.0625, | |
| "grad_norm": 4.941122982616481, | |
| "kl": 0.05029296875, | |
| "learning_rate": 1.125e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9194796085357666, | |
| "reward_std": 0.01858523301780224, | |
| "rewards/accuracy_reward": 0.9194795489311218, | |
| "rewards/format_reward": 1.0, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.11198425292969, | |
| "epoch": 9.083333333333334, | |
| "grad_norm": 1.5041501011347416, | |
| "kl": 0.055419921875, | |
| "learning_rate": 1.1041666666666666e-07, | |
| "loss": 0.0023, | |
| "reward": 1.926844596862793, | |
| "reward_std": 0.017708610743284225, | |
| "rewards/accuracy_reward": 0.926844596862793, | |
| "rewards/format_reward": 1.0, | |
| "step": 427 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 104.85546875, | |
| "epoch": 9.104166666666666, | |
| "grad_norm": 5.314885280931172, | |
| "kl": 0.0498046875, | |
| "learning_rate": 1.0833333333333334e-07, | |
| "loss": 0.0021, | |
| "reward": 1.9273128509521484, | |
| "reward_std": 0.01629455015063286, | |
| "rewards/accuracy_reward": 0.927312970161438, | |
| "rewards/format_reward": 1.0, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.00651550292969, | |
| "epoch": 9.125, | |
| "grad_norm": 1.6763167113014938, | |
| "kl": 0.059326171875, | |
| "learning_rate": 1.0624999999999999e-07, | |
| "loss": 0.0024, | |
| "reward": 1.9027307033538818, | |
| "reward_std": 0.023923706263303757, | |
| "rewards/accuracy_reward": 0.9040327072143555, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 429 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.30729675292969, | |
| "epoch": 9.145833333333334, | |
| "grad_norm": 2.331935816678969, | |
| "kl": 0.046875, | |
| "learning_rate": 1.0416666666666667e-07, | |
| "loss": 0.002, | |
| "reward": 1.914282202720642, | |
| "reward_std": 0.01946648769080639, | |
| "rewards/accuracy_reward": 0.9142822027206421, | |
| "rewards/format_reward": 1.0, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.20833587646484, | |
| "epoch": 9.166666666666666, | |
| "grad_norm": 2.7235761953830515, | |
| "kl": 0.046875, | |
| "learning_rate": 1.0208333333333333e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9132049083709717, | |
| "reward_std": 0.018830081447958946, | |
| "rewards/accuracy_reward": 0.9132048487663269, | |
| "rewards/format_reward": 1.0, | |
| "step": 431 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.61328125, | |
| "epoch": 9.1875, | |
| "grad_norm": 5.239801858902832, | |
| "kl": 0.045166015625, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0019, | |
| "reward": 1.9167590141296387, | |
| "reward_std": 0.01652991585433483, | |
| "rewards/accuracy_reward": 0.9167590141296387, | |
| "rewards/format_reward": 1.0, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.30989837646484, | |
| "epoch": 9.208333333333334, | |
| "grad_norm": 2.0958692476606453, | |
| "kl": 0.047607421875, | |
| "learning_rate": 9.791666666666666e-08, | |
| "loss": 0.002, | |
| "reward": 1.9000282287597656, | |
| "reward_std": 0.017917610704898834, | |
| "rewards/accuracy_reward": 0.9000282287597656, | |
| "rewards/format_reward": 1.0, | |
| "step": 433 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.13021087646484, | |
| "epoch": 9.229166666666666, | |
| "grad_norm": 3.688730357890573, | |
| "kl": 0.047119140625, | |
| "learning_rate": 9.583333333333334e-08, | |
| "loss": 0.002, | |
| "reward": 1.915367841720581, | |
| "reward_std": 0.01575218327343464, | |
| "rewards/accuracy_reward": 0.915367841720581, | |
| "rewards/format_reward": 1.0, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.29817962646484, | |
| "epoch": 9.25, | |
| "grad_norm": 4.777726769533225, | |
| "kl": 0.0498046875, | |
| "learning_rate": 9.375e-08, | |
| "loss": 0.002, | |
| "reward": 1.9215753078460693, | |
| "reward_std": 0.018969135358929634, | |
| "rewards/accuracy_reward": 0.9215752482414246, | |
| "rewards/format_reward": 1.0, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.67578125, | |
| "epoch": 9.270833333333334, | |
| "grad_norm": 3.166832895506651, | |
| "kl": 0.1455078125, | |
| "learning_rate": 9.166666666666665e-08, | |
| "loss": 0.0059, | |
| "reward": 1.8940961360931396, | |
| "reward_std": 0.022732451558113098, | |
| "rewards/accuracy_reward": 0.8940961360931396, | |
| "rewards/format_reward": 1.0, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 106.04427337646484, | |
| "epoch": 9.291666666666666, | |
| "grad_norm": 7.685144828516677, | |
| "kl": 0.053466796875, | |
| "learning_rate": 8.958333333333333e-08, | |
| "loss": 0.0022, | |
| "reward": 1.9240249395370483, | |
| "reward_std": 0.020782217383384705, | |
| "rewards/accuracy_reward": 0.9253270030021667, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 437 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.99609375, | |
| "epoch": 9.3125, | |
| "grad_norm": 2.378060228855226, | |
| "kl": 0.047119140625, | |
| "learning_rate": 8.75e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9038584232330322, | |
| "reward_std": 0.018619615584611893, | |
| "rewards/accuracy_reward": 0.9038585424423218, | |
| "rewards/format_reward": 1.0, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.53515625, | |
| "epoch": 9.333333333333334, | |
| "grad_norm": 1.7692084618793151, | |
| "kl": 0.04150390625, | |
| "learning_rate": 8.541666666666666e-08, | |
| "loss": 0.0018, | |
| "reward": 1.9091638326644897, | |
| "reward_std": 0.022435273975133896, | |
| "rewards/accuracy_reward": 0.9104660749435425, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 439 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.88542175292969, | |
| "epoch": 9.354166666666666, | |
| "grad_norm": 1.9024254121008084, | |
| "kl": 0.05615234375, | |
| "learning_rate": 8.333333333333333e-08, | |
| "loss": 0.0023, | |
| "reward": 1.9238141775131226, | |
| "reward_std": 0.022633202373981476, | |
| "rewards/accuracy_reward": 0.9251161813735962, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.61589050292969, | |
| "epoch": 9.375, | |
| "grad_norm": 1.6956193954619285, | |
| "kl": 0.047119140625, | |
| "learning_rate": 8.125e-08, | |
| "loss": 0.002, | |
| "reward": 1.9281830787658691, | |
| "reward_std": 0.016372021287679672, | |
| "rewards/accuracy_reward": 0.9281830787658691, | |
| "rewards/format_reward": 1.0, | |
| "step": 441 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 105.97786712646484, | |
| "epoch": 9.395833333333334, | |
| "grad_norm": 1.7595623551426245, | |
| "kl": 0.046142578125, | |
| "learning_rate": 7.916666666666665e-08, | |
| "loss": 0.0019, | |
| "reward": 1.8898437023162842, | |
| "reward_std": 0.020021602511405945, | |
| "rewards/accuracy_reward": 0.8898436427116394, | |
| "rewards/format_reward": 1.0, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.08464050292969, | |
| "epoch": 9.416666666666666, | |
| "grad_norm": 2.5345231096237626, | |
| "kl": 0.045654296875, | |
| "learning_rate": 7.708333333333333e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9146665334701538, | |
| "reward_std": 0.018744416534900665, | |
| "rewards/accuracy_reward": 0.9146665334701538, | |
| "rewards/format_reward": 1.0, | |
| "step": 443 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.6875, | |
| "epoch": 9.4375, | |
| "grad_norm": 3.953395223073277, | |
| "kl": 0.05224609375, | |
| "learning_rate": 7.5e-08, | |
| "loss": 0.0022, | |
| "reward": 1.901958703994751, | |
| "reward_std": 0.02426784299314022, | |
| "rewards/accuracy_reward": 0.9032607078552246, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.35026550292969, | |
| "epoch": 9.458333333333334, | |
| "grad_norm": 2.9548245945269294, | |
| "kl": 0.046875, | |
| "learning_rate": 7.291666666666667e-08, | |
| "loss": 0.0019, | |
| "reward": 1.9318149089813232, | |
| "reward_std": 0.01830216310918331, | |
| "rewards/accuracy_reward": 0.9318150281906128, | |
| "rewards/format_reward": 1.0, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.83203125, | |
| "epoch": 9.479166666666666, | |
| "grad_norm": 2.485294511659845, | |
| "kl": 0.050048828125, | |
| "learning_rate": 7.083333333333333e-08, | |
| "loss": 0.0021, | |
| "reward": 1.94561767578125, | |
| "reward_std": 0.014377261511981487, | |
| "rewards/accuracy_reward": 0.9456178545951843, | |
| "rewards/format_reward": 1.0, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.19921875, | |
| "epoch": 9.5, | |
| "grad_norm": 3.3111964897684425, | |
| "kl": 0.072265625, | |
| "learning_rate": 6.875e-08, | |
| "loss": 0.003, | |
| "reward": 1.932363510131836, | |
| "reward_std": 0.02322392538189888, | |
| "rewards/accuracy_reward": 0.9336656332015991, | |
| "rewards/format_reward": 0.9986979365348816, | |
| "step": 447 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 108.52214050292969, | |
| "epoch": 9.520833333333334, | |
| "grad_norm": 1.8991735989887744, | |
| "kl": 0.04541015625, | |
| "learning_rate": 6.666666666666667e-08, | |
| "loss": 0.002, | |
| "reward": 1.928739070892334, | |
| "reward_std": 0.017306815832853317, | |
| "rewards/accuracy_reward": 0.928739070892334, | |
| "rewards/format_reward": 1.0, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 107.90104675292969, | |
| "epoch": 9.541666666666666, | |
| "grad_norm": 2.0677038589369747, | |
| "kl": 0.0478515625, | |
| "learning_rate": 6.458333333333333e-08, | |
| "loss": 0.002, | |
| "reward": 1.9004323482513428, | |
| "reward_std": 0.018483035266399384, | |
| "rewards/accuracy_reward": 0.900432288646698, | |
| "rewards/format_reward": 1.0, | |
| "step": 449 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 109.74479675292969, | |
| "epoch": 9.5625, | |
| "grad_norm": 2.4393059130616748, | |
| "kl": 0.04736328125, | |
| "learning_rate": 6.25e-08, | |
| "loss": 0.002, | |
| "reward": 1.9109472036361694, | |
| "reward_std": 0.01986522786319256, | |
| "rewards/accuracy_reward": 0.9109472036361694, | |
| "rewards/format_reward": 1.0, | |
| "step": 450 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 480, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 25, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 48, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |