| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1644.166748046875, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.20607953518495117, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": 0.0022, | |
| "reward": -0.1127668060362339, | |
| "reward_std": 0.20213491283357143, | |
| "rewards/cosine_scaled_reward": -0.18138340720906854, | |
| "rewards/format_reward": 0.25, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1656.791748046875, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.31679714617652144, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0623, | |
| "reward": -0.05582176148891449, | |
| "reward_std": 0.6275629922747612, | |
| "rewards/cosine_scaled_reward": -0.19457754865288734, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1606.7500610351562, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.2789602147805501, | |
| "kl": 3.388524055480957e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0376, | |
| "reward": -0.2583192214369774, | |
| "reward_std": 0.2636854462325573, | |
| "rewards/cosine_scaled_reward": -0.222909614443779, | |
| "rewards/format_reward": 0.1875000074505806, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1690.6250610351562, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.27232938747073254, | |
| "kl": 4.017353057861328e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0159, | |
| "reward": -0.40017254278063774, | |
| "reward_std": 0.17111004143953323, | |
| "rewards/cosine_scaled_reward": -0.3146696165204048, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1618.3541870117188, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.2939867481096334, | |
| "kl": 2.8431415557861328e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0576, | |
| "reward": 0.13743871822953224, | |
| "reward_std": 0.7271581590175629, | |
| "rewards/cosine_scaled_reward": -0.12919731251895428, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1629.4791870117188, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.248871735331751, | |
| "kl": 3.477931022644043e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": -0.0029, | |
| "reward": -0.029103130102157593, | |
| "reward_std": 0.5708433166146278, | |
| "rewards/cosine_scaled_reward": -0.1708015874028206, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1490.6458740234375, | |
| "epoch": 0.008, | |
| "grad_norm": 0.22790937530079167, | |
| "kl": 3.007054328918457e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0903, | |
| "reward": 0.12145921215415001, | |
| "reward_std": 0.5416159555315971, | |
| "rewards/cosine_scaled_reward": -0.10593708232045174, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1683.5000305175781, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.20752077742039396, | |
| "kl": 4.646182060241699e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0277, | |
| "reward": -0.23692437633872032, | |
| "reward_std": 0.4620281979441643, | |
| "rewards/cosine_scaled_reward": -0.2747122012078762, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1719.2292175292969, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.2983323511333683, | |
| "kl": 4.1991472244262695e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0511, | |
| "reward": -0.31221747025847435, | |
| "reward_std": 0.21310735493898392, | |
| "rewards/cosine_scaled_reward": -0.24985874257981777, | |
| "rewards/format_reward": 0.1875000074505806, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1477.2083740234375, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.23645082786220448, | |
| "kl": 3.116577863693237e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0495, | |
| "reward": 0.37697479128837585, | |
| "reward_std": 0.44906593672931194, | |
| "rewards/cosine_scaled_reward": -0.05109592713415623, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1508.8958587646484, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.339825377520832, | |
| "kl": 2.8848648071289062e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0535, | |
| "reward": -0.13005081936717033, | |
| "reward_std": 0.6173823103308678, | |
| "rewards/cosine_scaled_reward": -0.2525254301726818, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1631.1041870117188, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.20658630326267732, | |
| "kl": 3.084540367126465e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0635, | |
| "reward": 0.03064786270260811, | |
| "reward_std": 0.4376446008682251, | |
| "rewards/cosine_scaled_reward": -0.1513427309691906, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1422.604232788086, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.23614097630983502, | |
| "kl": 2.527981996536255e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": -0.0306, | |
| "reward": 0.4512472003698349, | |
| "reward_std": 0.40983884781599045, | |
| "rewards/cosine_scaled_reward": -0.02437640482094139, | |
| "rewards/format_reward": 0.5, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1652.3542175292969, | |
| "epoch": 0.016, | |
| "grad_norm": 0.2206408502680819, | |
| "kl": 3.93986701965332e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0059, | |
| "reward": -0.2542928569018841, | |
| "reward_std": 0.17246506363153458, | |
| "rewards/cosine_scaled_reward": -0.26256311126053333, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1679.229248046875, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.2314183406404789, | |
| "kl": 4.3898820877075195e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0053, | |
| "reward": -0.258657343685627, | |
| "reward_std": 0.23606499657034874, | |
| "rewards/cosine_scaled_reward": -0.1918286692816764, | |
| "rewards/format_reward": 0.125, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1396.7917175292969, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.25436941656143647, | |
| "kl": 2.3171305656433105e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.1053, | |
| "reward": 0.20216324925422668, | |
| "reward_std": 0.4999893419444561, | |
| "rewards/cosine_scaled_reward": -0.13850171491503716, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1719.416748046875, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.23312894299622924, | |
| "kl": 4.0084123611450195e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": -0.0007, | |
| "reward": -0.41149570792913437, | |
| "reward_std": 0.13166083209216595, | |
| "rewards/cosine_scaled_reward": -0.26824783720076084, | |
| "rewards/format_reward": 0.125, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1686.0833740234375, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.24676487462788851, | |
| "kl": 4.7713518142700195e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0814, | |
| "reward": -0.32610235549509525, | |
| "reward_std": 0.23402154073119164, | |
| "rewards/cosine_scaled_reward": -0.25680116564035416, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1773.6458740234375, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.21561964662639843, | |
| "kl": 2.1457672119140625e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0164, | |
| "reward": -0.5961569249629974, | |
| "reward_std": 0.1714775264263153, | |
| "rewards/cosine_scaled_reward": -0.3501618057489395, | |
| "rewards/format_reward": 0.10416666977107525, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1529.3125610351562, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.251130340260543, | |
| "kl": 3.24249267578125e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0293, | |
| "reward": -0.048260755836963654, | |
| "reward_std": 0.34835576079785824, | |
| "rewards/cosine_scaled_reward": -0.20121371746063232, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1494.6250305175781, | |
| "epoch": 0.024, | |
| "grad_norm": 0.3018968569179871, | |
| "kl": 2.6673078536987305e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0278, | |
| "reward": 0.021329142153263092, | |
| "reward_std": 0.45257429778575897, | |
| "rewards/cosine_scaled_reward": -0.15600210055708885, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1778.5625610351562, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.29253387654098556, | |
| "kl": 3.1888484954833984e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0494, | |
| "reward": -0.5034094974398613, | |
| "reward_std": 0.3080843798816204, | |
| "rewards/cosine_scaled_reward": -0.29337141662836075, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1762.8958740234375, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.21053978305274443, | |
| "kl": 4.506111145019531e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0144, | |
| "reward": -0.028878159821033478, | |
| "reward_std": 0.5564102046191692, | |
| "rewards/cosine_scaled_reward": -0.10818908177316189, | |
| "rewards/format_reward": 0.1875000074505806, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1352.5625305175781, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.20202450012624545, | |
| "kl": 1.6548670828342438e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0005, | |
| "reward": 0.6555859744548798, | |
| "reward_std": 0.47822858951985836, | |
| "rewards/cosine_scaled_reward": 0.06737629324197769, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1597.1875610351562, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.4327230812041704, | |
| "kl": 3.0606985092163086e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0701, | |
| "reward": 0.05484675616025925, | |
| "reward_std": 0.6329891942441463, | |
| "rewards/cosine_scaled_reward": -0.11840994283556938, | |
| "rewards/format_reward": 0.29166667722165585, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1647.916748046875, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.21123992049117873, | |
| "kl": 2.2917985916137695e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.031, | |
| "reward": -0.24321994185447693, | |
| "reward_std": 0.12097731977701187, | |
| "rewards/cosine_scaled_reward": -0.18410997837781906, | |
| "rewards/format_reward": 0.125, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1638.8958740234375, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.21745088219923464, | |
| "kl": 3.2067298889160156e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": -0.0097, | |
| "reward": -0.3657397888600826, | |
| "reward_std": 0.24539830163121223, | |
| "rewards/cosine_scaled_reward": -0.2974532376974821, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1711.2709045410156, | |
| "epoch": 0.032, | |
| "grad_norm": 0.2552233664551883, | |
| "kl": 2.8468668460845947e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0256, | |
| "reward": -0.38710537925362587, | |
| "reward_std": 0.2530311979353428, | |
| "rewards/cosine_scaled_reward": -0.2768860347568989, | |
| "rewards/format_reward": 0.1666666716337204, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1713.8125610351562, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.202249350617508, | |
| "kl": 2.86102294921875e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0135, | |
| "reward": -0.1931730881333351, | |
| "reward_std": 0.5632064789533615, | |
| "rewards/cosine_scaled_reward": -0.20075321290642023, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1732.291748046875, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.23328556356102392, | |
| "kl": 2.165883779525757e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0564, | |
| "reward": -0.3746844604611397, | |
| "reward_std": 0.34011659026145935, | |
| "rewards/cosine_scaled_reward": -0.24984224140644073, | |
| "rewards/format_reward": 0.12500000186264515, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1445.3125305175781, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.30643607095324277, | |
| "kl": 3.966689109802246e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0923, | |
| "reward": -0.09436208941042423, | |
| "reward_std": 0.3265727870166302, | |
| "rewards/cosine_scaled_reward": -0.21384770551230758, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1810.7917175292969, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.20484433233713875, | |
| "kl": 2.8021633625030518e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0202, | |
| "reward": -0.5034667998552322, | |
| "reward_std": 0.15860500000417233, | |
| "rewards/cosine_scaled_reward": -0.2621500678360462, | |
| "rewards/format_reward": 0.02083333395421505, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1750.9584045410156, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.2027434434467969, | |
| "kl": 2.5600194931030273e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": -0.0171, | |
| "reward": -0.25296103954315186, | |
| "reward_std": 0.4817052260041237, | |
| "rewards/cosine_scaled_reward": -0.2514805067330599, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1634.8333740234375, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.23764579059557195, | |
| "kl": 2.331659197807312e-05, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0003, | |
| "reward": -0.3657361939549446, | |
| "reward_std": 0.2039697989821434, | |
| "rewards/cosine_scaled_reward": -0.25578476674854755, | |
| "rewards/format_reward": 0.14583333395421505, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1691.1875610351562, | |
| "epoch": 0.04, | |
| "grad_norm": 0.2390715088796384, | |
| "kl": 1.8522143363952637e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0579, | |
| "reward": -0.1916074175387621, | |
| "reward_std": 0.40257398039102554, | |
| "rewards/cosine_scaled_reward": -0.23122038505971432, | |
| "rewards/format_reward": 0.27083334885537624, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1526.2292175292969, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.2361249356185026, | |
| "kl": 3.781914710998535e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0401, | |
| "reward": 0.35939645767211914, | |
| "reward_std": 0.39011720940470695, | |
| "rewards/cosine_scaled_reward": -0.01821846514940262, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1645.7708740234375, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.26864783041008133, | |
| "kl": 3.820657730102539e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0746, | |
| "reward": -0.2870800420641899, | |
| "reward_std": 0.46812814101576805, | |
| "rewards/cosine_scaled_reward": -0.25812335684895515, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1722.5000610351562, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.27664066975056834, | |
| "kl": 5.131959915161133e-05, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0586, | |
| "reward": -0.15014038234949112, | |
| "reward_std": 0.4126087427139282, | |
| "rewards/cosine_scaled_reward": -0.2000702191144228, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1678.7083740234375, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.3003829192682386, | |
| "kl": 4.968792200088501e-05, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.097, | |
| "reward": -0.21257384680211544, | |
| "reward_std": 0.48539142310619354, | |
| "rewards/cosine_scaled_reward": -0.2312869280576706, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1690.8958740234375, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.20909108511646457, | |
| "kl": 5.0902366638183594e-05, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0436, | |
| "reward": -0.5045258924365044, | |
| "reward_std": 0.2920587807893753, | |
| "rewards/cosine_scaled_reward": -0.3564296290278435, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1806.3334045410156, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.2168555566166619, | |
| "kl": 3.137439489364624e-05, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": -0.0012, | |
| "reward": 0.04771171510219574, | |
| "reward_std": 0.33250839821994305, | |
| "rewards/cosine_scaled_reward": -0.06989414617419243, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.6250457763672, | |
| "epoch": 0.048, | |
| "grad_norm": 0.40542845209419376, | |
| "kl": 0.000291675329208374, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0768, | |
| "reward": 0.27488730661571026, | |
| "reward_std": 0.45710677094757557, | |
| "rewards/cosine_scaled_reward": -0.1646396858850494, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1705.8750610351562, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.21842925663095267, | |
| "kl": 3.538280725479126e-05, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0308, | |
| "reward": -0.2755163535475731, | |
| "reward_std": 0.3637393806129694, | |
| "rewards/cosine_scaled_reward": -0.2210915139876306, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1665.0625305175781, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.26271417694787236, | |
| "kl": 0.00046503543853759766, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.073, | |
| "reward": -0.12092901021242142, | |
| "reward_std": 0.5556337833404541, | |
| "rewards/cosine_scaled_reward": -0.17504783952608705, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1733.2084045410156, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.21285192669515357, | |
| "kl": 5.0537288188934326e-05, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0423, | |
| "reward": -0.05799056589603424, | |
| "reward_std": 0.4342048391699791, | |
| "rewards/cosine_scaled_reward": -0.14357861876487732, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1640.0834045410156, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.2622293688477209, | |
| "kl": 0.00013068318367004395, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0317, | |
| "reward": -0.005384169518947601, | |
| "reward_std": 0.3068407401442528, | |
| "rewards/cosine_scaled_reward": -0.1068587563931942, | |
| "rewards/format_reward": 0.20833333395421505, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1498.8333892822266, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.274608905827555, | |
| "kl": 0.0001885145902633667, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.049, | |
| "reward": -0.002073638141155243, | |
| "reward_std": 0.4514222964644432, | |
| "rewards/cosine_scaled_reward": -0.17812015302479267, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1610.4792175292969, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.24771930467103717, | |
| "kl": 0.00015616416931152344, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0334, | |
| "reward": -0.22091616783291101, | |
| "reward_std": 0.33334225323051214, | |
| "rewards/cosine_scaled_reward": -0.21462474018335342, | |
| "rewards/format_reward": 0.20833334140479565, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1341.1458740234375, | |
| "epoch": 0.056, | |
| "grad_norm": 0.3710205417665813, | |
| "kl": 0.00029793381690979004, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0862, | |
| "reward": 0.40674951672554016, | |
| "reward_std": 0.5115297809243202, | |
| "rewards/cosine_scaled_reward": -0.025791920721530914, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1335.1667175292969, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.3034272517231627, | |
| "kl": 0.0005925297737121582, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1036, | |
| "reward": 0.36978277564048767, | |
| "reward_std": 0.4990865057334304, | |
| "rewards/cosine_scaled_reward": -0.033858626149594784, | |
| "rewards/format_reward": 0.43750002048909664, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1686.8959045410156, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.3009121706411098, | |
| "kl": 0.00032591819763183594, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0864, | |
| "reward": -0.20582207757979631, | |
| "reward_std": 0.5198994930833578, | |
| "rewards/cosine_scaled_reward": -0.19666103832423687, | |
| "rewards/format_reward": 0.1875000111758709, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1718.2291870117188, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.21311754620957382, | |
| "kl": 0.0005127787590026855, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0262, | |
| "reward": -0.39756081253290176, | |
| "reward_std": 0.34694093093276024, | |
| "rewards/cosine_scaled_reward": -0.2716970667243004, | |
| "rewards/format_reward": 0.1458333358168602, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1611.8334045410156, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.22683388578373892, | |
| "kl": 0.0005531832575798035, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.068, | |
| "reward": -0.13391486555337906, | |
| "reward_std": 0.27848392724990845, | |
| "rewards/cosine_scaled_reward": -0.22320742718875408, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1442.0834045410156, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.24769106962876689, | |
| "kl": 0.0002713203430175781, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0911, | |
| "reward": -0.11875106766819954, | |
| "reward_std": 0.1542784534394741, | |
| "rewards/cosine_scaled_reward": -0.2572922073304653, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1688.4167175292969, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.22851815885942953, | |
| "kl": 0.0001881718635559082, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0068, | |
| "reward": -0.3640219047665596, | |
| "reward_std": 0.2585913948714733, | |
| "rewards/cosine_scaled_reward": -0.2965943031013012, | |
| "rewards/format_reward": 0.2291666753590107, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1569.4166870117188, | |
| "epoch": 0.064, | |
| "grad_norm": 0.2466081306910316, | |
| "kl": 0.0021448135375976562, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.096, | |
| "reward": -0.4589140391908586, | |
| "reward_std": 0.4320836700499058, | |
| "rewards/cosine_scaled_reward": -0.3440403640270233, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1629.979248046875, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.22573731739546327, | |
| "kl": 0.0010238885879516602, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0592, | |
| "reward": -0.3061641752719879, | |
| "reward_std": 0.5002065226435661, | |
| "rewards/cosine_scaled_reward": -0.26766542345285416, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1660.4792175292969, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.22190381637143303, | |
| "kl": 0.0011049509048461914, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.022, | |
| "reward": -0.32173825055360794, | |
| "reward_std": 0.27725364826619625, | |
| "rewards/cosine_scaled_reward": -0.2754524536430836, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1690.0417175292969, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.21914617585966853, | |
| "kl": 0.0010164976119995117, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0444, | |
| "reward": -0.021609768271446228, | |
| "reward_std": 0.3677750062197447, | |
| "rewards/cosine_scaled_reward": -0.135804895311594, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1581.6875305175781, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.4016735260144472, | |
| "kl": 0.01423954963684082, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0192, | |
| "reward": 0.11502109467983246, | |
| "reward_std": 0.29630398005247116, | |
| "rewards/cosine_scaled_reward": -0.057072801515460014, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1475.5833740234375, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.24285848407581584, | |
| "kl": 0.0003628730773925781, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0731, | |
| "reward": 0.5937481597065926, | |
| "reward_std": 0.6881431620568037, | |
| "rewards/cosine_scaled_reward": 0.046874068677425385, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1805.7500610351562, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.19714468440546948, | |
| "kl": 0.0005519390106201172, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0172, | |
| "reward": -0.4636555463075638, | |
| "reward_std": 0.3160466430708766, | |
| "rewards/cosine_scaled_reward": -0.2734944522380829, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1329.2917175292969, | |
| "epoch": 0.072, | |
| "grad_norm": 0.28510447078335305, | |
| "kl": 0.004929542541503906, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.1079, | |
| "reward": 0.30475724674761295, | |
| "reward_std": 0.4675188772380352, | |
| "rewards/cosine_scaled_reward": -0.0976213626563549, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1636.5208740234375, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.20815660806735267, | |
| "kl": 0.0003604888916015625, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0625, | |
| "reward": 0.29327625688165426, | |
| "reward_std": 0.5610844530165195, | |
| "rewards/cosine_scaled_reward": -0.03044520819094032, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1559.5000305175781, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.24172417943995111, | |
| "kl": 0.001363515853881836, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0995, | |
| "reward": 0.1283707581460476, | |
| "reward_std": 0.7667413726449013, | |
| "rewards/cosine_scaled_reward": -0.13373128045350313, | |
| "rewards/format_reward": 0.3958333507180214, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1729.6667175292969, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.20090852438136195, | |
| "kl": 0.00067138671875, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0209, | |
| "reward": -0.39017004892230034, | |
| "reward_std": 0.32542612217366695, | |
| "rewards/cosine_scaled_reward": -0.3200850263237953, | |
| "rewards/format_reward": 0.2500000149011612, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1648.7292175292969, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.18795555019652113, | |
| "kl": 0.0007681846618652344, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0342, | |
| "reward": -0.1792638599872589, | |
| "reward_std": 0.3578680492937565, | |
| "rewards/cosine_scaled_reward": -0.20421527326107025, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1388.5625610351562, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.3904259482407812, | |
| "kl": 0.00202178955078125, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0637, | |
| "reward": 0.16577239707112312, | |
| "reward_std": 0.3421984985470772, | |
| "rewards/cosine_scaled_reward": -0.09419714100658894, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1507.8333740234375, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.2361059164440503, | |
| "kl": 0.0008258819580078125, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0613, | |
| "reward": 0.17160904966294765, | |
| "reward_std": 0.38275655917823315, | |
| "rewards/cosine_scaled_reward": -0.10169548355042934, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1690.3750305175781, | |
| "epoch": 0.08, | |
| "grad_norm": 0.19302606573391104, | |
| "kl": 0.002358675003051758, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.105, | |
| "reward": -0.1555338129401207, | |
| "reward_std": 0.37855083122849464, | |
| "rewards/cosine_scaled_reward": -0.20276692137122154, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1441.729232788086, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.331702227116139, | |
| "kl": 0.0023870468139648438, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.1388, | |
| "reward": -0.2453744667582214, | |
| "reward_std": 0.15839526243507862, | |
| "rewards/cosine_scaled_reward": -0.3101872429251671, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1497.3959045410156, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.33894190686830156, | |
| "kl": 0.0017808079719543457, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0553, | |
| "reward": 0.09824148565530777, | |
| "reward_std": 0.1729265321046114, | |
| "rewards/cosine_scaled_reward": -0.08629592880606651, | |
| "rewards/format_reward": 0.2708333358168602, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1444.7708892822266, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.9254159035231885, | |
| "kl": 0.039752960205078125, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.1025, | |
| "reward": 0.47389062121510506, | |
| "reward_std": 0.7162522077560425, | |
| "rewards/cosine_scaled_reward": -0.05472135776653886, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1484.7083740234375, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.2164345231616129, | |
| "kl": 0.0021944046020507812, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0129, | |
| "reward": -0.06718481332063675, | |
| "reward_std": 0.16878989525139332, | |
| "rewards/cosine_scaled_reward": -0.22109240666031837, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1526.0417175292969, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.3075410122107456, | |
| "kl": 0.00359344482421875, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0753, | |
| "reward": 0.17093585059046745, | |
| "reward_std": 0.4688509330153465, | |
| "rewards/cosine_scaled_reward": -0.08119874075055122, | |
| "rewards/format_reward": 0.33333334513008595, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1640.4583740234375, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.20492660661291412, | |
| "kl": 0.00046312808990478516, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0184, | |
| "reward": 0.029385031666606665, | |
| "reward_std": 0.6126945875585079, | |
| "rewards/cosine_scaled_reward": -0.151974156498909, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1674.5625610351562, | |
| "epoch": 0.088, | |
| "grad_norm": 0.21980728108796918, | |
| "kl": 0.0009822845458984375, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": -0.0002, | |
| "reward": -0.18806731700897217, | |
| "reward_std": 0.12730432488024235, | |
| "rewards/cosine_scaled_reward": -0.15653366968035698, | |
| "rewards/format_reward": 0.125, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1518.0625610351562, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.242785552217566, | |
| "kl": 0.0009822845458984375, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0619, | |
| "reward": 0.13657424598932266, | |
| "reward_std": 0.4360465779900551, | |
| "rewards/cosine_scaled_reward": -0.10879619419574738, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1575.4792175292969, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.24080955526978698, | |
| "kl": 0.0005426406860351562, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0519, | |
| "reward": 0.016203314065933228, | |
| "reward_std": 0.6479124575853348, | |
| "rewards/cosine_scaled_reward": -0.1585650178603828, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1733.9167175292969, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.2186002750502081, | |
| "kl": 0.0005044937133789062, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.031, | |
| "reward": -0.5251612327992916, | |
| "reward_std": 0.40141166001558304, | |
| "rewards/cosine_scaled_reward": -0.33549728989601135, | |
| "rewards/format_reward": 0.1458333358168602, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1728.479248046875, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.21399417944679958, | |
| "kl": 0.0009112358093261719, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0374, | |
| "reward": -0.19506264757364988, | |
| "reward_std": 0.48094464652240276, | |
| "rewards/cosine_scaled_reward": -0.1912813438102603, | |
| "rewards/format_reward": 0.18750000558793545, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1601.9792175292969, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.2450961734236274, | |
| "kl": 0.0009531974792480469, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.077, | |
| "reward": -0.1917775571346283, | |
| "reward_std": 0.5255400985479355, | |
| "rewards/cosine_scaled_reward": -0.25213877484202385, | |
| "rewards/format_reward": 0.31250001303851604, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1748.1875610351562, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.22448680749018862, | |
| "kl": 0.0004420280456542969, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0159, | |
| "reward": -0.43924427404999733, | |
| "reward_std": 0.2609596960246563, | |
| "rewards/cosine_scaled_reward": -0.27170546911656857, | |
| "rewards/format_reward": 0.10416666977107525, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1515.7708435058594, | |
| "epoch": 0.096, | |
| "grad_norm": 0.2231038243696207, | |
| "kl": 0.0006551742553710938, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0246, | |
| "reward": 0.36620646342635155, | |
| "reward_std": 0.884237602353096, | |
| "rewards/cosine_scaled_reward": -0.06689677853137255, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1701.2083740234375, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.20906161676384463, | |
| "kl": 0.000804901123046875, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0555, | |
| "reward": -0.39954638853669167, | |
| "reward_std": 0.31576116755604744, | |
| "rewards/cosine_scaled_reward": -0.2726898640394211, | |
| "rewards/format_reward": 0.14583333395421505, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1610.9792175292969, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.22100681278056383, | |
| "kl": 0.0009822845458984375, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0444, | |
| "reward": -0.24343110900372267, | |
| "reward_std": 0.2885846998542547, | |
| "rewards/cosine_scaled_reward": -0.30921556800603867, | |
| "rewards/format_reward": 0.375, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1695.354248046875, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.24683069440334848, | |
| "kl": 0.0029506683349609375, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0377, | |
| "reward": -0.09222975745797157, | |
| "reward_std": 0.24668438732624054, | |
| "rewards/cosine_scaled_reward": -0.1502815391868353, | |
| "rewards/format_reward": 0.2083333432674408, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1594.9167175292969, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.27215086328931853, | |
| "kl": 0.0016989707946777344, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.1075, | |
| "reward": 0.18186672404408455, | |
| "reward_std": 0.9013341814279556, | |
| "rewards/cosine_scaled_reward": -0.07573332265019417, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1738.7292175292969, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.1946900134085172, | |
| "kl": 0.000820159912109375, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0331, | |
| "reward": -0.28752805292606354, | |
| "reward_std": 0.4243736080825329, | |
| "rewards/cosine_scaled_reward": -0.22709737345576286, | |
| "rewards/format_reward": 0.16666667722165585, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1572.2916870117188, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.20694868118264276, | |
| "kl": 0.0007328987121582031, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0753, | |
| "reward": -0.08595774043351412, | |
| "reward_std": 0.5348180644214153, | |
| "rewards/cosine_scaled_reward": -0.18881220323964953, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1601.5625610351562, | |
| "epoch": 0.104, | |
| "grad_norm": 0.20840771038907893, | |
| "kl": 0.0007948875427246094, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0808, | |
| "reward": -0.015035435557365417, | |
| "reward_std": 0.14022575318813324, | |
| "rewards/cosine_scaled_reward": -0.1429343856871128, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1498.2292175292969, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.20771988001872319, | |
| "kl": 0.0009174346923828125, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0994, | |
| "reward": 0.07728531863540411, | |
| "reward_std": 0.508693166077137, | |
| "rewards/cosine_scaled_reward": -0.1384406816214323, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1347.8125305175781, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.27527082284418775, | |
| "kl": 0.0021848678588867188, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0296, | |
| "reward": 0.30088429898023605, | |
| "reward_std": 0.5643313899636269, | |
| "rewards/cosine_scaled_reward": -0.10997452400624752, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1733.2500610351562, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.23935442867120157, | |
| "kl": 0.0012607574462890625, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.021, | |
| "reward": -0.34041892923414707, | |
| "reward_std": 0.2469240017235279, | |
| "rewards/cosine_scaled_reward": -0.26395946741104126, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1738.5625610351562, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.21273217079983556, | |
| "kl": 0.0006814002990722656, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": -0.0093, | |
| "reward": -0.5389137789607048, | |
| "reward_std": 0.17841140553355217, | |
| "rewards/cosine_scaled_reward": -0.3423735648393631, | |
| "rewards/format_reward": 0.14583333395421505, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1433.9375610351562, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.30691056711732384, | |
| "kl": 0.002574920654296875, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.137, | |
| "reward": 0.296867486089468, | |
| "reward_std": 0.3943296894431114, | |
| "rewards/cosine_scaled_reward": -0.04948292672634125, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1567.2500610351562, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.25051085956589897, | |
| "kl": 0.0013968944549560547, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0267, | |
| "reward": -0.15386457741260529, | |
| "reward_std": 0.37108149379491806, | |
| "rewards/cosine_scaled_reward": -0.21234895661473274, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1406.0208740234375, | |
| "epoch": 0.112, | |
| "grad_norm": 0.366560785041491, | |
| "kl": 0.0012578964233398438, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.099, | |
| "reward": 0.3372333124279976, | |
| "reward_std": 0.3852754198014736, | |
| "rewards/cosine_scaled_reward": -0.12305000983178616, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1598.7917175292969, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.2584279138871096, | |
| "kl": 0.0010881423950195312, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0536, | |
| "reward": 0.1023973822593689, | |
| "reward_std": 0.4502338841557503, | |
| "rewards/cosine_scaled_reward": -0.1258846465498209, | |
| "rewards/format_reward": 0.3541666828095913, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1557.479248046875, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.23713752727518134, | |
| "kl": 0.0009851455688476562, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0694, | |
| "reward": -0.15063253417611122, | |
| "reward_std": 0.3854830376803875, | |
| "rewards/cosine_scaled_reward": -0.23156626150012016, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1387.4583435058594, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.32157411791816565, | |
| "kl": 0.001094818115234375, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.1116, | |
| "reward": 0.07011325657367706, | |
| "reward_std": 0.3243808038532734, | |
| "rewards/cosine_scaled_reward": -0.19411004893481731, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1449.3750610351562, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.2168599934302549, | |
| "kl": 0.0015411376953125, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0577, | |
| "reward": -0.21096567437052727, | |
| "reward_std": 0.29599858447909355, | |
| "rewards/cosine_scaled_reward": -0.3138161562383175, | |
| "rewards/format_reward": 0.4166666865348816, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1715.166748046875, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.21920178674297372, | |
| "kl": 0.0015869140625, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0667, | |
| "reward": -0.18699942529201508, | |
| "reward_std": 0.5092732682824135, | |
| "rewards/cosine_scaled_reward": -0.22891639173030853, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1304.4583435058594, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.22942484314958453, | |
| "kl": 0.0013804435729980469, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0839, | |
| "reward": 0.5173723250627518, | |
| "reward_std": 0.5176322646439075, | |
| "rewards/cosine_scaled_reward": -0.001730518415570259, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1364.8333740234375, | |
| "epoch": 0.12, | |
| "grad_norm": 0.25403433256650454, | |
| "kl": 0.0016727447509765625, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.1156, | |
| "reward": 0.28816052433103323, | |
| "reward_std": 0.240465197712183, | |
| "rewards/cosine_scaled_reward": -0.1267530769109726, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1570.6667175292969, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.2462172191203138, | |
| "kl": 0.0020122528076171875, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0866, | |
| "reward": 0.34020555624738336, | |
| "reward_std": 0.7328735627233982, | |
| "rewards/cosine_scaled_reward": -0.038230573292821646, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1243.4583740234375, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.22392855280151888, | |
| "kl": 0.001399993896484375, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0861, | |
| "reward": 0.19801579043269157, | |
| "reward_std": 0.4772573560476303, | |
| "rewards/cosine_scaled_reward": -0.18224211037158966, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1376.5625610351562, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.2328882803373465, | |
| "kl": 0.0032482147216796875, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0636, | |
| "reward": 0.6495321169495583, | |
| "reward_std": 0.5899618566036224, | |
| "rewards/cosine_scaled_reward": 0.06434935945435427, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1368.0625305175781, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.3696050391986309, | |
| "kl": 0.0028667449951171875, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.1246, | |
| "reward": -0.031360091641545296, | |
| "reward_std": 0.4002140313386917, | |
| "rewards/cosine_scaled_reward": -0.2656800393015146, | |
| "rewards/format_reward": 0.5, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1444.6666870117188, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.35213532577859125, | |
| "kl": 0.0029430389404296875, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.1339, | |
| "reward": 0.6942434869706631, | |
| "reward_std": 0.9198908805847168, | |
| "rewards/cosine_scaled_reward": 0.06587174534797668, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1072.7708587646484, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.2985726423715741, | |
| "kl": 0.001979827880859375, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0476, | |
| "reward": 0.7408694333862513, | |
| "reward_std": 0.7333548963069916, | |
| "rewards/cosine_scaled_reward": -0.004565277136862278, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1633.4167175292969, | |
| "epoch": 0.128, | |
| "grad_norm": 0.22471101395696397, | |
| "kl": 0.00258636474609375, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0346, | |
| "reward": -0.05079384706914425, | |
| "reward_std": 0.4366183038800955, | |
| "rewards/cosine_scaled_reward": -0.2337302602827549, | |
| "rewards/format_reward": 0.4166666865348816, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1319.7291870117188, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.27063696127291986, | |
| "kl": 0.0033721923828125, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0833, | |
| "reward": 0.6321319434791803, | |
| "reward_std": 0.5336715504527092, | |
| "rewards/cosine_scaled_reward": -0.006850697100162506, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1052.3958892822266, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.250125198289797, | |
| "kl": 0.0016460418701171875, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0768, | |
| "reward": 0.653087726328522, | |
| "reward_std": 0.35864404030144215, | |
| "rewards/cosine_scaled_reward": -0.017206139862537384, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1440.2083740234375, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.29266585256345196, | |
| "kl": 0.0030155181884765625, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0755, | |
| "reward": 0.21958831325173378, | |
| "reward_std": 0.704796127974987, | |
| "rewards/cosine_scaled_reward": -0.16103917988948524, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1472.5416870117188, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.26433038131357134, | |
| "kl": 0.003147125244140625, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.073, | |
| "reward": 0.018861573189496994, | |
| "reward_std": 0.3587416708469391, | |
| "rewards/cosine_scaled_reward": -0.18848587945103645, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1545.9375305175781, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.21836493727001577, | |
| "kl": 0.002864837646484375, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.1366, | |
| "reward": -0.32600877061486244, | |
| "reward_std": 0.43822694569826126, | |
| "rewards/cosine_scaled_reward": -0.35050439089536667, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1334.1250305175781, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.24394321780710398, | |
| "kl": 0.0026493072509765625, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.035, | |
| "reward": 0.457018606364727, | |
| "reward_std": 0.5285698734223843, | |
| "rewards/cosine_scaled_reward": -0.09440736100077629, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1220.3333740234375, | |
| "epoch": 0.136, | |
| "grad_norm": 0.28272459137828676, | |
| "kl": 0.0042877197265625, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0804, | |
| "reward": 0.3442453145980835, | |
| "reward_std": 0.564174473285675, | |
| "rewards/cosine_scaled_reward": -0.12996070086956024, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1528.6042175292969, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.2668307726658885, | |
| "kl": 0.00232696533203125, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.1032, | |
| "reward": -0.03986197151243687, | |
| "reward_std": 0.37811761628836393, | |
| "rewards/cosine_scaled_reward": -0.2282643192447722, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1584.8125610351562, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.22786468100552407, | |
| "kl": 0.002208709716796875, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0046, | |
| "reward": 0.16309459879994392, | |
| "reward_std": 0.2453223168849945, | |
| "rewards/cosine_scaled_reward": -0.1372026912868023, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1462.6875610351562, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.28816738889821486, | |
| "kl": 0.00514984130859375, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0974, | |
| "reward": -0.12114270869642496, | |
| "reward_std": 0.2534109205007553, | |
| "rewards/cosine_scaled_reward": -0.2689046934247017, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1375.7708740234375, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.32101258824146217, | |
| "kl": 0.0041351318359375, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.1284, | |
| "reward": 0.18988706171512604, | |
| "reward_std": 0.8535008877515793, | |
| "rewards/cosine_scaled_reward": -0.17588980495929718, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1374.7292175292969, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.2425349865258595, | |
| "kl": 0.00324249267578125, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0382, | |
| "reward": 0.07038946449756622, | |
| "reward_std": 0.49846766516566277, | |
| "rewards/cosine_scaled_reward": -0.15230527985841036, | |
| "rewards/format_reward": 0.375, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1664.8541870117188, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.2457250240943947, | |
| "kl": 0.0025730133056640625, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0291, | |
| "reward": 0.004289238480851054, | |
| "reward_std": 0.32331261597573757, | |
| "rewards/cosine_scaled_reward": -0.15410537272691727, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1422.3750305175781, | |
| "epoch": 0.144, | |
| "grad_norm": 0.32285843347583837, | |
| "kl": 0.005126953125, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0961, | |
| "reward": 0.19516459852457047, | |
| "reward_std": 0.6147220581769943, | |
| "rewards/cosine_scaled_reward": -0.162834367249161, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1370.7708435058594, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.24341515642410516, | |
| "kl": 0.0030078887939453125, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0726, | |
| "reward": -0.08839717879891396, | |
| "reward_std": 0.4017263073474169, | |
| "rewards/cosine_scaled_reward": -0.2941986061632633, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1219.9167175292969, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.2623858416818109, | |
| "kl": 0.004070281982421875, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0537, | |
| "reward": 0.43044765666127205, | |
| "reward_std": 0.49690980464220047, | |
| "rewards/cosine_scaled_reward": -0.15977618098258972, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1109.1875305175781, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.2829401049059584, | |
| "kl": 0.00757598876953125, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0692, | |
| "reward": 0.6423492059111595, | |
| "reward_std": 0.4438105970621109, | |
| "rewards/cosine_scaled_reward": -0.03299206681549549, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1495.8333740234375, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.23104014201895975, | |
| "kl": 0.00299835205078125, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0064, | |
| "reward": -0.09923176001757383, | |
| "reward_std": 0.43960002437233925, | |
| "rewards/cosine_scaled_reward": -0.29961589351296425, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1543.2500305175781, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.22132261730116032, | |
| "kl": 0.0033931732177734375, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.1051, | |
| "reward": -0.012558471411466599, | |
| "reward_std": 0.5053001046180725, | |
| "rewards/cosine_scaled_reward": -0.24586258456110954, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1477.0625, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.2442588816427236, | |
| "kl": 0.004528045654296875, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0768, | |
| "reward": -0.11025669425725937, | |
| "reward_std": 0.18197684548795223, | |
| "rewards/cosine_scaled_reward": -0.284295029938221, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1563.3958740234375, | |
| "epoch": 0.152, | |
| "grad_norm": 0.21023108591248665, | |
| "kl": 0.00415802001953125, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0672, | |
| "reward": 0.13176406361162663, | |
| "reward_std": 0.5022407323122025, | |
| "rewards/cosine_scaled_reward": -0.18411797285079956, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1049.1042022705078, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.3838039161390532, | |
| "kl": 0.00562286376953125, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.1973, | |
| "reward": 0.4749515192816034, | |
| "reward_std": 0.3580738380551338, | |
| "rewards/cosine_scaled_reward": -0.15835759788751602, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1351.5416870117188, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.34500799880157473, | |
| "kl": 0.00400543212890625, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.143, | |
| "reward": 0.2647483544424176, | |
| "reward_std": 0.5427017770707607, | |
| "rewards/cosine_scaled_reward": -0.11762583442032337, | |
| "rewards/format_reward": 0.5000000204890966, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1168.0625305175781, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.31218899888892226, | |
| "kl": 0.004955291748046875, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0562, | |
| "reward": 0.3584494572132826, | |
| "reward_std": 0.5529016815125942, | |
| "rewards/cosine_scaled_reward": -0.17494194395840168, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1282.0416870117188, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.2721613126225875, | |
| "kl": 0.007869720458984375, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.1358, | |
| "reward": 0.2924184873700142, | |
| "reward_std": 0.5777250528335571, | |
| "rewards/cosine_scaled_reward": -0.16629073955118656, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1041.8958740234375, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.30890354701331296, | |
| "kl": 0.00521087646484375, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0338, | |
| "reward": 0.860385000705719, | |
| "reward_std": 0.8024220168590546, | |
| "rewards/cosine_scaled_reward": 0.023942476138472557, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1192.8125457763672, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.2622844914783918, | |
| "kl": 0.00412750244140625, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0825, | |
| "reward": 0.5425689108669758, | |
| "reward_std": 0.5253265127539635, | |
| "rewards/cosine_scaled_reward": -0.12454888969659805, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1283.4375305175781, | |
| "epoch": 0.16, | |
| "grad_norm": 0.24897560413424463, | |
| "kl": 0.004375457763671875, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.1075, | |
| "reward": 0.3927510902285576, | |
| "reward_std": 0.43108681961894035, | |
| "rewards/cosine_scaled_reward": -0.1577911265194416, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1262.7916717529297, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.3772239136734691, | |
| "kl": 0.005344390869140625, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.1713, | |
| "reward": 0.37745123356580734, | |
| "reward_std": 0.5623941943049431, | |
| "rewards/cosine_scaled_reward": -0.13419108092784882, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1246.6042022705078, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.28965855619789826, | |
| "kl": 0.0045166015625, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0725, | |
| "reward": 0.5083264335989952, | |
| "reward_std": 0.5853047892451286, | |
| "rewards/cosine_scaled_reward": -0.047920111566782, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1448.9375610351562, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.2549900151123108, | |
| "kl": 0.006290435791015625, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.1142, | |
| "reward": 0.13985165720805526, | |
| "reward_std": 0.2659877985715866, | |
| "rewards/cosine_scaled_reward": -0.20090750604867935, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1203.2083740234375, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.2224971436424363, | |
| "kl": 0.005550384521484375, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.085, | |
| "reward": 0.5334329381585121, | |
| "reward_std": 0.5584629252552986, | |
| "rewards/cosine_scaled_reward": -0.10828354395925999, | |
| "rewards/format_reward": 0.75, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1086.9792175292969, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.3813865927902241, | |
| "kl": 0.0063629150390625, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.1879, | |
| "reward": 0.2875216994434595, | |
| "reward_std": 0.5303685888648033, | |
| "rewards/cosine_scaled_reward": -0.23123916238546371, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1285.5625610351562, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.2972743546494519, | |
| "kl": 0.0059814453125, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0594, | |
| "reward": 0.2565866466611624, | |
| "reward_std": 0.46598899737000465, | |
| "rewards/cosine_scaled_reward": -0.25712333619594574, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 896.6250305175781, | |
| "epoch": 0.168, | |
| "grad_norm": 0.32647401434979056, | |
| "kl": 0.006877899169921875, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": -0.0011, | |
| "reward": 1.0985181145370007, | |
| "reward_std": 0.5338096916675568, | |
| "rewards/cosine_scaled_reward": 0.05967570189386606, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1160.9583435058594, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.2816274273158885, | |
| "kl": 0.00585174560546875, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0442, | |
| "reward": 0.18387611024081707, | |
| "reward_std": 0.2959946282207966, | |
| "rewards/cosine_scaled_reward": -0.3143119588494301, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1223.9791870117188, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.2823488259457116, | |
| "kl": 0.00612640380859375, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0654, | |
| "reward": 0.47756416723132133, | |
| "reward_std": 0.7413289695978165, | |
| "rewards/cosine_scaled_reward": -0.11538459919393063, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 986.6042022705078, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.318064277562745, | |
| "kl": 0.0080413818359375, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.1332, | |
| "reward": 0.401881605386734, | |
| "reward_std": 0.6674076840281487, | |
| "rewards/cosine_scaled_reward": -0.20530920289456844, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1111.4375610351562, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.23750964874516883, | |
| "kl": 0.005405426025390625, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0547, | |
| "reward": 0.42157261446118355, | |
| "reward_std": 0.2637167125940323, | |
| "rewards/cosine_scaled_reward": -0.14338038116693497, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1390.2292175292969, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.3108688575018839, | |
| "kl": 0.008697509765625, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.1077, | |
| "reward": 0.11867762915790081, | |
| "reward_std": 0.5801703371107578, | |
| "rewards/cosine_scaled_reward": -0.2739945203065872, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1189.4792022705078, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.22859697466435477, | |
| "kl": 0.006011962890625, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0402, | |
| "reward": 0.46854234486818314, | |
| "reward_std": 0.5257667489349842, | |
| "rewards/cosine_scaled_reward": -0.07822884852066636, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1227.7292175292969, | |
| "epoch": 0.176, | |
| "grad_norm": 0.23458511838935112, | |
| "kl": 0.0063323974609375, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.129, | |
| "reward": 0.7308447554241866, | |
| "reward_std": 0.4724605940282345, | |
| "rewards/cosine_scaled_reward": 0.011255700141191483, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1320.6666870117188, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.29059316598505575, | |
| "kl": 0.007198333740234375, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.1327, | |
| "reward": -0.1417454145848751, | |
| "reward_std": 0.3702365458011627, | |
| "rewards/cosine_scaled_reward": -0.3521227166056633, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1116.000015258789, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.37926874198201693, | |
| "kl": 0.00821685791015625, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.2189, | |
| "reward": 0.15536441165022552, | |
| "reward_std": 0.2769140414893627, | |
| "rewards/cosine_scaled_reward": -0.2869011387228966, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1049.5000305175781, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.3728612044799926, | |
| "kl": 0.02156829833984375, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0389, | |
| "reward": 0.7612650550436229, | |
| "reward_std": 0.31401624344289303, | |
| "rewards/cosine_scaled_reward": 0.005632489919662476, | |
| "rewards/format_reward": 0.75, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1386.0833740234375, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.3889317879381384, | |
| "kl": 0.00850677490234375, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.1308, | |
| "reward": 0.06261628679931164, | |
| "reward_std": 0.3530626520514488, | |
| "rewards/cosine_scaled_reward": -0.3124418593943119, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1129.7708740234375, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.3220977926530576, | |
| "kl": 0.00748443603515625, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.1262, | |
| "reward": 0.6195714063942432, | |
| "reward_std": 0.6696993261575699, | |
| "rewards/cosine_scaled_reward": -0.08604763355106115, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1160.4167175292969, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.2845576646765754, | |
| "kl": 0.0069122314453125, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.1064, | |
| "reward": 0.6454856535419822, | |
| "reward_std": 0.8377318382263184, | |
| "rewards/cosine_scaled_reward": -0.08350718393921852, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1429.4792175292969, | |
| "epoch": 0.184, | |
| "grad_norm": 0.25219633802451885, | |
| "kl": 0.00858306884765625, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0965, | |
| "reward": 0.009432412683963776, | |
| "reward_std": 0.3042390923947096, | |
| "rewards/cosine_scaled_reward": -0.27653381787240505, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1226.7917175292969, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.26347106975524837, | |
| "kl": 0.0080108642578125, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.077, | |
| "reward": 0.24512136541306973, | |
| "reward_std": 0.43705228716135025, | |
| "rewards/cosine_scaled_reward": -0.29410600662231445, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 926.1250305175781, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.41739039022115654, | |
| "kl": 0.0112457275390625, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.1827, | |
| "reward": 0.7628292813897133, | |
| "reward_std": 0.8151352852582932, | |
| "rewards/cosine_scaled_reward": -0.04566871002316475, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 946.125, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.36967841595429546, | |
| "kl": 0.01031494140625, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.1834, | |
| "reward": 0.6057916302233934, | |
| "reward_std": 0.48515384271740913, | |
| "rewards/cosine_scaled_reward": -0.10335419327020645, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1174.4792175292969, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.2708332867444507, | |
| "kl": 0.00788116455078125, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0394, | |
| "reward": 0.32264771312475204, | |
| "reward_std": 0.4833778813481331, | |
| "rewards/cosine_scaled_reward": -0.24492615275084972, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 907.0833740234375, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.3261002575687217, | |
| "kl": 0.00717926025390625, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.1283, | |
| "reward": 0.6173169314861298, | |
| "reward_std": 0.2740478292107582, | |
| "rewards/cosine_scaled_reward": -0.1705082282423973, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1270.0417175292969, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.3129560409827591, | |
| "kl": 0.00971221923828125, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.1093, | |
| "reward": 0.3518100567162037, | |
| "reward_std": 0.5595069229602814, | |
| "rewards/cosine_scaled_reward": -0.1470116525888443, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1077.1041870117188, | |
| "epoch": 0.192, | |
| "grad_norm": 0.26915582005747507, | |
| "kl": 0.0078125, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.1617, | |
| "reward": 0.2642595246434212, | |
| "reward_std": 0.46994560211896896, | |
| "rewards/cosine_scaled_reward": -0.26370356790721416, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1156.2500305175781, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.35785552736378773, | |
| "kl": 0.0098724365234375, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0922, | |
| "reward": 0.77548947930336, | |
| "reward_std": 0.7726699560880661, | |
| "rewards/cosine_scaled_reward": 0.0023280568420886993, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1073.7292175292969, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.32755955253118335, | |
| "kl": 0.0117034912109375, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0752, | |
| "reward": 0.19202834740281105, | |
| "reward_std": 0.3850276917219162, | |
| "rewards/cosine_scaled_reward": -0.3206525072455406, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 918.8333511352539, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.3616914993674, | |
| "kl": 0.00833892822265625, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0904, | |
| "reward": 0.5478162653744221, | |
| "reward_std": 0.6629246398806572, | |
| "rewards/cosine_scaled_reward": -0.1948418878018856, | |
| "rewards/format_reward": 0.9375, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1036.1458740234375, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.3354400822116869, | |
| "kl": 0.0130157470703125, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0669, | |
| "reward": 0.7608658275566995, | |
| "reward_std": 0.6014236621558666, | |
| "rewards/cosine_scaled_reward": -0.04665040969848633, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1106.4583740234375, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.3236947350770136, | |
| "kl": 0.0121307373046875, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.093, | |
| "reward": 0.7088564559817314, | |
| "reward_std": 0.4235651511698961, | |
| "rewards/cosine_scaled_reward": -0.010155089199543, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1109.5000305175781, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.37244639543702895, | |
| "kl": 0.015838623046875, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.1098, | |
| "reward": 0.17886048182845116, | |
| "reward_std": 0.35543810576200485, | |
| "rewards/cosine_scaled_reward": -0.30640310421586037, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 972.9167022705078, | |
| "epoch": 0.2, | |
| "grad_norm": 0.6554774460546362, | |
| "kl": 0.0153045654296875, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.2068, | |
| "reward": 0.607050247490406, | |
| "reward_std": 0.4396999180316925, | |
| "rewards/cosine_scaled_reward": -0.14439154416322708, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 998.2291870117188, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.28748166515655293, | |
| "kl": 0.0133819580078125, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0371, | |
| "reward": 0.486224377527833, | |
| "reward_std": 0.6124172061681747, | |
| "rewards/cosine_scaled_reward": -0.17355448007583618, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 916.2291870117188, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.4438177799902679, | |
| "kl": 0.0131378173828125, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.1487, | |
| "reward": 0.8074519336223602, | |
| "reward_std": 0.4988584369421005, | |
| "rewards/cosine_scaled_reward": -0.05460738018155098, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 822.8333740234375, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.5173286289503403, | |
| "kl": 0.0179443359375, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.1961, | |
| "reward": 1.0362385213375092, | |
| "reward_std": 0.5397170335054398, | |
| "rewards/cosine_scaled_reward": 0.13270257785916328, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 959.9792022705078, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.369107073779179, | |
| "kl": 0.016815185546875, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.1417, | |
| "reward": 0.6556574255228043, | |
| "reward_std": 0.4815560430288315, | |
| "rewards/cosine_scaled_reward": -0.10967130470089614, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1162.7292022705078, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.5036563993456736, | |
| "kl": 0.01904296875, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0949, | |
| "reward": 0.2779462654143572, | |
| "reward_std": 0.4615231901407242, | |
| "rewards/cosine_scaled_reward": -0.24644354078918695, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1078.8333435058594, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.4317948665990577, | |
| "kl": 0.0135955810546875, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.1473, | |
| "reward": 0.6264736168086529, | |
| "reward_std": 0.5298948585987091, | |
| "rewards/cosine_scaled_reward": -0.10342983156442642, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1065.8125305175781, | |
| "epoch": 0.208, | |
| "grad_norm": 0.5168299485262725, | |
| "kl": 0.02105712890625, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.1884, | |
| "reward": 0.3882112614810467, | |
| "reward_std": 0.5859006345272064, | |
| "rewards/cosine_scaled_reward": -0.2017277143895626, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1069.4583740234375, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.5024855038579699, | |
| "kl": 0.0205078125, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.1323, | |
| "reward": 0.24412129819393158, | |
| "reward_std": 0.47408775985240936, | |
| "rewards/cosine_scaled_reward": -0.2529393620789051, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 936.4792022705078, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.4981488833418968, | |
| "kl": 0.017730712890625, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.137, | |
| "reward": 0.6930912919342518, | |
| "reward_std": 0.5617035925388336, | |
| "rewards/cosine_scaled_reward": -0.03887102263979614, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1030.4166717529297, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.4904295101939947, | |
| "kl": 0.0301513671875, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.1667, | |
| "reward": 0.07037857547402382, | |
| "reward_std": 0.27715054154396057, | |
| "rewards/cosine_scaled_reward": -0.33981072157621384, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1297.3750457763672, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.359329704495533, | |
| "kl": 0.02447509765625, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0722, | |
| "reward": 0.27740756422281265, | |
| "reward_std": 0.35020239651203156, | |
| "rewards/cosine_scaled_reward": -0.20504622161388397, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1278.2291870117188, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.6229091446373484, | |
| "kl": 0.037841796875, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.159, | |
| "reward": 0.6862413678318262, | |
| "reward_std": 0.806188240647316, | |
| "rewards/cosine_scaled_reward": -0.011045984923839569, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 828.7083587646484, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.8396211982213951, | |
| "kl": 0.029296875, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.1705, | |
| "reward": 0.5941705331206322, | |
| "reward_std": 0.6708386167883873, | |
| "rewards/cosine_scaled_reward": -0.12999806739389896, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1126.9583740234375, | |
| "epoch": 0.216, | |
| "grad_norm": 1.0692586435721545, | |
| "kl": 0.05059814453125, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.2306, | |
| "reward": 0.3716874085366726, | |
| "reward_std": 0.6852569133043289, | |
| "rewards/cosine_scaled_reward": -0.17873962549492717, | |
| "rewards/format_reward": 0.7291666939854622, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1441.7292175292969, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.4556901372243305, | |
| "kl": 0.0775146484375, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0641, | |
| "reward": -0.02832420915365219, | |
| "reward_std": 0.41898399591445923, | |
| "rewards/cosine_scaled_reward": -0.21207877062261105, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1079.3541870117188, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.7752155732582218, | |
| "kl": 0.05340576171875, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.1862, | |
| "reward": 0.4970630258321762, | |
| "reward_std": 0.6355597376823425, | |
| "rewards/cosine_scaled_reward": -0.1264684833586216, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1067.5417175292969, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.9433921479755671, | |
| "kl": 0.0628662109375, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.1345, | |
| "reward": 0.34896004013717175, | |
| "reward_std": 0.44530968368053436, | |
| "rewards/cosine_scaled_reward": -0.19010332133620977, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1153.1458892822266, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.557299045473737, | |
| "kl": 0.07440185546875, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0623, | |
| "reward": 0.3937496952712536, | |
| "reward_std": 0.4528709352016449, | |
| "rewards/cosine_scaled_reward": -0.14687515422701836, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 772.7708435058594, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 1.0195572615380695, | |
| "kl": 0.03753662109375, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.1994, | |
| "reward": 1.379511073231697, | |
| "reward_std": 0.604660227894783, | |
| "rewards/cosine_scaled_reward": 0.23142218962311745, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1043.2708740234375, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.9603645520119819, | |
| "kl": 0.057830810546875, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.101, | |
| "reward": 0.9527463093400002, | |
| "reward_std": 0.651703879237175, | |
| "rewards/cosine_scaled_reward": 0.12220647558569908, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1171.8125305175781, | |
| "epoch": 0.224, | |
| "grad_norm": 1.0759043540199384, | |
| "kl": 0.094970703125, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0986, | |
| "reward": 0.22757766395807266, | |
| "reward_std": 0.5421559736132622, | |
| "rewards/cosine_scaled_reward": -0.14662783965468407, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1254.6458892822266, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 1.2281398548522355, | |
| "kl": 0.1163330078125, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.2334, | |
| "reward": 0.1120694987475872, | |
| "reward_std": 0.406834427267313, | |
| "rewards/cosine_scaled_reward": -0.21479860320687294, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1551.9792175292969, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 1.2807709220712407, | |
| "kl": 0.1573486328125, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.1642, | |
| "reward": 0.1520095318555832, | |
| "reward_std": 0.5469059012830257, | |
| "rewards/cosine_scaled_reward": -0.16357857827097178, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1243.7500457763672, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 1.2387930807523095, | |
| "kl": 0.1546630859375, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0664, | |
| "reward": 0.5908387266099453, | |
| "reward_std": 0.44286736100912094, | |
| "rewards/cosine_scaled_reward": 0.014169345609843731, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 998.3542022705078, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 1.6258231243608119, | |
| "kl": 0.146240234375, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.223, | |
| "reward": 0.9689896870404482, | |
| "reward_std": 0.6490836925804615, | |
| "rewards/cosine_scaled_reward": 0.10949480719864368, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1197.5208587646484, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 1.2117522808382983, | |
| "kl": 0.15203857421875, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.1108, | |
| "reward": 0.29535099118947983, | |
| "reward_std": 0.6659888252615929, | |
| "rewards/cosine_scaled_reward": -0.18565785279497504, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1035.2292022705078, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 2.430024645446878, | |
| "kl": 0.1729736328125, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.1341, | |
| "reward": 0.4362456016242504, | |
| "reward_std": 0.665816992521286, | |
| "rewards/cosine_scaled_reward": -0.13604386523365974, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1285.2083587646484, | |
| "epoch": 0.232, | |
| "grad_norm": 3.5252314114631926, | |
| "kl": 0.2603759765625, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.2005, | |
| "reward": 0.48519248701632023, | |
| "reward_std": 0.612464651465416, | |
| "rewards/cosine_scaled_reward": -0.08032042533159256, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 835.0833587646484, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 1.8381824332135883, | |
| "kl": 0.1859130859375, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.053, | |
| "reward": 1.2399137616157532, | |
| "reward_std": 0.6745168194174767, | |
| "rewards/cosine_scaled_reward": 0.2241235449910164, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1421.9583435058594, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 1.7567396533005133, | |
| "kl": 0.362548828125, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.1051, | |
| "reward": 0.3085259608924389, | |
| "reward_std": 0.6349210105836391, | |
| "rewards/cosine_scaled_reward": -0.08532036282122135, | |
| "rewards/format_reward": 0.4791666865348816, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1228.7083892822266, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 2.3389392066981562, | |
| "kl": 0.30615234375, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0693, | |
| "reward": 0.18148453161120415, | |
| "reward_std": 0.5284193530678749, | |
| "rewards/cosine_scaled_reward": -0.24259107932448387, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1070.4792022705078, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 3.543320557594463, | |
| "kl": 0.26416015625, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.1883, | |
| "reward": 0.542645301669836, | |
| "reward_std": 0.5379708558320999, | |
| "rewards/cosine_scaled_reward": -0.12451068125665188, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1317.2708740234375, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 1.9754558032385148, | |
| "kl": 0.6748046875, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.1365, | |
| "reward": 0.09549727046396583, | |
| "reward_std": 0.3623932749032974, | |
| "rewards/cosine_scaled_reward": -0.2126680426299572, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 919.9375457763672, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 3.305425458945869, | |
| "kl": 0.474609375, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0593, | |
| "reward": 0.7489641904830933, | |
| "reward_std": 0.4507276937365532, | |
| "rewards/cosine_scaled_reward": 0.030732073821127415, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 908.8958435058594, | |
| "epoch": 0.24, | |
| "grad_norm": 3.7173678494051496, | |
| "kl": 0.403564453125, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.1606, | |
| "reward": 0.7559212893247604, | |
| "reward_std": 0.5382421165704727, | |
| "rewards/cosine_scaled_reward": -0.007456040009856224, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1479.0416870117188, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 39.96198653082631, | |
| "kl": 2.5693359375, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.2707, | |
| "reward": 0.03475058265030384, | |
| "reward_std": 0.3246455695480108, | |
| "rewards/cosine_scaled_reward": -0.18054138123989105, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1051.0208587646484, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 3.2393755765757777, | |
| "kl": 0.53466796875, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0958, | |
| "reward": 0.6123923324048519, | |
| "reward_std": 0.5387515500187874, | |
| "rewards/cosine_scaled_reward": -0.04797050543129444, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1301.3750610351562, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 2.65733014184082, | |
| "kl": 0.755859375, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.1374, | |
| "reward": 0.1711240354925394, | |
| "reward_std": 0.42111407220363617, | |
| "rewards/cosine_scaled_reward": -0.16443797945976257, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1203.6875610351562, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 2.501952170306742, | |
| "kl": 0.50732421875, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.1105, | |
| "reward": 0.22278533224016428, | |
| "reward_std": 0.434869222342968, | |
| "rewards/cosine_scaled_reward": -0.22194067016243935, | |
| "rewards/format_reward": 0.6666667014360428, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1249.6667022705078, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 4.086485386572322, | |
| "kl": 0.880859375, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0173, | |
| "reward": 0.3316160347312689, | |
| "reward_std": 0.5279753059148788, | |
| "rewards/cosine_scaled_reward": -0.14669198356568813, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1369.1666870117188, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 3.328918162087878, | |
| "kl": 0.773193359375, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.1402, | |
| "reward": 0.2145287273451686, | |
| "reward_std": 0.5796768814325333, | |
| "rewards/cosine_scaled_reward": -0.16356897167861462, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1269.8542175292969, | |
| "epoch": 0.248, | |
| "grad_norm": 2.8333189883709515, | |
| "kl": 0.75927734375, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": -0.0115, | |
| "reward": 0.5310591869056225, | |
| "reward_std": 0.4825605973601341, | |
| "rewards/cosine_scaled_reward": -0.057387083768844604, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1252.2708740234375, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 4.762778702423241, | |
| "kl": 0.74072265625, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.1477, | |
| "reward": 0.5015929639339447, | |
| "reward_std": 0.3994259871542454, | |
| "rewards/cosine_scaled_reward": -0.07212021434679627, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.5416870117188, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 4.164501369060878, | |
| "kl": 1.07958984375, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.1337, | |
| "reward": 0.767455330118537, | |
| "reward_std": 0.5030167028307915, | |
| "rewards/cosine_scaled_reward": 0.07122766599059105, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1053.1875457763672, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 3.588996799420188, | |
| "kl": 0.71142578125, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.1866, | |
| "reward": 0.48609594255685806, | |
| "reward_std": 0.617650680243969, | |
| "rewards/cosine_scaled_reward": -0.11111870361492038, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1244.687515258789, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 2.946733537468475, | |
| "kl": 1.330078125, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.1268, | |
| "reward": 0.4570632018148899, | |
| "reward_std": 0.36856189370155334, | |
| "rewards/cosine_scaled_reward": -0.0006350576877593994, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1376.9167175292969, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 3.53418042013775, | |
| "kl": 1.1337890625, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.2362, | |
| "reward": 0.1362705221399665, | |
| "reward_std": 0.3934030085802078, | |
| "rewards/cosine_scaled_reward": -0.19228141009807587, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1156.750015258789, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 35.23833796360462, | |
| "kl": 2.369140625, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.1765, | |
| "reward": 0.6305188983678818, | |
| "reward_std": 0.5979669764637947, | |
| "rewards/cosine_scaled_reward": 0.023592765908688307, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1251.3125305175781, | |
| "epoch": 0.256, | |
| "grad_norm": 3.4418620220945138, | |
| "kl": 1.376953125, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.1723, | |
| "reward": 0.5104624545201659, | |
| "reward_std": 0.25178899243474007, | |
| "rewards/cosine_scaled_reward": -0.06768545880913734, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 924.3541870117188, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 6.348797231777103, | |
| "kl": 0.9375, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.1337, | |
| "reward": 0.7791457176208496, | |
| "reward_std": 0.7603946030139923, | |
| "rewards/cosine_scaled_reward": 0.07707285927608609, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1052.5000305175781, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 3.9386080018485288, | |
| "kl": 1.52734375, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.2459, | |
| "reward": 0.46499455720186234, | |
| "reward_std": 0.6090477257966995, | |
| "rewards/cosine_scaled_reward": -0.09041939489543438, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 892.1875152587891, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 3.4313724086317445, | |
| "kl": 1.125, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.1959, | |
| "reward": 0.5925753712654114, | |
| "reward_std": 0.8098603934049606, | |
| "rewards/cosine_scaled_reward": -0.04746231180615723, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1091.6458740234375, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 4.447647427267497, | |
| "kl": 1.66015625, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.1179, | |
| "reward": 0.24639339372515678, | |
| "reward_std": 0.48318010196089745, | |
| "rewards/cosine_scaled_reward": -0.17888664733618498, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 809.2916870117188, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 8.169532609480521, | |
| "kl": 2.046875, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.3082, | |
| "reward": 0.5617873594164848, | |
| "reward_std": 0.7489510700106621, | |
| "rewards/cosine_scaled_reward": -0.07327299565076828, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1110.3542175292969, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 2.921180843223507, | |
| "kl": 2.2265625, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.1487, | |
| "reward": 0.4394577872008085, | |
| "reward_std": 0.4748491495847702, | |
| "rewards/cosine_scaled_reward": -0.05110444873571396, | |
| "rewards/format_reward": 0.5416666977107525, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1026.3542175292969, | |
| "epoch": 0.264, | |
| "grad_norm": 2.544177744090501, | |
| "kl": 1.572265625, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.1564, | |
| "reward": 0.2407762985676527, | |
| "reward_std": 0.5902754589915276, | |
| "rewards/cosine_scaled_reward": -0.20252852141857147, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1030.8958587646484, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 3.5304119337525526, | |
| "kl": 1.529296875, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.1109, | |
| "reward": 0.42541009094566107, | |
| "reward_std": 0.6807678937911987, | |
| "rewards/cosine_scaled_reward": -0.11021162755787373, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1073.3333435058594, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 3.0267711493511382, | |
| "kl": 1.1796875, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.2702, | |
| "reward": 0.42545080557465553, | |
| "reward_std": 0.48426005244255066, | |
| "rewards/cosine_scaled_reward": -0.15185793861746788, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1166.9791870117188, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 2.956369605796136, | |
| "kl": 1.1279296875, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.1076, | |
| "reward": 0.3714570254087448, | |
| "reward_std": 0.650765061378479, | |
| "rewards/cosine_scaled_reward": -0.13718816195614636, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1054.6667175292969, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 4.47554265499188, | |
| "kl": 1.21484375, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.2849, | |
| "reward": 0.6623743935488164, | |
| "reward_std": 0.7155829221010208, | |
| "rewards/cosine_scaled_reward": -0.012562822550535202, | |
| "rewards/format_reward": 0.6875000223517418, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1096.1875457763672, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 4.925975683565178, | |
| "kl": 1.3408203125, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.2351, | |
| "reward": 0.26786297000944614, | |
| "reward_std": 0.5117842257022858, | |
| "rewards/cosine_scaled_reward": -0.2202351950109005, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 983.2291870117188, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 2.226077510557553, | |
| "kl": 0.77294921875, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.1527, | |
| "reward": 0.5171467587351799, | |
| "reward_std": 0.5790724456310272, | |
| "rewards/cosine_scaled_reward": -0.10600997135043144, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.9167175292969, | |
| "epoch": 0.272, | |
| "grad_norm": 2.746018994596942, | |
| "kl": 1.0703125, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.1685, | |
| "reward": 0.4093864783644676, | |
| "reward_std": 0.5853541940450668, | |
| "rewards/cosine_scaled_reward": -0.1911400929093361, | |
| "rewards/format_reward": 0.7916667014360428, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1138.8542022705078, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 2.366422791383297, | |
| "kl": 1.3916015625, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.1319, | |
| "reward": 0.03224743437021971, | |
| "reward_std": 0.40017952769994736, | |
| "rewards/cosine_scaled_reward": -0.2963762879371643, | |
| "rewards/format_reward": 0.6250000298023224, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 916.8125305175781, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 1.7577643969871468, | |
| "kl": 1.291015625, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.13, | |
| "reward": 0.8863477371633053, | |
| "reward_std": 0.6274040639400482, | |
| "rewards/cosine_scaled_reward": 0.10984052997082472, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 891.0417022705078, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 2.841473966918375, | |
| "kl": 1.0361328125, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.1554, | |
| "reward": 0.48904264718294144, | |
| "reward_std": 0.669127531349659, | |
| "rewards/cosine_scaled_reward": -0.16172868385910988, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 900.8541870117188, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 4.202915193648642, | |
| "kl": 0.96337890625, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.1132, | |
| "reward": 0.6491687893867493, | |
| "reward_std": 0.6397206410765648, | |
| "rewards/cosine_scaled_reward": -0.08166561461985111, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 871.3542022705078, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 4.013401867872089, | |
| "kl": 1.2275390625, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0343, | |
| "reward": 0.6437305957078934, | |
| "reward_std": 0.566775843501091, | |
| "rewards/cosine_scaled_reward": -0.06355137238278985, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1051.8541870117188, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 2.0640323982742346, | |
| "kl": 1.2119140625, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.1283, | |
| "reward": 0.6993502229452133, | |
| "reward_std": 0.8381707072257996, | |
| "rewards/cosine_scaled_reward": -0.04615823458880186, | |
| "rewards/format_reward": 0.7916667014360428, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 926.8958587646484, | |
| "epoch": 0.28, | |
| "grad_norm": 2.3095581027269456, | |
| "kl": 1.2373046875, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.1728, | |
| "reward": 0.5032865107059479, | |
| "reward_std": 0.4741464629769325, | |
| "rewards/cosine_scaled_reward": -0.15460674837231636, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 948.7292022705078, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 2.2705966167509697, | |
| "kl": 1.0166015625, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0879, | |
| "reward": 0.5439350083470345, | |
| "reward_std": 0.6458217911422253, | |
| "rewards/cosine_scaled_reward": -0.11344920098781586, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 766.6250152587891, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 4.218176679768865, | |
| "kl": 1.375, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.1529, | |
| "reward": 0.7583817802369595, | |
| "reward_std": 0.9407426938414574, | |
| "rewards/cosine_scaled_reward": 0.02502422034740448, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1149.2708435058594, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 2.966316254338991, | |
| "kl": 1.69921875, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.1307, | |
| "reward": 0.37028552405536175, | |
| "reward_std": 0.35450038872659206, | |
| "rewards/cosine_scaled_reward": -0.15860724076628685, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1236.3750305175781, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 2.8644099570080126, | |
| "kl": 1.646484375, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.142, | |
| "reward": 0.3449726775288582, | |
| "reward_std": 0.7856429815292358, | |
| "rewards/cosine_scaled_reward": -0.09834698960185051, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 977.7500305175781, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 1.9099821609277308, | |
| "kl": 0.921875, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0404, | |
| "reward": 0.6945669716224074, | |
| "reward_std": 0.822948083281517, | |
| "rewards/cosine_scaled_reward": -0.048549871891736984, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1265.9583740234375, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 2.751476452748249, | |
| "kl": 1.216796875, | |
| "learning_rate": 6.25045936e-07, | |
| "loss": 0.1111, | |
| "reward": 0.12667130306363106, | |
| "reward_std": 0.7467320710420609, | |
| "rewards/cosine_scaled_reward": -0.17624769732356071, | |
| "rewards/format_reward": 0.4791666865348816, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1031.0833740234375, | |
| "epoch": 0.288, | |
| "grad_norm": 3.701835452468544, | |
| "kl": 1.033203125, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.2332, | |
| "reward": 0.3126375643769279, | |
| "reward_std": 0.748970627784729, | |
| "rewards/cosine_scaled_reward": -0.09368122089654207, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 889.5833587646484, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 5.141640270028422, | |
| "kl": 1.69921875, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": -0.1188, | |
| "reward": 0.23392239259555936, | |
| "reward_std": 0.8090809062123299, | |
| "rewards/cosine_scaled_reward": -0.11220548488199711, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 912.5208587646484, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 3.5136083178201183, | |
| "kl": 1.1953125, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.1793, | |
| "reward": 0.7197382766753435, | |
| "reward_std": 0.9268201515078545, | |
| "rewards/cosine_scaled_reward": 0.057785794138908386, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1043.3333740234375, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 2.8576463073310023, | |
| "kl": 1.361328125, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.1618, | |
| "reward": 0.21097473427653313, | |
| "reward_std": 0.8950171619653702, | |
| "rewards/cosine_scaled_reward": -0.08201263658702374, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1132.9166870117188, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 2.6390372016890877, | |
| "kl": 0.9296875, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.1517, | |
| "reward": 0.3409617803990841, | |
| "reward_std": 0.7687749713659286, | |
| "rewards/cosine_scaled_reward": -0.142019122838974, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1172.1458740234375, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 1.7999790033387904, | |
| "kl": 0.8994140625, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0345, | |
| "reward": 0.24714069813489914, | |
| "reward_std": 0.526521310210228, | |
| "rewards/cosine_scaled_reward": -0.20976299978792667, | |
| "rewards/format_reward": 0.6666667014360428, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1025.2292022705078, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 3.7817000702854284, | |
| "kl": 0.9970703125, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0254, | |
| "reward": 0.371606208384037, | |
| "reward_std": 0.8782027065753937, | |
| "rewards/cosine_scaled_reward": -0.10586357489228249, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1027.1041870117188, | |
| "epoch": 0.296, | |
| "grad_norm": 2.2007546083055627, | |
| "kl": 1.23828125, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.1329, | |
| "reward": 0.2863161154091358, | |
| "reward_std": 0.6974881812930107, | |
| "rewards/cosine_scaled_reward": -0.16934195160865784, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1060.1666870117188, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 2.0712856185453226, | |
| "kl": 1.314453125, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": -0.0031, | |
| "reward": 0.05191618762910366, | |
| "reward_std": 0.5254812240600586, | |
| "rewards/cosine_scaled_reward": -0.1927919089794159, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 800.5625152587891, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 3.953323642394609, | |
| "kl": 1.18359375, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.1926, | |
| "reward": 0.16135332686826587, | |
| "reward_std": 0.6497361660003662, | |
| "rewards/cosine_scaled_reward": -0.21099001914262772, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 906.3542022705078, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 6.975231366994329, | |
| "kl": 1.1025390625, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.2163, | |
| "reward": 0.13131073210388422, | |
| "reward_std": 0.5159479975700378, | |
| "rewards/cosine_scaled_reward": -0.1739279804751277, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 981.3958587646484, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 3.6462739135853304, | |
| "kl": 0.93359375, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.2144, | |
| "reward": 0.2528093755245209, | |
| "reward_std": 0.6878427565097809, | |
| "rewards/cosine_scaled_reward": -0.19651199039071798, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1036.4792022705078, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 2.4186369761638797, | |
| "kl": 1.11328125, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0094, | |
| "reward": 0.34765794809209183, | |
| "reward_std": 0.7917995601892471, | |
| "rewards/cosine_scaled_reward": -0.10742103308439255, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 988.1458587646484, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 3.8358402184782845, | |
| "kl": 1.125, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.1412, | |
| "reward": 0.22985844686627388, | |
| "reward_std": 0.4855259954929352, | |
| "rewards/cosine_scaled_reward": -0.17673744820058346, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 853.1042022705078, | |
| "epoch": 0.304, | |
| "grad_norm": 3.155418565951925, | |
| "kl": 1.138671875, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": -0.0981, | |
| "reward": 0.23544084653258324, | |
| "reward_std": 0.5617225617170334, | |
| "rewards/cosine_scaled_reward": -0.18436292186379433, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1040.7291870117188, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 4.49377424287265, | |
| "kl": 1.8671875, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.3133, | |
| "reward": 0.019660448655486107, | |
| "reward_std": 0.5969599932432175, | |
| "rewards/cosine_scaled_reward": -0.14641978219151497, | |
| "rewards/format_reward": 0.3125000149011612, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1120.3334045410156, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 2.9296163486934588, | |
| "kl": 1.455078125, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0891, | |
| "reward": 0.019381534308195114, | |
| "reward_std": 0.6385679095983505, | |
| "rewards/cosine_scaled_reward": -0.188225906342268, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 940.0833587646484, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 3.99474649335861, | |
| "kl": 1.58203125, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.2037, | |
| "reward": 0.21233398653566837, | |
| "reward_std": 0.5940781682729721, | |
| "rewards/cosine_scaled_reward": -0.17508301883935928, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 820.8541870117188, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 3.64920081986899, | |
| "kl": 1.548828125, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.187, | |
| "reward": 0.287849310785532, | |
| "reward_std": 0.7942548245191574, | |
| "rewards/cosine_scaled_reward": -0.16857536626048386, | |
| "rewards/format_reward": 0.6250000298023224, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 918.2708435058594, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 4.142397150940974, | |
| "kl": 1.3642578125, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": -0.0034, | |
| "reward": 0.21712711825966835, | |
| "reward_std": 0.7582554370164871, | |
| "rewards/cosine_scaled_reward": -0.1726864455267787, | |
| "rewards/format_reward": 0.5625000223517418, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1067.3125305175781, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 5.568481701496752, | |
| "kl": 1.576171875, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.2629, | |
| "reward": 0.07018839695956558, | |
| "reward_std": 0.6307368651032448, | |
| "rewards/cosine_scaled_reward": -0.17323914170265198, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1032.7916870117188, | |
| "epoch": 0.312, | |
| "grad_norm": 2.7380334201594207, | |
| "kl": 1.763671875, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.1532, | |
| "reward": 0.1198783004656434, | |
| "reward_std": 0.5959479659795761, | |
| "rewards/cosine_scaled_reward": -0.15881085954606533, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1005.0000305175781, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 3.288058849096818, | |
| "kl": 1.3232421875, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0679, | |
| "reward": 0.33828355744481087, | |
| "reward_std": 0.7625949904322624, | |
| "rewards/cosine_scaled_reward": -0.1329415813088417, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1209.2083892822266, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 3.384369498507843, | |
| "kl": 1.3759765625, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.1487, | |
| "reward": 0.2773652821779251, | |
| "reward_std": 0.7781829237937927, | |
| "rewards/cosine_scaled_reward": -0.09048402030020952, | |
| "rewards/format_reward": 0.45833334885537624, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 871.2500152587891, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 3.6001944034052666, | |
| "kl": 1.2470703125, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.2545, | |
| "reward": 0.4259207919239998, | |
| "reward_std": 0.7986200153827667, | |
| "rewards/cosine_scaled_reward": -0.1099562719464302, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 970.1458740234375, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 5.098242367200561, | |
| "kl": 1.9375, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0347, | |
| "reward": 0.1577397957444191, | |
| "reward_std": 0.8665766268968582, | |
| "rewards/cosine_scaled_reward": -0.16071344492956996, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1138.0417175292969, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 4.893358334263393, | |
| "kl": 1.51953125, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.2335, | |
| "reward": 0.2129652127623558, | |
| "reward_std": 0.8123987764120102, | |
| "rewards/cosine_scaled_reward": -0.1122674010694027, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1055.5416870117188, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 11.325087114885777, | |
| "kl": 1.70703125, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.1758, | |
| "reward": 0.2276703668758273, | |
| "reward_std": 0.7087787315249443, | |
| "rewards/cosine_scaled_reward": -0.14658149890601635, | |
| "rewards/format_reward": 0.520833358168602, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1201.5625305175781, | |
| "epoch": 0.32, | |
| "grad_norm": 4.499791162135755, | |
| "kl": 1.3359375, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.199, | |
| "reward": 0.4334499780088663, | |
| "reward_std": 0.8222155347466469, | |
| "rewards/cosine_scaled_reward": -0.0853583601419814, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1183.5833740234375, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 3.6400895329844336, | |
| "kl": 1.931640625, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.0286, | |
| "reward": -0.14555206894874573, | |
| "reward_std": 0.4930955022573471, | |
| "rewards/cosine_scaled_reward": -0.2081927042454481, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1227.6875305175781, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 3.351330372342759, | |
| "kl": 1.51953125, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.1735, | |
| "reward": 0.08991836942732334, | |
| "reward_std": 0.7664570957422256, | |
| "rewards/cosine_scaled_reward": -0.1946241520345211, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1011.8958435058594, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 3.607306150140324, | |
| "kl": 1.52734375, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.1756, | |
| "reward": -0.16347728297114372, | |
| "reward_std": 0.6131603866815567, | |
| "rewards/cosine_scaled_reward": -0.269238643348217, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1238.1875305175781, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 3.700854838943554, | |
| "kl": 1.3291015625, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0644, | |
| "reward": 0.19410160928964615, | |
| "reward_std": 0.6351519152522087, | |
| "rewards/cosine_scaled_reward": -0.16336587071418762, | |
| "rewards/format_reward": 0.520833358168602, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1302.5833435058594, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 5.590443825333452, | |
| "kl": 1.396484375, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.1172, | |
| "reward": 0.0053066437467350625, | |
| "reward_std": 0.6190855652093887, | |
| "rewards/cosine_scaled_reward": -0.1952633447945118, | |
| "rewards/format_reward": 0.3958333507180214, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1408.7708587646484, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 5820.413747461295, | |
| "kl": 44.6220703125, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 2.1366, | |
| "reward": 0.321873364970088, | |
| "reward_std": 0.7274122461676598, | |
| "rewards/cosine_scaled_reward": -0.06822998262941837, | |
| "rewards/format_reward": 0.45833334885537624, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1293.6875305175781, | |
| "epoch": 0.328, | |
| "grad_norm": 10688.293773017389, | |
| "kl": 90.048828125, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 3.6012, | |
| "reward": 0.22728685289621353, | |
| "reward_std": 0.6926668882369995, | |
| "rewards/cosine_scaled_reward": -0.10510657541453838, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1143.1042175292969, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 69995.08344409091, | |
| "kl": 821.830078125, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 50.9221, | |
| "reward": 0.3029659762978554, | |
| "reward_std": 0.8068300932645798, | |
| "rewards/cosine_scaled_reward": -0.04643368790857494, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1325.7084045410156, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 62.300695111663714, | |
| "kl": 1.5146484375, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.1171, | |
| "reward": 0.10640177875757217, | |
| "reward_std": 0.6392035633325577, | |
| "rewards/cosine_scaled_reward": -0.08221577852964401, | |
| "rewards/format_reward": 0.2708333358168602, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1066.375015258789, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 3.0451709688438138, | |
| "kl": 0.85791015625, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0875, | |
| "reward": 0.4837397076189518, | |
| "reward_std": 0.6303973346948624, | |
| "rewards/cosine_scaled_reward": -0.008130142465233803, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1176.2292175292969, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 6.431194933370891, | |
| "kl": 1.169921875, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0944, | |
| "reward": 0.004224353935569525, | |
| "reward_std": 0.7458223477005959, | |
| "rewards/cosine_scaled_reward": -0.17497116327285767, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1279.1875610351562, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 11.784461019524304, | |
| "kl": 1.0419921875, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0752, | |
| "reward": -0.019843921065330505, | |
| "reward_std": 0.5733096897602081, | |
| "rewards/cosine_scaled_reward": -0.21825530380010605, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1270.3750305175781, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 12451.222306718704, | |
| "kl": 56.82421875, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 2.6089, | |
| "reward": -0.0518635269254446, | |
| "reward_std": 0.4941852539777756, | |
| "rewards/cosine_scaled_reward": -0.22384843230247498, | |
| "rewards/format_reward": 0.3958333507180214, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1304.8750457763672, | |
| "epoch": 0.336, | |
| "grad_norm": 354145.9079404987, | |
| "kl": 3584.8046875, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 283.5748, | |
| "reward": 0.06046904996037483, | |
| "reward_std": 0.7505204379558563, | |
| "rewards/cosine_scaled_reward": -0.13643214339390397, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1317.5625610351562, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 5.242464203702877, | |
| "kl": 1.0029296875, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.1195, | |
| "reward": 0.005757967010140419, | |
| "reward_std": 0.6009484976530075, | |
| "rewards/cosine_scaled_reward": -0.12212102208286524, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1103.0625610351562, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 4.2430557491796055, | |
| "kl": 0.8115234375, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0632, | |
| "reward": 0.0580328986980021, | |
| "reward_std": 0.6936925277113914, | |
| "rewards/cosine_scaled_reward": -0.15848355647176504, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1432.3333435058594, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 2.9908283966206457, | |
| "kl": 0.7646484375, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.1022, | |
| "reward": -0.011708778678439558, | |
| "reward_std": 0.5683621913194656, | |
| "rewards/cosine_scaled_reward": -0.12043773010373116, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1451.8750305175781, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 4.214445887739457, | |
| "kl": 0.673828125, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": -0.0113, | |
| "reward": -0.12220606487244368, | |
| "reward_std": 0.5942584052681923, | |
| "rewards/cosine_scaled_reward": -0.18610304035246372, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1379.4791870117188, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 4.524572878515851, | |
| "kl": 0.5302734375, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": -0.0319, | |
| "reward": -0.08997016213834286, | |
| "reward_std": 0.6837709844112396, | |
| "rewards/cosine_scaled_reward": -0.1804017536342144, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1242.2083740234375, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 22.44129435449986, | |
| "kl": 0.6015625, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.047, | |
| "reward": 0.4733648784458637, | |
| "reward_std": 0.6498839557170868, | |
| "rewards/cosine_scaled_reward": -0.013317572651430964, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1358.5000610351562, | |
| "epoch": 0.344, | |
| "grad_norm": 5.451894779313779, | |
| "kl": 0.55419921875, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0201, | |
| "reward": 0.012628388591110706, | |
| "reward_std": 0.6598528623580933, | |
| "rewards/cosine_scaled_reward": -0.11868580989539623, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1208.0000610351562, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 2.502133066720727, | |
| "kl": 0.5078125, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0976, | |
| "reward": 0.01287244912236929, | |
| "reward_std": 0.6720428466796875, | |
| "rewards/cosine_scaled_reward": -0.14981378242373466, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1372.2917175292969, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 9.527527408809727, | |
| "kl": 0.591796875, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": -0.0351, | |
| "reward": -0.0026968184392899275, | |
| "reward_std": 0.7502148300409317, | |
| "rewards/cosine_scaled_reward": -0.1784317558631301, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1228.1458435058594, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 5.5176774561091655, | |
| "kl": 0.4345703125, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.1598, | |
| "reward": 0.39222877379506826, | |
| "reward_std": 0.840458020567894, | |
| "rewards/cosine_scaled_reward": -0.03305228240787983, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1448.8125305175781, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 7.801875434214254, | |
| "kl": 0.3525390625, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0808, | |
| "reward": 0.005279352888464928, | |
| "reward_std": 0.6858643740415573, | |
| "rewards/cosine_scaled_reward": -0.1536103216931224, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1402.6666870117188, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 3.566822202421308, | |
| "kl": 0.29638671875, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0791, | |
| "reward": 0.18335522711277008, | |
| "reward_std": 0.6350644528865814, | |
| "rewards/cosine_scaled_reward": -0.13748905574902892, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1421.8541870117188, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 1.9532542741070622, | |
| "kl": 0.289794921875, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.0514, | |
| "reward": 0.2609965428709984, | |
| "reward_std": 0.7012953609228134, | |
| "rewards/cosine_scaled_reward": -0.06741839554160833, | |
| "rewards/format_reward": 0.39583334885537624, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1331.2500305175781, | |
| "epoch": 0.352, | |
| "grad_norm": 2.135773174322825, | |
| "kl": 0.26416015625, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.1508, | |
| "reward": 0.21997906267642975, | |
| "reward_std": 0.6842755973339081, | |
| "rewards/cosine_scaled_reward": -0.1191771375015378, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1291.1666870117188, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 3.030174625800062, | |
| "kl": 0.323486328125, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.1012, | |
| "reward": -0.060309079475700855, | |
| "reward_std": 0.48270438611507416, | |
| "rewards/cosine_scaled_reward": -0.16557121649384499, | |
| "rewards/format_reward": 0.2708333469927311, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1296.8125457763672, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 3.288974321286699, | |
| "kl": 0.30712890625, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.1053, | |
| "reward": 0.3812438789755106, | |
| "reward_std": 0.6454566046595573, | |
| "rewards/cosine_scaled_reward": 0.0031219255179166794, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1582.4167175292969, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 11.037201589242047, | |
| "kl": 0.3916015625, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0554, | |
| "reward": 0.011564895510673523, | |
| "reward_std": 0.5866778641939163, | |
| "rewards/cosine_scaled_reward": -0.12963422574102879, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1511.1458740234375, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 541.360267852673, | |
| "kl": 2.48046875, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.1743, | |
| "reward": 0.09507806971669197, | |
| "reward_std": 0.7126565277576447, | |
| "rewards/cosine_scaled_reward": -0.12954430282115936, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1310.4792022705078, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 1.6060292822301743, | |
| "kl": 0.2235107421875, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.0119, | |
| "reward": 0.19681214727461338, | |
| "reward_std": 0.5347588732838631, | |
| "rewards/cosine_scaled_reward": -0.14117726124823093, | |
| "rewards/format_reward": 0.4791666902601719, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1412.7708740234375, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 0.9234012789427545, | |
| "kl": 0.2705078125, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0522, | |
| "reward": 0.1253851738292724, | |
| "reward_std": 0.5503663271665573, | |
| "rewards/cosine_scaled_reward": -0.1352240853011608, | |
| "rewards/format_reward": 0.3958333469927311, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1183.7916870117188, | |
| "epoch": 0.36, | |
| "grad_norm": 1.7837131712349448, | |
| "kl": 0.248779296875, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.1102, | |
| "reward": 0.06632774323225021, | |
| "reward_std": 0.8003478944301605, | |
| "rewards/cosine_scaled_reward": -0.14391947723925114, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1321.1667022705078, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 3.8904561936208473, | |
| "kl": 0.311279296875, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0684, | |
| "reward": -0.12211128510534763, | |
| "reward_std": 0.3644377589225769, | |
| "rewards/cosine_scaled_reward": -0.19647231698036194, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1472.5208435058594, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 0.6761305622628668, | |
| "kl": 0.2392578125, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0051, | |
| "reward": 0.07694595551583916, | |
| "reward_std": 0.698570191860199, | |
| "rewards/cosine_scaled_reward": -0.08652702532708645, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1446.3959045410156, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 1.610083766620256, | |
| "kl": 0.23046875, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.1272, | |
| "reward": 0.22593690548092127, | |
| "reward_std": 0.7007799595594406, | |
| "rewards/cosine_scaled_reward": -0.11619820445775986, | |
| "rewards/format_reward": 0.45833336375653744, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1419.7500610351562, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 1.3177147357732026, | |
| "kl": 0.3505859375, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0908, | |
| "reward": 0.05421498417854309, | |
| "reward_std": 0.6087209582328796, | |
| "rewards/cosine_scaled_reward": -0.10830917488783598, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1459.2917175292969, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 2.383045046585821, | |
| "kl": 0.177001953125, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.143, | |
| "reward": 0.23994141444563866, | |
| "reward_std": 0.7169264256954193, | |
| "rewards/cosine_scaled_reward": -0.08836262859404087, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1525.8542175292969, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 1.4014039132566267, | |
| "kl": 0.327880859375, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0636, | |
| "reward": 0.07618786534294486, | |
| "reward_std": 0.6110149621963501, | |
| "rewards/cosine_scaled_reward": -0.17023939825594425, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1505.1042022705078, | |
| "epoch": 0.368, | |
| "grad_norm": 0.9016635108285753, | |
| "kl": 0.18017578125, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0766, | |
| "reward": 0.1637781597673893, | |
| "reward_std": 0.6868859454989433, | |
| "rewards/cosine_scaled_reward": -0.13686091732233763, | |
| "rewards/format_reward": 0.4375000186264515, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1378.7500305175781, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 1.1982814454055981, | |
| "kl": 0.39306640625, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0913, | |
| "reward": 0.17529202857986093, | |
| "reward_std": 0.6956184059381485, | |
| "rewards/cosine_scaled_reward": -0.14152065757662058, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1414.5208740234375, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 5.168812943695912, | |
| "kl": 0.2706298828125, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0758, | |
| "reward": -0.05163134215399623, | |
| "reward_std": 0.573038712143898, | |
| "rewards/cosine_scaled_reward": -0.2133156731724739, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1026.2292175292969, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 2.717389747197644, | |
| "kl": 0.2042236328125, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0446, | |
| "reward": 0.35916636511683464, | |
| "reward_std": 0.7165441811084747, | |
| "rewards/cosine_scaled_reward": -0.11208349000662565, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1296.2083435058594, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 0.9706132798560072, | |
| "kl": 0.1436767578125, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0264, | |
| "reward": 0.03931037150323391, | |
| "reward_std": 0.5944674462080002, | |
| "rewards/cosine_scaled_reward": -0.24076148495078087, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1119.3958740234375, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 7.20904098295775, | |
| "kl": 0.34619140625, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0023, | |
| "reward": 0.5026027010753751, | |
| "reward_std": 0.4505321756005287, | |
| "rewards/cosine_scaled_reward": 0.011718038469552994, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1340.9791717529297, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 1.2860908020915138, | |
| "kl": 0.416259765625, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.1449, | |
| "reward": 0.15127216652035713, | |
| "reward_std": 0.6304197087883949, | |
| "rewards/cosine_scaled_reward": -0.15353058651089668, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1571.3750610351562, | |
| "epoch": 0.376, | |
| "grad_norm": 0.8293118478307562, | |
| "kl": 0.242431640625, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0229, | |
| "reward": 0.09288652800023556, | |
| "reward_std": 0.5842361897230148, | |
| "rewards/cosine_scaled_reward": -0.15147340297698975, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1407.9583740234375, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 1.189781094856149, | |
| "kl": 0.1077880859375, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0681, | |
| "reward": -0.09090141206979752, | |
| "reward_std": 0.5390855148434639, | |
| "rewards/cosine_scaled_reward": -0.21211737021803856, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1255.6875457763672, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 1.046472107288498, | |
| "kl": 0.10308837890625, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0811, | |
| "reward": -0.12841611605836079, | |
| "reward_std": 0.39798876643180847, | |
| "rewards/cosine_scaled_reward": -0.3350413963198662, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1201.8958740234375, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 1.123018980255247, | |
| "kl": 0.117584228515625, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.072, | |
| "reward": 0.499036006629467, | |
| "reward_std": 0.6711834743618965, | |
| "rewards/cosine_scaled_reward": -0.03173201950266957, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1133.3333587646484, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 2.177638459571002, | |
| "kl": 0.14453125, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.1488, | |
| "reward": 0.03351620538160205, | |
| "reward_std": 0.4431127682328224, | |
| "rewards/cosine_scaled_reward": -0.27490856871008873, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1252.5833587646484, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 1.6680786188797292, | |
| "kl": 2.4984130859375, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.1246, | |
| "reward": -0.1514057070016861, | |
| "reward_std": 0.5695896856486797, | |
| "rewards/cosine_scaled_reward": -0.26320285350084305, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1232.0000610351562, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 1.828125714274309, | |
| "kl": 0.07843017578125, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.1105, | |
| "reward": 0.07522661844268441, | |
| "reward_std": 0.5525132827460766, | |
| "rewards/cosine_scaled_reward": -0.21238669380545616, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1240.4375305175781, | |
| "epoch": 0.384, | |
| "grad_norm": 3.2255921262965432, | |
| "kl": 0.19232177734375, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.2104, | |
| "reward": -0.07903135940432549, | |
| "reward_std": 0.4235813617706299, | |
| "rewards/cosine_scaled_reward": -0.3103490248322487, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1136.2500457763672, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 2.1359050155328076, | |
| "kl": 0.298095703125, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.1271, | |
| "reward": 0.29203586652874947, | |
| "reward_std": 0.6221929639577866, | |
| "rewards/cosine_scaled_reward": -0.14564874302595854, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1304.4792175292969, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 1.42801024449987, | |
| "kl": 0.2041015625, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0306, | |
| "reward": -0.07640792615711689, | |
| "reward_std": 0.29374565184116364, | |
| "rewards/cosine_scaled_reward": -0.30903729796409607, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1483.5625305175781, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 4.530770296915891, | |
| "kl": 0.2216796875, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0756, | |
| "reward": -0.22593690641224384, | |
| "reward_std": 0.42642898857593536, | |
| "rewards/cosine_scaled_reward": -0.31088512018322945, | |
| "rewards/format_reward": 0.39583334513008595, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1301.6875305175781, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 32.229056752997074, | |
| "kl": 0.72998046875, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0451, | |
| "reward": 0.1187155619263649, | |
| "reward_std": 0.6100866496562958, | |
| "rewards/cosine_scaled_reward": -0.16980887576937675, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1222.5000610351562, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 31.15024931066955, | |
| "kl": 1.814453125, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0053, | |
| "reward": 0.4647822715342045, | |
| "reward_std": 0.8535723686218262, | |
| "rewards/cosine_scaled_reward": 0.013641122728586197, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1550.9583740234375, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 5.073035047796139, | |
| "kl": 0.40185546875, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.1274, | |
| "reward": -0.049222253262996674, | |
| "reward_std": 0.6296448782086372, | |
| "rewards/cosine_scaled_reward": -0.1704444605857134, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1254.1458740234375, | |
| "epoch": 0.392, | |
| "grad_norm": 2.9987047793682247, | |
| "kl": 0.191650390625, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.132, | |
| "reward": 0.4507103096693754, | |
| "reward_std": 0.46682045608758926, | |
| "rewards/cosine_scaled_reward": -0.11839485540986061, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1259.0833740234375, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 11.834773130920754, | |
| "kl": 0.765625, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.1692, | |
| "reward": 0.04102582670748234, | |
| "reward_std": 0.6375212371349335, | |
| "rewards/cosine_scaled_reward": -0.16698708944022655, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 948.8333587646484, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 4.082579051373274, | |
| "kl": 0.11126708984375, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.1531, | |
| "reward": 0.32552773877978325, | |
| "reward_std": 0.5937002822756767, | |
| "rewards/cosine_scaled_reward": -0.18098615854978561, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1348.5208740234375, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 4.16581520032074, | |
| "kl": 0.233154296875, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0891, | |
| "reward": -0.024696938693523407, | |
| "reward_std": 0.6840994879603386, | |
| "rewards/cosine_scaled_reward": -0.2310984805226326, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1152.9583435058594, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 6.491892036842968, | |
| "kl": 0.688232421875, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.1813, | |
| "reward": 0.7761995047330856, | |
| "reward_std": 0.9014021009206772, | |
| "rewards/cosine_scaled_reward": 0.08601640490815043, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1119.6875305175781, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 6.465035426418669, | |
| "kl": 0.2274169921875, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.3084, | |
| "reward": 0.1041297996416688, | |
| "reward_std": 0.5661944150924683, | |
| "rewards/cosine_scaled_reward": -0.2187684327363968, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1280.5000305175781, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 5.965340713566614, | |
| "kl": 0.0919189453125, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.264, | |
| "reward": 0.5343287643045187, | |
| "reward_std": 1.0619665831327438, | |
| "rewards/cosine_scaled_reward": -0.024502300075255334, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1420.2291870117188, | |
| "epoch": 0.4, | |
| "grad_norm": 2.925238124886515, | |
| "kl": 0.185791015625, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.1961, | |
| "reward": 0.12700789980590343, | |
| "reward_std": 0.8331074118614197, | |
| "rewards/cosine_scaled_reward": -0.1656627282500267, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1083.6042175292969, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 3.606767246674259, | |
| "kl": 0.18115234375, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": -0.0154, | |
| "reward": 0.25727599672973156, | |
| "reward_std": 0.6387183666229248, | |
| "rewards/cosine_scaled_reward": -0.18386201839894056, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1439.1250610351562, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 1.929818758425276, | |
| "kl": 0.1397705078125, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.1728, | |
| "reward": -0.14825151395052671, | |
| "reward_std": 0.5558790042996407, | |
| "rewards/cosine_scaled_reward": -0.2824591100215912, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1523.3542175292969, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 1.501137402879622, | |
| "kl": 0.15606689453125, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.1054, | |
| "reward": -0.2005203291773796, | |
| "reward_std": 0.5384240373969078, | |
| "rewards/cosine_scaled_reward": -0.2565101645886898, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1457.8125305175781, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 1.7447813143906967, | |
| "kl": 0.19287109375, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0946, | |
| "reward": -0.07205517496913671, | |
| "reward_std": 0.5912996232509613, | |
| "rewards/cosine_scaled_reward": -0.27561092376708984, | |
| "rewards/format_reward": 0.4791666865348816, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 935.1041793823242, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 5.735907828017728, | |
| "kl": 0.416259765625, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.2126, | |
| "reward": 0.6018264503218234, | |
| "reward_std": 0.43670547753572464, | |
| "rewards/cosine_scaled_reward": -0.04283679276704788, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1400.4375610351562, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 4.2513620245343855, | |
| "kl": 0.2110595703125, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0986, | |
| "reward": 0.07107849605381489, | |
| "reward_std": 0.6532387360930443, | |
| "rewards/cosine_scaled_reward": -0.22487742826342583, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1214.1250305175781, | |
| "epoch": 0.408, | |
| "grad_norm": 1.8135177203210504, | |
| "kl": 0.1314697265625, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.1426, | |
| "reward": 0.03724817745387554, | |
| "reward_std": 0.5181447230279446, | |
| "rewards/cosine_scaled_reward": -0.2730425810441375, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1284.7500305175781, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 3.565695417018542, | |
| "kl": 0.18597412109375, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.1245, | |
| "reward": 0.04130622744560242, | |
| "reward_std": 0.7205251231789589, | |
| "rewards/cosine_scaled_reward": -0.19809689931571484, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1343.2083740234375, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 3.2057830256260917, | |
| "kl": 0.14324951171875, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": -0.0078, | |
| "reward": 0.1697351299226284, | |
| "reward_std": 0.3564612567424774, | |
| "rewards/cosine_scaled_reward": -0.13388244062662125, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1433.2291870117188, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 1.6762255456245136, | |
| "kl": 0.1739501953125, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.1781, | |
| "reward": 0.21988008171319962, | |
| "reward_std": 0.7903619408607483, | |
| "rewards/cosine_scaled_reward": -0.10880996193736792, | |
| "rewards/format_reward": 0.4375000223517418, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1309.6250457763672, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 0.9826821036841882, | |
| "kl": 0.135498046875, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0186, | |
| "reward": 0.33486853912472725, | |
| "reward_std": 0.500580433756113, | |
| "rewards/cosine_scaled_reward": -0.11381572997197509, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1173.5625305175781, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 4.226570835684713, | |
| "kl": 0.0926513671875, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.18, | |
| "reward": 0.15393588319420815, | |
| "reward_std": 0.5774414390325546, | |
| "rewards/cosine_scaled_reward": -0.20428206771612167, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1250.0416870117188, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 1.978862188088671, | |
| "kl": 0.09033203125, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.1327, | |
| "reward": 0.2741839215159416, | |
| "reward_std": 0.6551093906164169, | |
| "rewards/cosine_scaled_reward": -0.17540805786848068, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1354.9583892822266, | |
| "epoch": 0.416, | |
| "grad_norm": 1.500971160749094, | |
| "kl": 0.1431884765625, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.1556, | |
| "reward": 0.09914333745837212, | |
| "reward_std": 0.5969183072447777, | |
| "rewards/cosine_scaled_reward": -0.17959501221776009, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1443.8958435058594, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 1.5336203716893533, | |
| "kl": 0.14166259765625, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0794, | |
| "reward": 0.08230920624919236, | |
| "reward_std": 0.7491874545812607, | |
| "rewards/cosine_scaled_reward": -0.18801206350326538, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1280.0833740234375, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 5.917103922817008, | |
| "kl": 0.1395263671875, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.2201, | |
| "reward": 0.30844624526798725, | |
| "reward_std": 0.6032929718494415, | |
| "rewards/cosine_scaled_reward": -0.11661022901535034, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1071.7083587646484, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 6.764653159306351, | |
| "kl": 1.21533203125, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.2217, | |
| "reward": 0.5629880558699369, | |
| "reward_std": 0.7271402254700661, | |
| "rewards/cosine_scaled_reward": -0.02058931067585945, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1272.2708892822266, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 3.868751461553908, | |
| "kl": 0.376953125, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0282, | |
| "reward": 0.2414314430207014, | |
| "reward_std": 0.783539354801178, | |
| "rewards/cosine_scaled_reward": -0.13970092684030533, | |
| "rewards/format_reward": 0.5208333544433117, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1453.8333740234375, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 1.6191974125060598, | |
| "kl": 0.29150390625, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.172, | |
| "reward": 0.11266430467367172, | |
| "reward_std": 0.7149153798818588, | |
| "rewards/cosine_scaled_reward": -0.1415845244191587, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1463.3959045410156, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 4.101308083609096, | |
| "kl": 0.56884765625, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.2149, | |
| "reward": -0.2552230432629585, | |
| "reward_std": 0.5415500551462173, | |
| "rewards/cosine_scaled_reward": -0.26302820444107056, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1240.8542175292969, | |
| "epoch": 0.424, | |
| "grad_norm": 3.8927886605185447, | |
| "kl": 0.30340576171875, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.1602, | |
| "reward": 0.1614240426570177, | |
| "reward_std": 0.5875495374202728, | |
| "rewards/cosine_scaled_reward": -0.15887131541967392, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1177.3958740234375, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 3.066569475752354, | |
| "kl": 0.1824951171875, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.1059, | |
| "reward": 0.2956250160932541, | |
| "reward_std": 0.6594211757183075, | |
| "rewards/cosine_scaled_reward": -0.12302083522081375, | |
| "rewards/format_reward": 0.5416666939854622, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1163.4375610351562, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 5.09566585578463, | |
| "kl": 0.2724609375, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.1544, | |
| "reward": 0.07318597589619458, | |
| "reward_std": 0.7096846550703049, | |
| "rewards/cosine_scaled_reward": -0.2759070098400116, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1246.7500457763672, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 32.203352857308325, | |
| "kl": 0.839111328125, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.1927, | |
| "reward": 0.08969515189528465, | |
| "reward_std": 0.6610818058252335, | |
| "rewards/cosine_scaled_reward": -0.22598576080054045, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1280.7708740234375, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 63.335567619096544, | |
| "kl": 0.94873046875, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.2184, | |
| "reward": 0.18546735402196646, | |
| "reward_std": 0.9102050960063934, | |
| "rewards/cosine_scaled_reward": -0.17809965554624796, | |
| "rewards/format_reward": 0.5416667014360428, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1256.6250305175781, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 3.6519960558396716, | |
| "kl": 0.310302734375, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.1114, | |
| "reward": 0.1940733604133129, | |
| "reward_std": 0.5819907337427139, | |
| "rewards/cosine_scaled_reward": -0.1946299858391285, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1285.6458740234375, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 5.319529708040252, | |
| "kl": 0.3394775390625, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0359, | |
| "reward": 0.25018906872719526, | |
| "reward_std": 0.8042758777737617, | |
| "rewards/cosine_scaled_reward": -0.13532213680446148, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1374.5416870117188, | |
| "epoch": 0.432, | |
| "grad_norm": 20.461016176615068, | |
| "kl": 0.70166015625, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.2346, | |
| "reward": -0.005498896003700793, | |
| "reward_std": 0.5357099026441574, | |
| "rewards/cosine_scaled_reward": -0.22149945423007011, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1435.1875, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 2.391831824846237, | |
| "kl": 0.292236328125, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.1982, | |
| "reward": 0.012932289391756058, | |
| "reward_std": 0.799980454146862, | |
| "rewards/cosine_scaled_reward": -0.20186719112098217, | |
| "rewards/format_reward": 0.4166666865348816, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1369.5833740234375, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 2.2747857355280208, | |
| "kl": 0.1715087890625, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.1725, | |
| "reward": -0.22791396314278245, | |
| "reward_std": 0.4170580878853798, | |
| "rewards/cosine_scaled_reward": -0.3431236445903778, | |
| "rewards/format_reward": 0.4583333544433117, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1187.6458587646484, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 2.7553958817382593, | |
| "kl": 0.16162109375, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.1247, | |
| "reward": 0.19117721682414412, | |
| "reward_std": 0.46048377081751823, | |
| "rewards/cosine_scaled_reward": -0.23774472624063492, | |
| "rewards/format_reward": 0.6666667014360428, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1300.3125305175781, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 2.0362039263750082, | |
| "kl": 0.1822509765625, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.1158, | |
| "reward": 0.2739548869431019, | |
| "reward_std": 0.603746622800827, | |
| "rewards/cosine_scaled_reward": -0.09218922536820173, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1263.7917175292969, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 7.617696331239462, | |
| "kl": 0.2333984375, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.1076, | |
| "reward": 0.12070683389902115, | |
| "reward_std": 0.38592402543872595, | |
| "rewards/cosine_scaled_reward": -0.18964658118784428, | |
| "rewards/format_reward": 0.5, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1170.3750457763672, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 3.2601623912372233, | |
| "kl": 0.2103271484375, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.1471, | |
| "reward": -0.021999074146151543, | |
| "reward_std": 0.34355130419135094, | |
| "rewards/cosine_scaled_reward": -0.31308288127183914, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1233.7083740234375, | |
| "epoch": 0.44, | |
| "grad_norm": 2.1916650637468025, | |
| "kl": 0.16259765625, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0922, | |
| "reward": 0.054161038249731064, | |
| "reward_std": 0.7442760765552521, | |
| "rewards/cosine_scaled_reward": -0.2541694864630699, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 870.3333740234375, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 1.4860604247340325, | |
| "kl": 0.0894775390625, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.1158, | |
| "reward": 0.28954136464744806, | |
| "reward_std": 0.5479708462953568, | |
| "rewards/cosine_scaled_reward": -0.240646006539464, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1227.2291870117188, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 1.687755076974517, | |
| "kl": 0.2470703125, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.151, | |
| "reward": -0.0012904666364192963, | |
| "reward_std": 0.4440325200557709, | |
| "rewards/cosine_scaled_reward": -0.2714785784482956, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1033.6042022705078, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 2.5341444420596884, | |
| "kl": 0.164947509765625, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0897, | |
| "reward": 0.4180222749710083, | |
| "reward_std": 0.754804901778698, | |
| "rewards/cosine_scaled_reward": -0.14515553694218397, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1356.8333740234375, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 3.704344948231344, | |
| "kl": 0.372314453125, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.102, | |
| "reward": 0.2806839719414711, | |
| "reward_std": 0.6125510483980179, | |
| "rewards/cosine_scaled_reward": -0.07840801030397415, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1090.2708740234375, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 14.470921296685399, | |
| "kl": 0.47216796875, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.2654, | |
| "reward": 0.07703178748488426, | |
| "reward_std": 0.5665107443928719, | |
| "rewards/cosine_scaled_reward": -0.26356743834912777, | |
| "rewards/format_reward": 0.6041666939854622, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1098.7500305175781, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 2.4001916122615157, | |
| "kl": 0.26904296875, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.1304, | |
| "reward": 0.2017030455172062, | |
| "reward_std": 0.5325312875211239, | |
| "rewards/cosine_scaled_reward": -0.20123182306997478, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1404.0625610351562, | |
| "epoch": 0.448, | |
| "grad_norm": 12.93850484473414, | |
| "kl": 0.662109375, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0663, | |
| "reward": 0.39279897045344114, | |
| "reward_std": 0.9181084930896759, | |
| "rewards/cosine_scaled_reward": -0.04318385384976864, | |
| "rewards/format_reward": 0.4791666939854622, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1046.0417175292969, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 3.1910943036863695, | |
| "kl": 0.2236328125, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.094, | |
| "reward": 0.1259294361807406, | |
| "reward_std": 0.620373547077179, | |
| "rewards/cosine_scaled_reward": -0.23911861330270767, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 898.3333511352539, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 4.93057169428389, | |
| "kl": 0.25714111328125, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.2665, | |
| "reward": 0.2223543766885996, | |
| "reward_std": 0.4368506968021393, | |
| "rewards/cosine_scaled_reward": -0.23257281631231308, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1140.0000457763672, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 2.3738396662205945, | |
| "kl": 0.35986328125, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.3105, | |
| "reward": 0.10918148793280125, | |
| "reward_std": 0.5202281884849072, | |
| "rewards/cosine_scaled_reward": -0.21624258160591125, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1151.3958740234375, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 2.5367764499763257, | |
| "kl": 0.3154296875, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.1917, | |
| "reward": 0.17909681051969528, | |
| "reward_std": 0.7349686250090599, | |
| "rewards/cosine_scaled_reward": -0.2021182719618082, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1223.8958740234375, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 3.0861426217577645, | |
| "kl": 0.38720703125, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.2308, | |
| "reward": 0.6319128852337599, | |
| "reward_std": 0.8242618143558502, | |
| "rewards/cosine_scaled_reward": 0.04512310400605202, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1218.8541870117188, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 18.365837770437405, | |
| "kl": 0.6829833984375, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.189, | |
| "reward": 0.27588833356276155, | |
| "reward_std": 0.8127910792827606, | |
| "rewards/cosine_scaled_reward": -0.19538918882608414, | |
| "rewards/format_reward": 0.6666666939854622, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1264.1666870117188, | |
| "epoch": 0.456, | |
| "grad_norm": 3.8049582826738373, | |
| "kl": 0.47314453125, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.1823, | |
| "reward": 0.055698491632938385, | |
| "reward_std": 0.49411067366600037, | |
| "rewards/cosine_scaled_reward": -0.21173409838229418, | |
| "rewards/format_reward": 0.4791666828095913, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1013.3750457763672, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 7.251771375036044, | |
| "kl": 0.4078369140625, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.175, | |
| "reward": 0.2562308683991432, | |
| "reward_std": 0.2563706263899803, | |
| "rewards/cosine_scaled_reward": -0.2260512337088585, | |
| "rewards/format_reward": 0.7083333488553762, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 952.7291870117188, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 8.82258461767532, | |
| "kl": 0.45355224609375, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.1529, | |
| "reward": 0.4902263447875157, | |
| "reward_std": 0.5446355119347572, | |
| "rewards/cosine_scaled_reward": -0.11947017908096313, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 401 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1302.8542175292969, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 9.144934630730456, | |
| "kl": 0.51953125, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.1489, | |
| "reward": 0.0001004636287689209, | |
| "reward_std": 0.5631029531359673, | |
| "rewards/cosine_scaled_reward": -0.28119976818561554, | |
| "rewards/format_reward": 0.5625000223517418, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1430.5833740234375, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 1.9477748820622875, | |
| "kl": 0.3515625, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.2408, | |
| "reward": -0.06413780152797699, | |
| "reward_std": 0.7934899777173996, | |
| "rewards/cosine_scaled_reward": -0.2195689007639885, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 403 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1262.8333587646484, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 3.330875199108497, | |
| "kl": 0.19140625, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.1344, | |
| "reward": 0.1329102972522378, | |
| "reward_std": 0.5511343032121658, | |
| "rewards/cosine_scaled_reward": -0.25646152906119823, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1030.2916870117188, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 7.2006501333137996, | |
| "kl": 0.147216796875, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.1614, | |
| "reward": 0.41257511638104916, | |
| "reward_std": 0.4603617787361145, | |
| "rewards/cosine_scaled_reward": -0.14787913113832474, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1085.1666870117188, | |
| "epoch": 0.464, | |
| "grad_norm": 1.7990135612572722, | |
| "kl": 0.25146484375, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.1199, | |
| "reward": -0.0262349434196949, | |
| "reward_std": 0.4924147129058838, | |
| "rewards/cosine_scaled_reward": -0.2839508093893528, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 953.4375, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 2.205384781012483, | |
| "kl": 0.16357421875, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0901, | |
| "reward": 0.5169772207736969, | |
| "reward_std": 0.28926569409668446, | |
| "rewards/cosine_scaled_reward": -0.0748447310179472, | |
| "rewards/format_reward": 0.6666666828095913, | |
| "step": 407 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 914.9583587646484, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 2.497956913876472, | |
| "kl": 0.27496337890625, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.1155, | |
| "reward": 0.3262156348209828, | |
| "reward_std": 0.6255160942673683, | |
| "rewards/cosine_scaled_reward": -0.13897553086280823, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1040.8333435058594, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 8.69885706641019, | |
| "kl": 0.2950439453125, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.1446, | |
| "reward": 0.45548180863261223, | |
| "reward_std": 0.683892697095871, | |
| "rewards/cosine_scaled_reward": -0.1472591133788228, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 409 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1113.8750305175781, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 2.827095364325863, | |
| "kl": 0.17364501953125, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.1365, | |
| "reward": -0.055647075176239014, | |
| "reward_std": 0.5701718181371689, | |
| "rewards/cosine_scaled_reward": -0.3299068883061409, | |
| "rewards/format_reward": 0.6041666939854622, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1180.7500305175781, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 6.312691045251246, | |
| "kl": 0.2171630859375, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.26, | |
| "reward": -0.027378916274756193, | |
| "reward_std": 0.5135050415992737, | |
| "rewards/cosine_scaled_reward": -0.33660613000392914, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 411 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 936.0625152587891, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 8.466457070247934, | |
| "kl": 0.207763671875, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.2152, | |
| "reward": 0.2903781367931515, | |
| "reward_std": 0.6151079386472702, | |
| "rewards/cosine_scaled_reward": -0.24022759683430195, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1127.9375610351562, | |
| "epoch": 0.472, | |
| "grad_norm": 7.005816452720984, | |
| "kl": 0.23388671875, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.2359, | |
| "reward": -0.05256163072772324, | |
| "reward_std": 0.5086416229605675, | |
| "rewards/cosine_scaled_reward": -0.30753082782030106, | |
| "rewards/format_reward": 0.5625000223517418, | |
| "step": 413 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1254.2292175292969, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 3.1930529627345146, | |
| "kl": 0.30908203125, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.091, | |
| "reward": 0.27630291134119034, | |
| "reward_std": 0.601336345076561, | |
| "rewards/cosine_scaled_reward": -0.12226520664989948, | |
| "rewards/format_reward": 0.520833358168602, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1198.7083740234375, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 1.9203274121236615, | |
| "kl": 0.27783203125, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.1839, | |
| "reward": 0.15045135095715523, | |
| "reward_std": 0.8359555453062057, | |
| "rewards/cosine_scaled_reward": -0.21644099615514278, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1303.2708435058594, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 5.219130595783076, | |
| "kl": 0.288330078125, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.2384, | |
| "reward": 0.06198018416762352, | |
| "reward_std": 0.7209452688694, | |
| "rewards/cosine_scaled_reward": -0.2502599246799946, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1104.3958740234375, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 343.4311543801194, | |
| "kl": 3.455078125, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.3667, | |
| "reward": 0.25671100057661533, | |
| "reward_std": 0.5841851308941841, | |
| "rewards/cosine_scaled_reward": -0.19456118065863848, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 417 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1071.9375305175781, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 3.5739561302927703, | |
| "kl": 0.18438720703125, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0318, | |
| "reward": 0.18263494968414307, | |
| "reward_std": 0.688008576631546, | |
| "rewards/cosine_scaled_reward": -0.25243253633379936, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1059.2500457763672, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 42.82614000306872, | |
| "kl": 14.88720703125, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.182, | |
| "reward": 0.10820261249318719, | |
| "reward_std": 0.658612459897995, | |
| "rewards/cosine_scaled_reward": -0.24798204004764557, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 419 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1066.7917022705078, | |
| "epoch": 0.48, | |
| "grad_norm": 7.563689623131912, | |
| "kl": 0.54296875, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.1366, | |
| "reward": 0.24830662203021348, | |
| "reward_std": 0.6641267538070679, | |
| "rewards/cosine_scaled_reward": -0.19876337423920631, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1140.8958740234375, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 5.102712434876203, | |
| "kl": 0.455322265625, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.238, | |
| "reward": 0.22175164567306638, | |
| "reward_std": 0.48806294053792953, | |
| "rewards/cosine_scaled_reward": -0.19120752811431885, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 421 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1181.3333587646484, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 11.187728016893017, | |
| "kl": 0.7470703125, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.2428, | |
| "reward": 0.016264647245407104, | |
| "reward_std": 0.7520715892314911, | |
| "rewards/cosine_scaled_reward": -0.27311767637729645, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1151.5625305175781, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 36.484656907353894, | |
| "kl": 1.12109375, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.225, | |
| "reward": 0.166658578440547, | |
| "reward_std": 0.5137820392847061, | |
| "rewards/cosine_scaled_reward": -0.20833738893270493, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 423 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1222.4167022705078, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 5.314021913144739, | |
| "kl": 0.468994140625, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0791, | |
| "reward": -0.053052062867209315, | |
| "reward_std": 0.5032695159316063, | |
| "rewards/cosine_scaled_reward": -0.349442720413208, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1303.0417022705078, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 15.439357915372184, | |
| "kl": 0.76171875, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.1567, | |
| "reward": 0.06288054899778217, | |
| "reward_std": 0.8221424967050552, | |
| "rewards/cosine_scaled_reward": -0.24980972707271576, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1363.2292175292969, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 16.190560753791, | |
| "kl": 0.64306640625, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.2057, | |
| "reward": 0.0070614293217659, | |
| "reward_std": 0.8801029026508331, | |
| "rewards/cosine_scaled_reward": -0.18396929651498795, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1062.6250457763672, | |
| "epoch": 0.488, | |
| "grad_norm": 5.208018104302035, | |
| "kl": 0.289306640625, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.2392, | |
| "reward": 0.1040960568934679, | |
| "reward_std": 0.7021225243806839, | |
| "rewards/cosine_scaled_reward": -0.21878531202673912, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 427 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1168.6250457763672, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 1.7936384513629215, | |
| "kl": 0.194366455078125, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.1231, | |
| "reward": 0.1094297245144844, | |
| "reward_std": 0.5426923930644989, | |
| "rewards/cosine_scaled_reward": -0.247368473559618, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1089.6875305175781, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 3.242866515089598, | |
| "kl": 0.18408203125, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.1081, | |
| "reward": 0.4839252680540085, | |
| "reward_std": 0.5947171896696091, | |
| "rewards/cosine_scaled_reward": -0.03928736597299576, | |
| "rewards/format_reward": 0.5625000260770321, | |
| "step": 429 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1313.5833587646484, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 1.478054069262014, | |
| "kl": 0.21612548828125, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.1273, | |
| "reward": 0.15572084113955498, | |
| "reward_std": 0.5618212074041367, | |
| "rewards/cosine_scaled_reward": -0.18255625164601952, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 975.4791870117188, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 3.541465585724065, | |
| "kl": 0.1605224609375, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.1148, | |
| "reward": 0.420807933434844, | |
| "reward_std": 0.890654593706131, | |
| "rewards/cosine_scaled_reward": -0.11251270584762096, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 431 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1181.062515258789, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 3.350973639300781, | |
| "kl": 0.155517578125, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.1027, | |
| "reward": 0.032605723943561316, | |
| "reward_std": 0.5731803774833679, | |
| "rewards/cosine_scaled_reward": -0.2753637991845608, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1207.0417175292969, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 4.990349151202906, | |
| "kl": 0.185546875, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.1393, | |
| "reward": 0.08886189805343747, | |
| "reward_std": 0.4594448246061802, | |
| "rewards/cosine_scaled_reward": -0.23681906727142632, | |
| "rewards/format_reward": 0.5625000298023224, | |
| "step": 433 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 859.1250457763672, | |
| "epoch": 0.496, | |
| "grad_norm": 1.9877951359345267, | |
| "kl": 0.17950439453125, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0512, | |
| "reward": 1.2721150815486908, | |
| "reward_std": 0.6770742386579514, | |
| "rewards/cosine_scaled_reward": 0.20897419564425945, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1266.375015258789, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 1.8972601097369153, | |
| "kl": 0.2255859375, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.1234, | |
| "reward": 0.10697830189019442, | |
| "reward_std": 0.531020175665617, | |
| "rewards/cosine_scaled_reward": -0.22776086255908012, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1315.5625305175781, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 2.490164316553904, | |
| "kl": 0.2635498046875, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.1196, | |
| "reward": -0.10972822457551956, | |
| "reward_std": 0.5596715956926346, | |
| "rewards/cosine_scaled_reward": -0.2840307876467705, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1060.5208587646484, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 1.9387158266765225, | |
| "kl": 0.294189453125, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0174, | |
| "reward": 0.52107123285532, | |
| "reward_std": 0.5726887807250023, | |
| "rewards/cosine_scaled_reward": -0.05196441989392042, | |
| "rewards/format_reward": 0.6250000298023224, | |
| "step": 437 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 921.3125457763672, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 8.654811309244227, | |
| "kl": 0.2535400390625, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.1354, | |
| "reward": 0.20009983237832785, | |
| "reward_std": 0.6868909299373627, | |
| "rewards/cosine_scaled_reward": -0.20203341665910557, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1083.0833740234375, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 5.3889889905872375, | |
| "kl": 0.40216064453125, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.2461, | |
| "reward": 0.11843711510300636, | |
| "reward_std": 0.5985070914030075, | |
| "rewards/cosine_scaled_reward": -0.2636981066316366, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 439 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1088.8750305175781, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 4.149099334589977, | |
| "kl": 0.30615234375, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.033, | |
| "reward": 0.11394692957401276, | |
| "reward_std": 0.6579174622893333, | |
| "rewards/cosine_scaled_reward": -0.24510987009853125, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 984.3125610351562, | |
| "epoch": 0.504, | |
| "grad_norm": 16.782102815445953, | |
| "kl": 0.367919921875, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0977, | |
| "reward": 0.3985663428902626, | |
| "reward_std": 0.42315196245908737, | |
| "rewards/cosine_scaled_reward": -0.134050190448761, | |
| "rewards/format_reward": 0.6666666828095913, | |
| "step": 441 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 988.4583587646484, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 4.106390753028162, | |
| "kl": 0.31378173828125, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0056, | |
| "reward": 0.13055693171918392, | |
| "reward_std": 0.48535653203725815, | |
| "rewards/cosine_scaled_reward": -0.27847154438495636, | |
| "rewards/format_reward": 0.6875000223517418, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1004.3750152587891, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 8.08171445493757, | |
| "kl": 0.16259765625, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.2301, | |
| "reward": 0.30977149307727814, | |
| "reward_std": 0.6895428746938705, | |
| "rewards/cosine_scaled_reward": -0.18886426091194153, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 443 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1087.9375305175781, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 3.849891000062917, | |
| "kl": 0.1669921875, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0975, | |
| "reward": 0.4580417312681675, | |
| "reward_std": 0.640699241310358, | |
| "rewards/cosine_scaled_reward": -0.14597914181649685, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1224.2500457763672, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 23.59411548899569, | |
| "kl": 0.82958984375, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.2131, | |
| "reward": 0.051861570216715336, | |
| "reward_std": 0.6278680041432381, | |
| "rewards/cosine_scaled_reward": -0.2553192190825939, | |
| "rewards/format_reward": 0.5625, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1067.9791870117188, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 3.656936199785778, | |
| "kl": 0.24365234375, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0234, | |
| "reward": 0.36385649256408215, | |
| "reward_std": 0.7834623008966446, | |
| "rewards/cosine_scaled_reward": -0.1722384188324213, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1174.7083740234375, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 2.1759216078948036, | |
| "kl": 0.3828125, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.2194, | |
| "reward": 0.23982627410441637, | |
| "reward_std": 0.5332969650626183, | |
| "rewards/cosine_scaled_reward": -0.21342020854353905, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 447 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1180.2083740234375, | |
| "epoch": 0.512, | |
| "grad_norm": 4.2366039265569135, | |
| "kl": 0.31494140625, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.1963, | |
| "reward": 0.3762773834168911, | |
| "reward_std": 0.6801744475960732, | |
| "rewards/cosine_scaled_reward": -0.14519466273486614, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 994.3958740234375, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 4.146583173336839, | |
| "kl": 0.148681640625, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.1922, | |
| "reward": 0.36078188568353653, | |
| "reward_std": 0.737194113433361, | |
| "rewards/cosine_scaled_reward": -0.15294241392984986, | |
| "rewards/format_reward": 0.6666667014360428, | |
| "step": 449 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 851.8958587646484, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 51.8228987068238, | |
| "kl": 0.534942626953125, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.1269, | |
| "reward": 0.5865043960511684, | |
| "reward_std": 0.4706997238099575, | |
| "rewards/cosine_scaled_reward": -0.10258114710450172, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1079.7916870117188, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 6.392599999015184, | |
| "kl": 0.2735595703125, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.2626, | |
| "reward": 0.22280075028538704, | |
| "reward_std": 0.6088056340813637, | |
| "rewards/cosine_scaled_reward": -0.18026629835367203, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 451 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1046.8542022705078, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 8.715599338320725, | |
| "kl": 0.152099609375, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.2189, | |
| "reward": -0.0023173224180936813, | |
| "reward_std": 0.5100973732769489, | |
| "rewards/cosine_scaled_reward": -0.3136586770415306, | |
| "rewards/format_reward": 0.6250000298023224, | |
| "step": 452 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1271.9375305175781, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 2.4908553038859917, | |
| "kl": 0.40869140625, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.1805, | |
| "reward": 0.027155719697475433, | |
| "reward_std": 0.5863115191459656, | |
| "rewards/cosine_scaled_reward": -0.23642215505242348, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 453 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1104.6667175292969, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 42.815024473876115, | |
| "kl": 2.04296875, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.039, | |
| "reward": 0.2878073714673519, | |
| "reward_std": 0.6589629650115967, | |
| "rewards/cosine_scaled_reward": -0.13734631799161434, | |
| "rewards/format_reward": 0.5625000223517418, | |
| "step": 454 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1183.250015258789, | |
| "epoch": 0.52, | |
| "grad_norm": 2.6108705721314824, | |
| "kl": 0.306640625, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.1199, | |
| "reward": 0.3790533752180636, | |
| "reward_std": 0.4862861856818199, | |
| "rewards/cosine_scaled_reward": -0.13338997215032578, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1168.4375305175781, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 25.701940158931713, | |
| "kl": 0.477294921875, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.1681, | |
| "reward": 0.2994745699688792, | |
| "reward_std": 0.7066301554441452, | |
| "rewards/cosine_scaled_reward": -0.15234605269506574, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 456 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1007.4167022705078, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 8.570796851314613, | |
| "kl": 0.186279296875, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.3132, | |
| "reward": 0.400404367595911, | |
| "reward_std": 0.5747000873088837, | |
| "rewards/cosine_scaled_reward": -0.15396450087428093, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 457 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1272.8958435058594, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 4.345232068890798, | |
| "kl": 0.48974609375, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.2715, | |
| "reward": 0.2525772713124752, | |
| "reward_std": 0.8047986179590225, | |
| "rewards/cosine_scaled_reward": -0.12371136248111725, | |
| "rewards/format_reward": 0.5000000298023224, | |
| "step": 458 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1063.625015258789, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 5.1821320020363615, | |
| "kl": 0.15771484375, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.1275, | |
| "reward": 0.02093285135924816, | |
| "reward_std": 0.42146630585193634, | |
| "rewards/cosine_scaled_reward": -0.3124502506107092, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 459 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1044.2708435058594, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 3.3010395921201514, | |
| "kl": 0.267333984375, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.1046, | |
| "reward": 0.2678923445455439, | |
| "reward_std": 0.896328404545784, | |
| "rewards/cosine_scaled_reward": -0.12647049874067307, | |
| "rewards/format_reward": 0.5208333656191826, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1053.4583587646484, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 1.478075619341315, | |
| "kl": 0.21600341796875, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0555, | |
| "reward": 0.3391416594386101, | |
| "reward_std": 0.9088789522647858, | |
| "rewards/cosine_scaled_reward": -0.20542917400598526, | |
| "rewards/format_reward": 0.7500000223517418, | |
| "step": 461 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1103.2083740234375, | |
| "epoch": 0.528, | |
| "grad_norm": 9530.342121672113, | |
| "kl": 28.46978759765625, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 1.3293, | |
| "reward": 0.14297988126054406, | |
| "reward_std": 0.5064843520522118, | |
| "rewards/cosine_scaled_reward": -0.25142673472873867, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 462 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1043.3333587646484, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 4.739243744092973, | |
| "kl": 0.39892578125, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.2493, | |
| "reward": 0.6755956448614597, | |
| "reward_std": 0.4871959462761879, | |
| "rewards/cosine_scaled_reward": 0.025297801941633224, | |
| "rewards/format_reward": 0.6250000298023224, | |
| "step": 463 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 992.2916717529297, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 122.21833055898026, | |
| "kl": 1.33843994140625, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.2572, | |
| "reward": 0.29958341596648097, | |
| "reward_std": 0.8296171501278877, | |
| "rewards/cosine_scaled_reward": -0.20437496528029442, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1240.5417175292969, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 6.845067293294205, | |
| "kl": 0.55908203125, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.1986, | |
| "reward": 0.072305912617594, | |
| "reward_std": 0.4831778481602669, | |
| "rewards/cosine_scaled_reward": -0.2138470560312271, | |
| "rewards/format_reward": 0.5000000186264515, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1192.9167175292969, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 9.93163371597492, | |
| "kl": 0.49163818359375, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0928, | |
| "reward": 0.04001780319958925, | |
| "reward_std": 0.44342009350657463, | |
| "rewards/cosine_scaled_reward": -0.28207441698759794, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 466 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1341.1667175292969, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 19.835786495839272, | |
| "kl": 0.8251953125, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.2635, | |
| "reward": 0.1306269969791174, | |
| "reward_std": 0.6591696962714195, | |
| "rewards/cosine_scaled_reward": -0.21593650616705418, | |
| "rewards/format_reward": 0.5625000298023224, | |
| "step": 467 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1138.7708740234375, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 13.935934940776233, | |
| "kl": 0.576904296875, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0976, | |
| "reward": 0.37931894324719906, | |
| "reward_std": 0.5462356135249138, | |
| "rewards/cosine_scaled_reward": -0.10200719349086285, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 468 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1028.6041870117188, | |
| "epoch": 0.536, | |
| "grad_norm": 2.6967193006473216, | |
| "kl": 0.26171875, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.0529, | |
| "reward": 0.40807172656059265, | |
| "reward_std": 0.6494475156068802, | |
| "rewards/cosine_scaled_reward": -0.13971414044499397, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 469 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 848.5833587646484, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 1.7855628531087904, | |
| "kl": 0.1328125, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0217, | |
| "reward": 0.6918718162924051, | |
| "reward_std": 0.5211210399866104, | |
| "rewards/cosine_scaled_reward": -0.0811474658548832, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1070.7500457763672, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 6.038242137859596, | |
| "kl": 0.169921875, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.2149, | |
| "reward": 0.06172482669353485, | |
| "reward_std": 0.5211478099226952, | |
| "rewards/cosine_scaled_reward": -0.29205426201224327, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 471 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 842.7083435058594, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 12.109988243714355, | |
| "kl": 0.2894287109375, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": -0.0496, | |
| "reward": 0.46792223304510117, | |
| "reward_std": 0.54752978682518, | |
| "rewards/cosine_scaled_reward": -0.17228887975215912, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 472 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1181.1875305175781, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 6.409193674543087, | |
| "kl": 0.3604736328125, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0613, | |
| "reward": -0.049041745252907276, | |
| "reward_std": 0.5112807080149651, | |
| "rewards/cosine_scaled_reward": -0.2641042061150074, | |
| "rewards/format_reward": 0.4791666865348816, | |
| "step": 473 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 990.3750305175781, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 9.264374683069418, | |
| "kl": 0.11328125, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.1628, | |
| "reward": 0.32282854616642, | |
| "reward_std": 0.7814144194126129, | |
| "rewards/cosine_scaled_reward": -0.20316907577216625, | |
| "rewards/format_reward": 0.7291667014360428, | |
| "step": 474 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 962.4792022705078, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 4.242696196218054, | |
| "kl": 0.12603759765625, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.1483, | |
| "reward": 0.08424473810009658, | |
| "reward_std": 0.48827143758535385, | |
| "rewards/cosine_scaled_reward": -0.28079431876540184, | |
| "rewards/format_reward": 0.6458333544433117, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1048.8958587646484, | |
| "epoch": 0.544, | |
| "grad_norm": 18.108937894089827, | |
| "kl": 0.4520263671875, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.1426, | |
| "reward": 0.3813807927072048, | |
| "reward_std": 0.6394810080528259, | |
| "rewards/cosine_scaled_reward": -0.05930961295962334, | |
| "rewards/format_reward": 0.5000000223517418, | |
| "step": 476 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 723.4791870117188, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 5.68071346914076, | |
| "kl": 0.156005859375, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0836, | |
| "reward": 0.7284884303808212, | |
| "reward_std": 0.6032212525606155, | |
| "rewards/cosine_scaled_reward": -0.08367248624563217, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 477 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1013.3750305175781, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 4.283725447191352, | |
| "kl": 0.0955657958984375, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.1397, | |
| "reward": 0.49316432885825634, | |
| "reward_std": 0.45135799795389175, | |
| "rewards/cosine_scaled_reward": -0.12841782718896866, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 478 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 998.3125, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 1.2914537281090146, | |
| "kl": 0.15277099609375, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0674, | |
| "reward": 0.31127920374274254, | |
| "reward_std": 0.6323697119951248, | |
| "rewards/cosine_scaled_reward": -0.16727706603705883, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 479 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 984.8750152587891, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 1.5465556570908303, | |
| "kl": 0.077606201171875, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0463, | |
| "reward": 0.5994082670658827, | |
| "reward_std": 0.37920307368040085, | |
| "rewards/cosine_scaled_reward": -0.1169625474140048, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1145.3541870117188, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 2.3612933288431535, | |
| "kl": 0.150390625, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0852, | |
| "reward": 0.18339010886847973, | |
| "reward_std": 0.6312093585729599, | |
| "rewards/cosine_scaled_reward": -0.1999716181308031, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 481 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1099.2291870117188, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 3.3478046762143303, | |
| "kl": 0.1209716796875, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.1351, | |
| "reward": 0.4303822033107281, | |
| "reward_std": 0.5441673323512077, | |
| "rewards/cosine_scaled_reward": -0.10772557370364666, | |
| "rewards/format_reward": 0.6458333488553762, | |
| "step": 482 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 972.8958587646484, | |
| "epoch": 0.552, | |
| "grad_norm": 2.070698520552659, | |
| "kl": 0.1766357421875, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.1898, | |
| "reward": 0.3021550700068474, | |
| "reward_std": 0.6595650911331177, | |
| "rewards/cosine_scaled_reward": -0.244755819439888, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 483 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 966.4375, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 7.4718244138049785, | |
| "kl": 0.11669921875, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.1535, | |
| "reward": 0.5982861579395831, | |
| "reward_std": 0.72054024040699, | |
| "rewards/cosine_scaled_reward": -0.034190285950899124, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 484 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 850.2292022705078, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 2.6747580946696172, | |
| "kl": 0.171142578125, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0845, | |
| "reward": 0.440962532768026, | |
| "reward_std": 0.4621984176337719, | |
| "rewards/cosine_scaled_reward": -0.1545187532901764, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1098.4791870117188, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 5.009044167350138, | |
| "kl": 0.1492919921875, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.1657, | |
| "reward": 0.31744778295978904, | |
| "reward_std": 0.8680954575538635, | |
| "rewards/cosine_scaled_reward": -0.15377611527219415, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 486 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1250.9167022705078, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 2.168704280565103, | |
| "kl": 0.3330078125, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.1075, | |
| "reward": 0.09373046457767487, | |
| "reward_std": 0.7844668254256248, | |
| "rewards/cosine_scaled_reward": -0.18230143561959267, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 487 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1182.0417175292969, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 68.49302633471272, | |
| "kl": 1.04931640625, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.2607, | |
| "reward": 0.0013678865507245064, | |
| "reward_std": 0.5483251512050629, | |
| "rewards/cosine_scaled_reward": -0.2805660478770733, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 488 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 898.8958740234375, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 3.3267908174810983, | |
| "kl": 0.22412109375, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.1155, | |
| "reward": 0.9380166502669454, | |
| "reward_std": 0.38279012218117714, | |
| "rewards/cosine_scaled_reward": 0.10442498326301575, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 489 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1022.5417175292969, | |
| "epoch": 0.56, | |
| "grad_norm": 1.2964210055945644, | |
| "kl": 0.142425537109375, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.1375, | |
| "reward": 0.1352614858187735, | |
| "reward_std": 0.5779989808797836, | |
| "rewards/cosine_scaled_reward": -0.29695259779691696, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1071.2708740234375, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 5.198263303721915, | |
| "kl": 0.270751953125, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.1398, | |
| "reward": 0.360213914886117, | |
| "reward_std": 0.5864584296941757, | |
| "rewards/cosine_scaled_reward": -0.12197639048099518, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 491 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1146.0208740234375, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 25.32884427185481, | |
| "kl": 0.860107421875, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.279, | |
| "reward": 0.3603329248726368, | |
| "reward_std": 0.4203804060816765, | |
| "rewards/cosine_scaled_reward": -0.11150021478533745, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 492 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1079.6250457763672, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 5.39013483275012, | |
| "kl": 0.4027099609375, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.2018, | |
| "reward": 0.24866360798478127, | |
| "reward_std": 0.6557547599077225, | |
| "rewards/cosine_scaled_reward": -0.21941821463406086, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 493 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.0417022705078, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 39.118419014119006, | |
| "kl": 1.0677490234375, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.2737, | |
| "reward": 0.027191074565052986, | |
| "reward_std": 0.4351058676838875, | |
| "rewards/cosine_scaled_reward": -0.3301544785499573, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 494 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1194.8958740234375, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 5.938670898030579, | |
| "kl": 0.630859375, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.2055, | |
| "reward": 0.374758190009743, | |
| "reward_std": 0.6815578863024712, | |
| "rewards/cosine_scaled_reward": -0.09387091733515263, | |
| "rewards/format_reward": 0.5625000298023224, | |
| "step": 495 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1120.3542022705078, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 16.235625518016562, | |
| "kl": 0.50927734375, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.297, | |
| "reward": 0.40772235160693526, | |
| "reward_std": 0.8966069668531418, | |
| "rewards/cosine_scaled_reward": -0.09822217002511024, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 496 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 827.4791870117188, | |
| "epoch": 0.568, | |
| "grad_norm": 6.892460021170429, | |
| "kl": 6.8536376953125, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.2026, | |
| "reward": 0.8302161321043968, | |
| "reward_std": 0.560060553252697, | |
| "rewards/cosine_scaled_reward": 0.06094140186905861, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 497 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1077.1250305175781, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 10.255398655040155, | |
| "kl": 0.54931640625, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.2299, | |
| "reward": 0.033572545275092125, | |
| "reward_std": 0.4632219597697258, | |
| "rewards/cosine_scaled_reward": -0.30613040924072266, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 498 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1336.7083435058594, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 27.33692044026225, | |
| "kl": 0.934326171875, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.1642, | |
| "reward": -0.13300850987434387, | |
| "reward_std": 0.6832303777337074, | |
| "rewards/cosine_scaled_reward": -0.28525424748659134, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 499 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.9375305175781, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 2.4162040111238334, | |
| "kl": 0.2301025390625, | |
| "learning_rate": 1e-07, | |
| "loss": 0.1131, | |
| "reward": 0.13043908029794693, | |
| "reward_std": 0.5788910314440727, | |
| "rewards/cosine_scaled_reward": -0.29936380684375763, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.7532739232839085, | |
| "train_runtime": 13678.504, | |
| "train_samples_per_second": 1.755, | |
| "train_steps_per_second": 0.037 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |