| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.8571428571428571, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2848.666707356771, | |
| "epoch": 0.0017142857142857142, | |
| "grad_norm": 0.14995548129081726, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": -0.0083, | |
| "reward": 0.2294620672861735, | |
| "reward_std": 0.5262463341156641, | |
| "rewards/cosine_scaled_reward": -0.07971340417861938, | |
| "rewards/format_reward": 0.38888888930281, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2825.7084147135415, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.20299114286899567, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": -0.0275, | |
| "reward": -0.0334977979461352, | |
| "reward_std": 0.5172864546378454, | |
| "rewards/cosine_scaled_reward": -0.2250822459657987, | |
| "rewards/format_reward": 0.4166666741172473, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2863.8472900390625, | |
| "epoch": 0.005142857142857143, | |
| "grad_norm": 0.16031597554683685, | |
| "kl": 3.977616628011068e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0267, | |
| "reward": 0.12272219297786553, | |
| "reward_std": 0.6255507320165634, | |
| "rewards/cosine_scaled_reward": -0.11919447189817826, | |
| "rewards/format_reward": 0.3611111218730609, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2967.625040690104, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.19282503426074982, | |
| "kl": 3.894170125325521e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.1045, | |
| "reward": 0.22163815796375275, | |
| "reward_std": 0.8553579648335775, | |
| "rewards/cosine_scaled_reward": -0.06279203792413075, | |
| "rewards/format_reward": 0.3472222338120143, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2510.0139973958335, | |
| "epoch": 0.008571428571428572, | |
| "grad_norm": 0.1911565065383911, | |
| "kl": 2.485513687133789e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0869, | |
| "reward": 0.4895862266421318, | |
| "reward_std": 0.6777948240439097, | |
| "rewards/cosine_scaled_reward": -0.012151338160037994, | |
| "rewards/format_reward": 0.513888900478681, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3151.277872721354, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.16237948834896088, | |
| "kl": 3.830591837565104e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0522, | |
| "reward": 0.1946993718544642, | |
| "reward_std": 0.7613486846288046, | |
| "rewards/cosine_scaled_reward": -0.06237253795067469, | |
| "rewards/format_reward": 0.3194444527228673, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2076.7639770507812, | |
| "epoch": 0.012, | |
| "grad_norm": 0.226565882563591, | |
| "kl": 3.1779209772745766e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0759, | |
| "reward": 0.4943330654253562, | |
| "reward_std": 0.7168246308962504, | |
| "rewards/cosine_scaled_reward": -0.0722779215623935, | |
| "rewards/format_reward": 0.6388889054457346, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2729.4444986979165, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.1481897234916687, | |
| "kl": 2.658367156982422e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0123, | |
| "reward": 0.7092214872439703, | |
| "reward_std": 0.9769582649072012, | |
| "rewards/cosine_scaled_reward": 0.07683294266462326, | |
| "rewards/format_reward": 0.5555555621782938, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2510.972239176432, | |
| "epoch": 0.015428571428571429, | |
| "grad_norm": 0.2302147001028061, | |
| "kl": 2.4378299713134766e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0496, | |
| "reward": 0.5456009954214096, | |
| "reward_std": 0.727370043595632, | |
| "rewards/cosine_scaled_reward": 0.04363382316660136, | |
| "rewards/format_reward": 0.4583333482344945, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2919.4305623372397, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.16766680777072906, | |
| "kl": 3.277262051900228e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0029, | |
| "reward": 0.16271250943342844, | |
| "reward_std": 0.8108218063910803, | |
| "rewards/cosine_scaled_reward": -0.09225484977165858, | |
| "rewards/format_reward": 0.3472222362955411, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2648.4445190429688, | |
| "epoch": 0.018857142857142857, | |
| "grad_norm": 0.21706043183803558, | |
| "kl": 3.404418627421061e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0896, | |
| "reward": 0.2999572505553563, | |
| "reward_std": 0.7597030699253082, | |
| "rewards/cosine_scaled_reward": -0.04446581875284513, | |
| "rewards/format_reward": 0.3888889054457347, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2892.250040690104, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.27907419204711914, | |
| "kl": 3.864367802937826e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.1194, | |
| "reward": 0.05846052865187327, | |
| "reward_std": 0.7291679481665293, | |
| "rewards/cosine_scaled_reward": -0.17215862792606154, | |
| "rewards/format_reward": 0.4027777910232544, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2794.0278523763022, | |
| "epoch": 0.022285714285714287, | |
| "grad_norm": 0.20479245483875275, | |
| "kl": 3.020962079366048e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0366, | |
| "reward": 0.49337278803189594, | |
| "reward_std": 0.9299193223317465, | |
| "rewards/cosine_scaled_reward": -0.003313623368740082, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2823.3611857096353, | |
| "epoch": 0.024, | |
| "grad_norm": 0.15838919579982758, | |
| "kl": 2.7060508728027344e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0325, | |
| "reward": 0.2336851549334824, | |
| "reward_std": 0.48479830970366794, | |
| "rewards/cosine_scaled_reward": -0.0706574262852276, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3431.013956705729, | |
| "epoch": 0.025714285714285714, | |
| "grad_norm": 0.16635432839393616, | |
| "kl": 3.9696693420410156e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0345, | |
| "reward": -0.038486323008934654, | |
| "reward_std": 0.6927771319945654, | |
| "rewards/cosine_scaled_reward": -0.14424316041792432, | |
| "rewards/format_reward": 0.2500000024835269, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2358.152872721354, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.19597449898719788, | |
| "kl": 2.9385089874267578e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0524, | |
| "reward": 0.8643488318969806, | |
| "reward_std": 0.770653153459231, | |
| "rewards/cosine_scaled_reward": 0.11967439825336139, | |
| "rewards/format_reward": 0.6250000099341074, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2722.4167277018228, | |
| "epoch": 0.029142857142857144, | |
| "grad_norm": 0.17968367040157318, | |
| "kl": 3.1620264053344727e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0742, | |
| "reward": 0.43655472497145337, | |
| "reward_std": 0.4768330107132594, | |
| "rewards/cosine_scaled_reward": 0.03077736000219981, | |
| "rewards/format_reward": 0.3750000049670537, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3218.138956705729, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.18847692012786865, | |
| "kl": 3.443161646525065e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0345, | |
| "reward": 0.0637030154466629, | |
| "reward_std": 0.6453258246183395, | |
| "rewards/cosine_scaled_reward": -0.07925960669914882, | |
| "rewards/format_reward": 0.22222222636143366, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3169.2083740234375, | |
| "epoch": 0.03257142857142857, | |
| "grad_norm": 0.12977641820907593, | |
| "kl": 2.824266751607259e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0032, | |
| "reward": 0.20541929205258688, | |
| "reward_std": 0.8046327730019888, | |
| "rewards/cosine_scaled_reward": -0.070901474605004, | |
| "rewards/format_reward": 0.347222230086724, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3202.3334147135415, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.18071624636650085, | |
| "kl": 3.085533777872721e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0764, | |
| "reward": 0.13824394841988882, | |
| "reward_std": 0.9271457294623057, | |
| "rewards/cosine_scaled_reward": -0.09060026394824187, | |
| "rewards/format_reward": 0.3194444539646308, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2846.4861653645835, | |
| "epoch": 0.036, | |
| "grad_norm": 0.1691419929265976, | |
| "kl": 3.962218761444092e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0463, | |
| "reward": -0.10299787142624457, | |
| "reward_std": 0.5158225695292155, | |
| "rewards/cosine_scaled_reward": -0.20427671323219934, | |
| "rewards/format_reward": 0.3055555559694767, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3078.3472900390625, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.14622145891189575, | |
| "kl": 3.381570180257162e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0345, | |
| "reward": 0.04741051917274793, | |
| "reward_std": 0.6438981095949808, | |
| "rewards/cosine_scaled_reward": -0.1360169698794683, | |
| "rewards/format_reward": 0.3194444576899211, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3112.3750813802085, | |
| "epoch": 0.03942857142857143, | |
| "grad_norm": 0.15722624957561493, | |
| "kl": 3.0120213826497395e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.029, | |
| "reward": -0.05071499291807413, | |
| "reward_std": 0.6119177291790644, | |
| "rewards/cosine_scaled_reward": -0.18507971552511057, | |
| "rewards/format_reward": 0.3194444477558136, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2608.4722900390625, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.33484968543052673, | |
| "kl": 2.8118491172790527e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0786, | |
| "reward": 0.5405420685807864, | |
| "reward_std": 0.7321184525887171, | |
| "rewards/cosine_scaled_reward": 0.03415991769482692, | |
| "rewards/format_reward": 0.4722222288449605, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2733.4861653645835, | |
| "epoch": 0.04285714285714286, | |
| "grad_norm": 0.15931585431098938, | |
| "kl": 2.71722674369812e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0171, | |
| "reward": 0.2985700157781442, | |
| "reward_std": 0.6910065859556198, | |
| "rewards/cosine_scaled_reward": -0.07988164760172367, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3068.4862060546875, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.15254491567611694, | |
| "kl": 2.6553869247436523e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.095, | |
| "reward": 0.41129567722479504, | |
| "reward_std": 0.9707497109969457, | |
| "rewards/cosine_scaled_reward": -0.03046328170845906, | |
| "rewards/format_reward": 0.4722222338120143, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3151.7362060546875, | |
| "epoch": 0.046285714285714284, | |
| "grad_norm": 0.1549319624900818, | |
| "kl": 2.218782901763916e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0699, | |
| "reward": 0.223313440879186, | |
| "reward_std": 0.8699296017487844, | |
| "rewards/cosine_scaled_reward": -0.06889883669403692, | |
| "rewards/format_reward": 0.3611111293236415, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2370.8333943684897, | |
| "epoch": 0.048, | |
| "grad_norm": 0.21545660495758057, | |
| "kl": 1.852835218111674e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0335, | |
| "reward": 0.6544605791568756, | |
| "reward_std": 0.7478305300076803, | |
| "rewards/cosine_scaled_reward": 0.028619173914194107, | |
| "rewards/format_reward": 0.5972222288449606, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2897.666707356771, | |
| "epoch": 0.04971428571428571, | |
| "grad_norm": 0.16816435754299164, | |
| "kl": 2.7875105539957683e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.035, | |
| "reward": 0.3773072225352128, | |
| "reward_std": 0.7288130819797516, | |
| "rewards/cosine_scaled_reward": -0.026624162991841633, | |
| "rewards/format_reward": 0.4305555621782939, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2675.2362060546875, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.2294975072145462, | |
| "kl": 1.7409523328145344e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.114, | |
| "reward": 0.6032393351197243, | |
| "reward_std": 0.8560094932715098, | |
| "rewards/cosine_scaled_reward": 0.023841881503661472, | |
| "rewards/format_reward": 0.5555555721124014, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2669.5555826822915, | |
| "epoch": 0.053142857142857144, | |
| "grad_norm": 0.17238615453243256, | |
| "kl": 7.640570402145386e-06, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0224, | |
| "reward": 0.3138917237520218, | |
| "reward_std": 0.5515048305193583, | |
| "rewards/cosine_scaled_reward": -0.05138745748748382, | |
| "rewards/format_reward": 0.4166666766007741, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2914.0972696940103, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.19301587343215942, | |
| "kl": 1.8848106265068054e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.1332, | |
| "reward": 0.24654228488604227, | |
| "reward_std": 0.8124425162871679, | |
| "rewards/cosine_scaled_reward": -0.10589553664127986, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2381.402821858724, | |
| "epoch": 0.05657142857142857, | |
| "grad_norm": 0.18890738487243652, | |
| "kl": 2.9340386390686035e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0528, | |
| "reward": 0.5025079051653544, | |
| "reward_std": 0.7784387270609537, | |
| "rewards/cosine_scaled_reward": -0.019579386338591576, | |
| "rewards/format_reward": 0.5416666741172472, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2819.805623372396, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.19334346055984497, | |
| "kl": 2.561012903849284e-05, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0753, | |
| "reward": 0.0026500746607780457, | |
| "reward_std": 0.6724469214677811, | |
| "rewards/cosine_scaled_reward": -0.15839717785517374, | |
| "rewards/format_reward": 0.319444448997577, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3071.52783203125, | |
| "epoch": 0.06, | |
| "grad_norm": 0.15700003504753113, | |
| "kl": 3.983577092488607e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0924, | |
| "reward": -0.01628999039530754, | |
| "reward_std": 0.6857109169165293, | |
| "rewards/cosine_scaled_reward": -0.15397833163539568, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2596.888916015625, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.1650371253490448, | |
| "kl": 3.803769747416178e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": -0.0405, | |
| "reward": 0.0971544881661733, | |
| "reward_std": 0.5383333116769791, | |
| "rewards/cosine_scaled_reward": -0.13197831716388464, | |
| "rewards/format_reward": 0.3611111119389534, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3136.0000813802085, | |
| "epoch": 0.06342857142857143, | |
| "grad_norm": 0.14498025178909302, | |
| "kl": 2.9464562733968098e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0149, | |
| "reward": -0.22002460869650045, | |
| "reward_std": 0.48985352615515393, | |
| "rewards/cosine_scaled_reward": -0.21417897442976633, | |
| "rewards/format_reward": 0.20833333457509676, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3310.000040690104, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.1291505992412567, | |
| "kl": 8.823474248250325e-05, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": -0.0128, | |
| "reward": 0.04104452828566233, | |
| "reward_std": 0.45023731887340546, | |
| "rewards/cosine_scaled_reward": -0.11836662143468857, | |
| "rewards/format_reward": 0.2777777810891469, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3070.4861653645835, | |
| "epoch": 0.06685714285714285, | |
| "grad_norm": 0.2239781618118286, | |
| "kl": 0.00010270873705546062, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0439, | |
| "reward": 0.07889220615228017, | |
| "reward_std": 0.6337036391099294, | |
| "rewards/cosine_scaled_reward": -0.1272205668889607, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2646.916707356771, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.24816741049289703, | |
| "kl": 0.0002489471808075905, | |
| "learning_rate": 8e-07, | |
| "loss": 0.101, | |
| "reward": 0.4287034186224143, | |
| "reward_std": 0.6285349975029627, | |
| "rewards/cosine_scaled_reward": 0.026851719866196316, | |
| "rewards/format_reward": 0.3750000099341075, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2787.0972900390625, | |
| "epoch": 0.07028571428571428, | |
| "grad_norm": 0.14899501204490662, | |
| "kl": 5.305806795756022e-05, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0971, | |
| "reward": 0.39761913754045963, | |
| "reward_std": 0.6327133079369863, | |
| "rewards/cosine_scaled_reward": 0.004365116357803345, | |
| "rewards/format_reward": 0.3888889004786809, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2566.0694986979165, | |
| "epoch": 0.072, | |
| "grad_norm": 0.1521102786064148, | |
| "kl": 0.0004043777783711751, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.012, | |
| "reward": 0.3376003090913097, | |
| "reward_std": 0.7216030806303024, | |
| "rewards/cosine_scaled_reward": -0.08814430236816406, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2799.4861653645835, | |
| "epoch": 0.07371428571428572, | |
| "grad_norm": 0.22864435613155365, | |
| "kl": 8.469820022583008e-05, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0877, | |
| "reward": 0.5849024479587873, | |
| "reward_std": 0.7425422618786494, | |
| "rewards/cosine_scaled_reward": 0.070229001964132, | |
| "rewards/format_reward": 0.4444444527228673, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2659.8334147135415, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.13977661728858948, | |
| "kl": 0.00017054875691731772, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": -0.0104, | |
| "reward": 0.3781442828476429, | |
| "reward_std": 0.7416819036006927, | |
| "rewards/cosine_scaled_reward": -0.07481675532956918, | |
| "rewards/format_reward": 0.5277777947485447, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2976.6944783528647, | |
| "epoch": 0.07714285714285714, | |
| "grad_norm": 0.29603779315948486, | |
| "kl": 0.00020544727643330893, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0778, | |
| "reward": 0.1988565002878507, | |
| "reward_std": 0.7289342085520426, | |
| "rewards/cosine_scaled_reward": -0.06723843080302079, | |
| "rewards/format_reward": 0.3333333407839139, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2397.416748046875, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.2634449303150177, | |
| "kl": 0.0006041030089060465, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0768, | |
| "reward": 0.3765933312824927, | |
| "reward_std": 0.7712161093950272, | |
| "rewards/cosine_scaled_reward": -0.06170332680145899, | |
| "rewards/format_reward": 0.5000000099341074, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2784.3472900390625, | |
| "epoch": 0.08057142857142857, | |
| "grad_norm": 0.19174522161483765, | |
| "kl": 0.0004113192359606425, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0964, | |
| "reward": 0.4658193091551463, | |
| "reward_std": 0.8367001414299011, | |
| "rewards/cosine_scaled_reward": -0.010145904496312141, | |
| "rewards/format_reward": 0.486111128081878, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2657.3334147135415, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.163053959608078, | |
| "kl": 0.00047886309524377185, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0614, | |
| "reward": 0.7998232170939445, | |
| "reward_std": 0.8264668385187784, | |
| "rewards/cosine_scaled_reward": 0.12213384561861555, | |
| "rewards/format_reward": 0.5555555621782938, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2264.6112060546875, | |
| "epoch": 0.084, | |
| "grad_norm": 0.22662024199962616, | |
| "kl": 0.0013178189595540364, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0663, | |
| "reward": 0.7101199378569921, | |
| "reward_std": 0.6965262393156687, | |
| "rewards/cosine_scaled_reward": 0.056448845813671746, | |
| "rewards/format_reward": 0.5972222338120142, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2706.263956705729, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.20602725446224213, | |
| "kl": 0.0012766520182291667, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0675, | |
| "reward": 0.11585338972508907, | |
| "reward_std": 0.728886475165685, | |
| "rewards/cosine_scaled_reward": -0.1642955287049214, | |
| "rewards/format_reward": 0.4444444514811039, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2933.7084147135415, | |
| "epoch": 0.08742857142857142, | |
| "grad_norm": 0.14264345169067383, | |
| "kl": 0.00035913785298665363, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": -0.0258, | |
| "reward": 0.46854039778312045, | |
| "reward_std": 0.7167676687240601, | |
| "rewards/cosine_scaled_reward": 0.018992409110069275, | |
| "rewards/format_reward": 0.4305555696288745, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3102.1944783528647, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.15506410598754883, | |
| "kl": 0.0006569623947143555, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.017, | |
| "reward": 0.20700607324639955, | |
| "reward_std": 0.7135306199391683, | |
| "rewards/cosine_scaled_reward": -0.0631636418402195, | |
| "rewards/format_reward": 0.3333333383003871, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2958.680623372396, | |
| "epoch": 0.09085714285714286, | |
| "grad_norm": 0.13531740009784698, | |
| "kl": 0.00019252300262451172, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0824, | |
| "reward": 0.18692485615611076, | |
| "reward_std": 0.7432453433672587, | |
| "rewards/cosine_scaled_reward": -0.11487091658636928, | |
| "rewards/format_reward": 0.4166666803260644, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3312.3333740234375, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.14047981798648834, | |
| "kl": 0.0033926963806152344, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0183, | |
| "reward": -0.08021007105708122, | |
| "reward_std": 0.6435250441233317, | |
| "rewards/cosine_scaled_reward": -0.13038281351327896, | |
| "rewards/format_reward": 0.18055555721124014, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3318.9583740234375, | |
| "epoch": 0.09428571428571429, | |
| "grad_norm": 0.1470106989145279, | |
| "kl": 0.0006798108418782552, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0479, | |
| "reward": -0.0656120063116153, | |
| "reward_std": 0.797911008199056, | |
| "rewards/cosine_scaled_reward": -0.16475044501324496, | |
| "rewards/format_reward": 0.2638888930281003, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3246.250040690104, | |
| "epoch": 0.096, | |
| "grad_norm": 0.1471526026725769, | |
| "kl": 0.00042287508646647137, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.057, | |
| "reward": 0.19003195067246756, | |
| "reward_std": 0.7704959412415823, | |
| "rewards/cosine_scaled_reward": -0.050817357997099556, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3109.2361653645835, | |
| "epoch": 0.09771428571428571, | |
| "grad_norm": 0.13638579845428467, | |
| "kl": 0.0006163914998372396, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0088, | |
| "reward": 0.18183419605096182, | |
| "reward_std": 0.7120010356108347, | |
| "rewards/cosine_scaled_reward": -0.0618606669207414, | |
| "rewards/format_reward": 0.3055555609365304, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3067.5416666666665, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.1721331924200058, | |
| "kl": 0.002284367879231771, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0784, | |
| "reward": -0.04034102149307728, | |
| "reward_std": 0.6754575098554293, | |
| "rewards/cosine_scaled_reward": -0.14517051726579666, | |
| "rewards/format_reward": 0.2500000099341075, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3024.3472493489585, | |
| "epoch": 0.10114285714285715, | |
| "grad_norm": 0.15059597790241241, | |
| "kl": 0.002747615178426107, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.04, | |
| "reward": 0.6628293991088867, | |
| "reward_std": 0.7772391984860102, | |
| "rewards/cosine_scaled_reward": 0.10919245580832164, | |
| "rewards/format_reward": 0.4444444527228673, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3326.736124674479, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.11866168677806854, | |
| "kl": 0.00102996826171875, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0322, | |
| "reward": 0.03830151570339998, | |
| "reward_std": 0.6654073546330134, | |
| "rewards/cosine_scaled_reward": -0.06418257827560107, | |
| "rewards/format_reward": 0.16666666915019354, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3216.1111653645835, | |
| "epoch": 0.10457142857142857, | |
| "grad_norm": 0.21836848556995392, | |
| "kl": 0.0018286705017089844, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0537, | |
| "reward": 0.09106251100699107, | |
| "reward_std": 0.5555507987737656, | |
| "rewards/cosine_scaled_reward": -0.1419687569141388, | |
| "rewards/format_reward": 0.3750000124176343, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2644.4583740234375, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.1769046038389206, | |
| "kl": 0.0012696584065755208, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.058, | |
| "reward": 0.46535767273356515, | |
| "reward_std": 0.6260566016038259, | |
| "rewards/cosine_scaled_reward": 0.017401046430071194, | |
| "rewards/format_reward": 0.4305555621782939, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3411.888916015625, | |
| "epoch": 0.108, | |
| "grad_norm": 0.13592268526554108, | |
| "kl": 0.0009024937947591146, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0295, | |
| "reward": -0.11998325337966283, | |
| "reward_std": 0.7244327788551649, | |
| "rewards/cosine_scaled_reward": -0.15026939660310745, | |
| "rewards/format_reward": 0.180555559694767, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2811.0972900390625, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.16869740188121796, | |
| "kl": 0.001640637715657552, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": -0.0433, | |
| "reward": 0.07164862006902695, | |
| "reward_std": 0.50055563946565, | |
| "rewards/cosine_scaled_reward": -0.14473124345143637, | |
| "rewards/format_reward": 0.3611111144224803, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3108.4305826822915, | |
| "epoch": 0.11142857142857143, | |
| "grad_norm": 0.1962134838104248, | |
| "kl": 0.012165705362955729, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0612, | |
| "reward": 0.13354974488417307, | |
| "reward_std": 0.6879242360591888, | |
| "rewards/cosine_scaled_reward": -0.07905847206711769, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2837.6528523763022, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.14684489369392395, | |
| "kl": 0.0006155967712402344, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0465, | |
| "reward": 0.12391687432924907, | |
| "reward_std": 0.49408531685670215, | |
| "rewards/cosine_scaled_reward": -0.09776378174622853, | |
| "rewards/format_reward": 0.3194444452722867, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3272.3612060546875, | |
| "epoch": 0.11485714285714285, | |
| "grad_norm": 0.17381706833839417, | |
| "kl": 0.0022366841634114585, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.067, | |
| "reward": 0.1833379832096398, | |
| "reward_std": 0.7986834744612376, | |
| "rewards/cosine_scaled_reward": -0.08888657142718633, | |
| "rewards/format_reward": 0.3611111268401146, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2779.638956705729, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.14668098092079163, | |
| "kl": 0.0016632080078125, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": -0.0097, | |
| "reward": 0.1990161488453547, | |
| "reward_std": 0.6795898899435997, | |
| "rewards/cosine_scaled_reward": -0.07410304351651575, | |
| "rewards/format_reward": 0.3472222288449605, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3062.0833740234375, | |
| "epoch": 0.11828571428571429, | |
| "grad_norm": 0.14154009521007538, | |
| "kl": 0.0015592575073242188, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": -0.0087, | |
| "reward": 0.22120910634597143, | |
| "reward_std": 0.707958256204923, | |
| "rewards/cosine_scaled_reward": -0.056062132120132446, | |
| "rewards/format_reward": 0.3333333345750968, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2811.6389363606772, | |
| "epoch": 0.12, | |
| "grad_norm": 0.15003657341003418, | |
| "kl": 0.002394994099934896, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0697, | |
| "reward": 0.47579720616340637, | |
| "reward_std": 0.6386595567067465, | |
| "rewards/cosine_scaled_reward": 0.050398593147595726, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3040.8472900390625, | |
| "epoch": 0.12171428571428572, | |
| "grad_norm": 0.16743303835391998, | |
| "kl": 0.00225830078125, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0694, | |
| "reward": 0.5515925685564677, | |
| "reward_std": 0.9878981113433838, | |
| "rewards/cosine_scaled_reward": 0.08829631159702937, | |
| "rewards/format_reward": 0.3750000173846881, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2586.9861857096353, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.16907824575901031, | |
| "kl": 0.003780364990234375, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0584, | |
| "reward": 0.6482277313868204, | |
| "reward_std": 0.7509043018023173, | |
| "rewards/cosine_scaled_reward": 0.07411385203401248, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3030.4861653645835, | |
| "epoch": 0.12514285714285714, | |
| "grad_norm": 0.17010487616062164, | |
| "kl": 0.0023921330769856772, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0591, | |
| "reward": 0.3889308621486028, | |
| "reward_std": 0.6073996548851331, | |
| "rewards/cosine_scaled_reward": -0.013867907226085663, | |
| "rewards/format_reward": 0.4166666741172473, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2406.9722900390625, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.15091054141521454, | |
| "kl": 0.0030508041381835938, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": -0.01, | |
| "reward": 0.9654075627525648, | |
| "reward_std": 0.9749771058559418, | |
| "rewards/cosine_scaled_reward": 0.13548154414941868, | |
| "rewards/format_reward": 0.6944444552063942, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3084.5694580078125, | |
| "epoch": 0.12857142857142856, | |
| "grad_norm": 0.16688691079616547, | |
| "kl": 0.0031131108601888022, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0246, | |
| "reward": 0.20254600048065186, | |
| "reward_std": 0.6444505527615547, | |
| "rewards/cosine_scaled_reward": -0.05150478333234787, | |
| "rewards/format_reward": 0.3055555659035842, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2140.4583943684897, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.44517478346824646, | |
| "kl": 0.042032877604166664, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": -0.01, | |
| "reward": 0.6283551938831806, | |
| "reward_std": 0.6151509483655294, | |
| "rewards/cosine_scaled_reward": 0.022510942692557972, | |
| "rewards/format_reward": 0.5833333420256773, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3191.5694986979165, | |
| "epoch": 0.132, | |
| "grad_norm": 0.1384154111146927, | |
| "kl": 0.004534403483072917, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.035, | |
| "reward": 0.38374640171726543, | |
| "reward_std": 0.800934687256813, | |
| "rewards/cosine_scaled_reward": 0.03215096270044645, | |
| "rewards/format_reward": 0.3194444514811039, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3311.2083740234375, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.12451548129320145, | |
| "kl": 0.004205067952473958, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": -0.001, | |
| "reward": -0.22674076755841574, | |
| "reward_std": 0.5278006841739019, | |
| "rewards/cosine_scaled_reward": -0.18975927556554475, | |
| "rewards/format_reward": 0.1527777798473835, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2407.5139770507812, | |
| "epoch": 0.13542857142857143, | |
| "grad_norm": 0.4050913453102112, | |
| "kl": 0.045001983642578125, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0696, | |
| "reward": 0.4897211113323768, | |
| "reward_std": 0.701280802488327, | |
| "rewards/cosine_scaled_reward": -0.04680613180001577, | |
| "rewards/format_reward": 0.583333340783914, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2920.0834147135415, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.16643448173999786, | |
| "kl": 0.0032285054524739585, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0713, | |
| "reward": 0.20773327688220888, | |
| "reward_std": 0.6839462419350942, | |
| "rewards/cosine_scaled_reward": -0.09752224804833531, | |
| "rewards/format_reward": 0.4027777885397275, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3149.3611653645835, | |
| "epoch": 0.13885714285714285, | |
| "grad_norm": 0.1878816783428192, | |
| "kl": 0.011072794596354166, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": -0.0117, | |
| "reward": 0.03017812470595042, | |
| "reward_std": 0.495699738462766, | |
| "rewards/cosine_scaled_reward": -0.1237998412301143, | |
| "rewards/format_reward": 0.2777777810891469, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2979.638916015625, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.16028527915477753, | |
| "kl": 0.006017049153645833, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0175, | |
| "reward": 0.1827130333210031, | |
| "reward_std": 0.800931582848231, | |
| "rewards/cosine_scaled_reward": -0.061421267688274384, | |
| "rewards/format_reward": 0.3055555659035842, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2838.5694986979165, | |
| "epoch": 0.1422857142857143, | |
| "grad_norm": 0.14776544272899628, | |
| "kl": 0.0016415913899739583, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.038, | |
| "reward": 0.25952816009521484, | |
| "reward_std": 0.7201411376396815, | |
| "rewards/cosine_scaled_reward": -0.09940259127567212, | |
| "rewards/format_reward": 0.4583333420256774, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2666.180623372396, | |
| "epoch": 0.144, | |
| "grad_norm": 0.17353218793869019, | |
| "kl": 0.006327311197916667, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0022, | |
| "reward": 0.22785979136824608, | |
| "reward_std": 0.6654940495888392, | |
| "rewards/cosine_scaled_reward": -0.0944034568965435, | |
| "rewards/format_reward": 0.4166666766007741, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2908.277872721354, | |
| "epoch": 0.1457142857142857, | |
| "grad_norm": 0.20922456681728363, | |
| "kl": 0.0032831827799479165, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0915, | |
| "reward": 0.2944327586640914, | |
| "reward_std": 1.014359136422475, | |
| "rewards/cosine_scaled_reward": -0.06111695369084676, | |
| "rewards/format_reward": 0.4166666741172473, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2642.40283203125, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.20185264945030212, | |
| "kl": 0.004110972086588542, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0269, | |
| "reward": 0.2140984901537498, | |
| "reward_std": 0.7380417038997015, | |
| "rewards/cosine_scaled_reward": -0.11517298097411792, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3455.1944580078125, | |
| "epoch": 0.14914285714285713, | |
| "grad_norm": 0.23156170547008514, | |
| "kl": 0.00185394287109375, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0346, | |
| "reward": -0.10635035609205563, | |
| "reward_std": 0.5582728683948517, | |
| "rewards/cosine_scaled_reward": -0.12261960903803508, | |
| "rewards/format_reward": 0.13888889675339064, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3373.1111653645835, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.29082295298576355, | |
| "kl": 0.003131866455078125, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0533, | |
| "reward": -0.16532108187675476, | |
| "reward_std": 0.5516959031422933, | |
| "rewards/cosine_scaled_reward": -0.18682721008857092, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2791.7222900390625, | |
| "epoch": 0.15257142857142858, | |
| "grad_norm": 0.30182477831840515, | |
| "kl": 0.00485992431640625, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": -0.0047, | |
| "reward": 0.10053239266077678, | |
| "reward_std": 0.6104811653494835, | |
| "rewards/cosine_scaled_reward": -0.1372338104993105, | |
| "rewards/format_reward": 0.3750000099341075, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2683.7222900390625, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.2090618759393692, | |
| "kl": 0.00568389892578125, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0661, | |
| "reward": 0.3266903484861056, | |
| "reward_std": 0.5737487251559893, | |
| "rewards/cosine_scaled_reward": -0.010265930245320002, | |
| "rewards/format_reward": 0.3472222263614337, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2978.055623372396, | |
| "epoch": 0.156, | |
| "grad_norm": 0.25054702162742615, | |
| "kl": 0.008074442545572916, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0675, | |
| "reward": 0.2967253675063451, | |
| "reward_std": 0.9393502573172251, | |
| "rewards/cosine_scaled_reward": -0.04608177145322164, | |
| "rewards/format_reward": 0.3888889004786809, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2914.3334452311196, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.20104250311851501, | |
| "kl": 0.006212870279947917, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0658, | |
| "reward": 0.3902638703584671, | |
| "reward_std": 0.8173000464836756, | |
| "rewards/cosine_scaled_reward": 0.014576359341541925, | |
| "rewards/format_reward": 0.3611111181477706, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2758.2500813802085, | |
| "epoch": 0.15942857142857142, | |
| "grad_norm": 0.16985346376895905, | |
| "kl": 0.003955841064453125, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0753, | |
| "reward": 0.5438450860480467, | |
| "reward_std": 0.6189677069584528, | |
| "rewards/cosine_scaled_reward": 0.0010892003774642944, | |
| "rewards/format_reward": 0.5416666840513548, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2820.7639770507812, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.16832537949085236, | |
| "kl": 0.00426483154296875, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0077, | |
| "reward": 0.40633398356537026, | |
| "reward_std": 0.5728251735369364, | |
| "rewards/cosine_scaled_reward": -0.005166356762250264, | |
| "rewards/format_reward": 0.4166666741172473, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2733.2362060546875, | |
| "epoch": 0.16285714285714287, | |
| "grad_norm": 0.46326446533203125, | |
| "kl": 0.0055675506591796875, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.1245, | |
| "reward": 0.12223885705073674, | |
| "reward_std": 0.5204089830319086, | |
| "rewards/cosine_scaled_reward": -0.11943613241116206, | |
| "rewards/format_reward": 0.361111119389534, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3072.388956705729, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.13426201045513153, | |
| "kl": 0.0037829081217447915, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0406, | |
| "reward": 0.7161272789041201, | |
| "reward_std": 0.9450909892717997, | |
| "rewards/cosine_scaled_reward": 0.09417474642395973, | |
| "rewards/format_reward": 0.5277777910232544, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3446.9722493489585, | |
| "epoch": 0.1662857142857143, | |
| "grad_norm": 0.14770051836967468, | |
| "kl": 0.008074442545572916, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0324, | |
| "reward": -0.1645828572412332, | |
| "reward_std": 0.7982521951198578, | |
| "rewards/cosine_scaled_reward": -0.17256921033064523, | |
| "rewards/format_reward": 0.1805555559694767, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2893.041788736979, | |
| "epoch": 0.168, | |
| "grad_norm": 0.23282977938652039, | |
| "kl": 0.0102996826171875, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0827, | |
| "reward": 0.5822938953836759, | |
| "reward_std": 0.7445585628350576, | |
| "rewards/cosine_scaled_reward": 0.06198027543723583, | |
| "rewards/format_reward": 0.4583333407839139, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3236.8611653645835, | |
| "epoch": 0.1697142857142857, | |
| "grad_norm": 0.1794876754283905, | |
| "kl": 0.00789642333984375, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0777, | |
| "reward": -0.0883933554093043, | |
| "reward_std": 0.5415167262156805, | |
| "rewards/cosine_scaled_reward": -0.15530777722597122, | |
| "rewards/format_reward": 0.22222222263614336, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3089.916707356771, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.13794437050819397, | |
| "kl": 0.01137542724609375, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0248, | |
| "reward": 0.10524778068065643, | |
| "reward_std": 0.5597566316525141, | |
| "rewards/cosine_scaled_reward": -0.09320944671829541, | |
| "rewards/format_reward": 0.2916666803260644, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3082.736124674479, | |
| "epoch": 0.17314285714285715, | |
| "grad_norm": 0.15086770057678223, | |
| "kl": 0.004786173502604167, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0271, | |
| "reward": -0.002999328076839447, | |
| "reward_std": 0.4365875447789828, | |
| "rewards/cosine_scaled_reward": -0.11955521752436955, | |
| "rewards/format_reward": 0.2361111119389534, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2992.8750813802085, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.14004293084144592, | |
| "kl": 0.0070718129475911455, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0215, | |
| "reward": 0.29652568077047664, | |
| "reward_std": 0.4537389675776164, | |
| "rewards/cosine_scaled_reward": -0.05312604829668999, | |
| "rewards/format_reward": 0.4027777910232544, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3076.138916015625, | |
| "epoch": 0.17657142857142857, | |
| "grad_norm": 0.1663040965795517, | |
| "kl": 0.007525126139322917, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0118, | |
| "reward": 0.10118945688009262, | |
| "reward_std": 0.5565604468186697, | |
| "rewards/cosine_scaled_reward": -0.11607193946838379, | |
| "rewards/format_reward": 0.3333333457509677, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3474.0972900390625, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.13592784106731415, | |
| "kl": 0.004453023274739583, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0245, | |
| "reward": 0.010009939471880594, | |
| "reward_std": 0.8361826241016388, | |
| "rewards/cosine_scaled_reward": -0.09916169879337151, | |
| "rewards/format_reward": 0.2083333395421505, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2807.3612263997397, | |
| "epoch": 0.18, | |
| "grad_norm": 0.18675260245800018, | |
| "kl": 0.007502237955729167, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0441, | |
| "reward": 0.42158892129858333, | |
| "reward_std": 0.9289968361457189, | |
| "rewards/cosine_scaled_reward": -0.03226109594106674, | |
| "rewards/format_reward": 0.486111119389534, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3148.9861653645835, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.18907137215137482, | |
| "kl": 0.007731119791666667, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0794, | |
| "reward": 0.17682927350203195, | |
| "reward_std": 0.7613694767157236, | |
| "rewards/cosine_scaled_reward": -0.08519648428773507, | |
| "rewards/format_reward": 0.3472222276031971, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3267.388916015625, | |
| "epoch": 0.18342857142857144, | |
| "grad_norm": 0.1298302710056305, | |
| "kl": 0.006474812825520833, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0314, | |
| "reward": 0.16061918313304582, | |
| "reward_std": 0.7836594184239706, | |
| "rewards/cosine_scaled_reward": -0.04469040408730507, | |
| "rewards/format_reward": 0.2500000012417634, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3351.388956705729, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.1430933177471161, | |
| "kl": 0.004694620768229167, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0442, | |
| "reward": 0.014846639707684517, | |
| "reward_std": 0.8161966055631638, | |
| "rewards/cosine_scaled_reward": -0.12452112386624019, | |
| "rewards/format_reward": 0.2638889004786809, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2849.3194580078125, | |
| "epoch": 0.18685714285714286, | |
| "grad_norm": 0.28510013222694397, | |
| "kl": 0.013326009114583334, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0208, | |
| "reward": 0.0852958969771862, | |
| "reward_std": 0.5950245261192322, | |
| "rewards/cosine_scaled_reward": -0.12401872221380472, | |
| "rewards/format_reward": 0.3333333420256774, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2990.3194986979165, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.21149034798145294, | |
| "kl": 0.006968180338541667, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0462, | |
| "reward": 0.09307336946949363, | |
| "reward_std": 0.8012422521909078, | |
| "rewards/cosine_scaled_reward": -0.13401888330311826, | |
| "rewards/format_reward": 0.3611111255983512, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2828.15283203125, | |
| "epoch": 0.19028571428571428, | |
| "grad_norm": 0.1380799263715744, | |
| "kl": 0.007802327473958333, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0626, | |
| "reward": -0.047976731012264885, | |
| "reward_std": 0.47912681102752686, | |
| "rewards/cosine_scaled_reward": -0.1906550352772077, | |
| "rewards/format_reward": 0.3333333457509677, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2788.055623372396, | |
| "epoch": 0.192, | |
| "grad_norm": 0.15422765910625458, | |
| "kl": 0.0055084228515625, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0467, | |
| "reward": 0.08222175016999245, | |
| "reward_std": 0.5915350417296091, | |
| "rewards/cosine_scaled_reward": -0.16027801856398582, | |
| "rewards/format_reward": 0.4027777947485447, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3038.902872721354, | |
| "epoch": 0.19371428571428573, | |
| "grad_norm": 0.1365227848291397, | |
| "kl": 0.007494608561197917, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0243, | |
| "reward": 0.4698562081903219, | |
| "reward_std": 0.9386945068836212, | |
| "rewards/cosine_scaled_reward": 0.026594760517279308, | |
| "rewards/format_reward": 0.4166666778425376, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2433.791748046875, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.20159192383289337, | |
| "kl": 0.00775909423828125, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0794, | |
| "reward": 0.35647524148225784, | |
| "reward_std": 0.7067185292641321, | |
| "rewards/cosine_scaled_reward": -0.06481793895363808, | |
| "rewards/format_reward": 0.486111128081878, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2614.6944783528647, | |
| "epoch": 0.19714285714285715, | |
| "grad_norm": 0.18976598978042603, | |
| "kl": 0.009274800618489584, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": -0.0495, | |
| "reward": 0.47524294008811313, | |
| "reward_std": 0.6709072093168894, | |
| "rewards/cosine_scaled_reward": -0.012378551997244358, | |
| "rewards/format_reward": 0.5, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3302.541748046875, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.14912541210651398, | |
| "kl": 0.008229573567708334, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0373, | |
| "reward": -0.07236603554338217, | |
| "reward_std": 0.6672362685203552, | |
| "rewards/cosine_scaled_reward": -0.16812747033933798, | |
| "rewards/format_reward": 0.2638888942698638, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3138.5694783528647, | |
| "epoch": 0.20057142857142857, | |
| "grad_norm": 0.14411915838718414, | |
| "kl": 0.011599222819010416, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0413, | |
| "reward": 0.4193639711787303, | |
| "reward_std": 0.8430522158741951, | |
| "rewards/cosine_scaled_reward": 0.029126417512694996, | |
| "rewards/format_reward": 0.3611111231148243, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2664.000040690104, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.17431053519248962, | |
| "kl": 0.009043375651041666, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0438, | |
| "reward": 0.28883447746435803, | |
| "reward_std": 0.904882033665975, | |
| "rewards/cosine_scaled_reward": -0.08474943165977795, | |
| "rewards/format_reward": 0.4583333407839139, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2811.9722900390625, | |
| "epoch": 0.204, | |
| "grad_norm": 0.17246517539024353, | |
| "kl": 0.00846099853515625, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0039, | |
| "reward": 0.4127761671940486, | |
| "reward_std": 0.37164223690827686, | |
| "rewards/cosine_scaled_reward": -0.0019452547033627827, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3117.9861653645835, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.22049903869628906, | |
| "kl": 0.012972513834635416, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0913, | |
| "reward": 0.3390945568680763, | |
| "reward_std": 0.8262646396954855, | |
| "rewards/cosine_scaled_reward": -0.00406383474667867, | |
| "rewards/format_reward": 0.3472222338120143, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3134.166748046875, | |
| "epoch": 0.20742857142857143, | |
| "grad_norm": 0.16842107474803925, | |
| "kl": 0.00911712646484375, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0387, | |
| "reward": 0.29650769879420596, | |
| "reward_std": 0.753247082233429, | |
| "rewards/cosine_scaled_reward": -0.06007948727346957, | |
| "rewards/format_reward": 0.4166666803260644, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3098.4862060546875, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.20871736109256744, | |
| "kl": 0.0095977783203125, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0339, | |
| "reward": -0.06545824371278286, | |
| "reward_std": 0.45910689731438953, | |
| "rewards/cosine_scaled_reward": -0.2271735742688179, | |
| "rewards/format_reward": 0.3888889004786809, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2762.541748046875, | |
| "epoch": 0.21085714285714285, | |
| "grad_norm": 0.18799707293510437, | |
| "kl": 0.008534749348958334, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0795, | |
| "reward": 0.5211281642938653, | |
| "reward_std": 1.0881054202715557, | |
| "rewards/cosine_scaled_reward": 0.045286305248737335, | |
| "rewards/format_reward": 0.4305555721124013, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3144.0556030273438, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.16049958765506744, | |
| "kl": 0.00748443603515625, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0349, | |
| "reward": 0.3905097395181656, | |
| "reward_std": 0.6326450407505035, | |
| "rewards/cosine_scaled_reward": -0.006134033824006717, | |
| "rewards/format_reward": 0.402777789781491, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2921.777791341146, | |
| "epoch": 0.21428571428571427, | |
| "grad_norm": 0.14256709814071655, | |
| "kl": 0.009646097819010416, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0295, | |
| "reward": 0.38034086177746457, | |
| "reward_std": 0.7871350646018982, | |
| "rewards/cosine_scaled_reward": 0.009614857534567514, | |
| "rewards/format_reward": 0.3611111156642437, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2928.680623372396, | |
| "epoch": 0.216, | |
| "grad_norm": 0.19500091671943665, | |
| "kl": 0.01242828369140625, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0775, | |
| "reward": 0.2960887650648753, | |
| "reward_std": 0.8312297910451889, | |
| "rewards/cosine_scaled_reward": -0.07417784631252289, | |
| "rewards/format_reward": 0.4444444527228673, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3226.6944580078125, | |
| "epoch": 0.21771428571428572, | |
| "grad_norm": 0.16747912764549255, | |
| "kl": 0.011484781901041666, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0423, | |
| "reward": 0.04955474380403757, | |
| "reward_std": 0.7419270873069763, | |
| "rewards/cosine_scaled_reward": -0.11411152531703313, | |
| "rewards/format_reward": 0.277777789781491, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3292.8194986979165, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.1409364640712738, | |
| "kl": 0.013376871744791666, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0218, | |
| "reward": 0.10593928893407185, | |
| "reward_std": 0.6205802957216898, | |
| "rewards/cosine_scaled_reward": -0.06508591026067734, | |
| "rewards/format_reward": 0.23611111318071684, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2576.8472900390625, | |
| "epoch": 0.22114285714285714, | |
| "grad_norm": 0.20601420104503632, | |
| "kl": 0.015233357747395834, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0052, | |
| "reward": 0.4525395209590594, | |
| "reward_std": 0.710087443391482, | |
| "rewards/cosine_scaled_reward": -0.0445635716120402, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2218.2917277018228, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.2574611008167267, | |
| "kl": 0.012542724609375, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0364, | |
| "reward": 1.2637093514204025, | |
| "reward_std": 0.6893531282742819, | |
| "rewards/cosine_scaled_reward": 0.3054657746106386, | |
| "rewards/format_reward": 0.6527777798473835, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3194.805623372396, | |
| "epoch": 0.22457142857142856, | |
| "grad_norm": 0.15491630136966705, | |
| "kl": 0.008402506510416666, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0533, | |
| "reward": -0.07167169451713562, | |
| "reward_std": 0.59788678586483, | |
| "rewards/cosine_scaled_reward": -0.1747247353196144, | |
| "rewards/format_reward": 0.2777777835726738, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3019.819539388021, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.15278573334217072, | |
| "kl": 0.00980377197265625, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0144, | |
| "reward": 0.17270600143820047, | |
| "reward_std": 0.7830294072628021, | |
| "rewards/cosine_scaled_reward": -0.10809145557383697, | |
| "rewards/format_reward": 0.3888889017204444, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3154.000040690104, | |
| "epoch": 0.228, | |
| "grad_norm": 0.14009016752243042, | |
| "kl": 0.012176513671875, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0021, | |
| "reward": 0.3888522535562515, | |
| "reward_std": 0.6623830497264862, | |
| "rewards/cosine_scaled_reward": 0.013870567083358765, | |
| "rewards/format_reward": 0.3611111218730609, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2308.930623372396, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.2797318398952484, | |
| "kl": 0.011810302734375, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0402, | |
| "reward": 0.7703872546553612, | |
| "reward_std": 0.919698029756546, | |
| "rewards/cosine_scaled_reward": 0.05186028157671293, | |
| "rewards/format_reward": 0.6666666741172472, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2652.9305826822915, | |
| "epoch": 0.23142857142857143, | |
| "grad_norm": 0.17482583224773407, | |
| "kl": 0.015187581380208334, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.03, | |
| "reward": 0.5785276778042316, | |
| "reward_std": 0.8663486738999685, | |
| "rewards/cosine_scaled_reward": 0.03231940092518926, | |
| "rewards/format_reward": 0.5138889029622078, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2879.9444986979165, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.20896418392658234, | |
| "kl": 0.013559977213541666, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0811, | |
| "reward": 0.5022369648019472, | |
| "reward_std": 0.9430117209752401, | |
| "rewards/cosine_scaled_reward": 0.056674020985762276, | |
| "rewards/format_reward": 0.3888889066874981, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3232.4306640625, | |
| "epoch": 0.23485714285714285, | |
| "grad_norm": 0.16263550519943237, | |
| "kl": 0.011606852213541666, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0777, | |
| "reward": 0.44614940229803324, | |
| "reward_std": 0.7385490934054056, | |
| "rewards/cosine_scaled_reward": 0.03557470068335533, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3029.138956705729, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.1600210815668106, | |
| "kl": 0.009490966796875, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": -0.0593, | |
| "reward": 0.19021011392275491, | |
| "reward_std": 0.694852868715922, | |
| "rewards/cosine_scaled_reward": -0.1062838261326154, | |
| "rewards/format_reward": 0.4027777947485447, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3129.916707356771, | |
| "epoch": 0.2382857142857143, | |
| "grad_norm": 0.1589079201221466, | |
| "kl": 0.014801025390625, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0315, | |
| "reward": 0.05092944453159968, | |
| "reward_std": 0.5542028794685999, | |
| "rewards/cosine_scaled_reward": -0.09953528021772702, | |
| "rewards/format_reward": 0.2500000062088172, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2289.8611857096353, | |
| "epoch": 0.24, | |
| "grad_norm": 0.36694806814193726, | |
| "kl": 0.019073486328125, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.1083, | |
| "reward": 0.5017321242485195, | |
| "reward_std": 0.7119812965393066, | |
| "rewards/cosine_scaled_reward": -0.03385616342226664, | |
| "rewards/format_reward": 0.5694444502393404, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3049.6111450195312, | |
| "epoch": 0.24171428571428571, | |
| "grad_norm": 0.27735084295272827, | |
| "kl": 0.016042073567708332, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.1162, | |
| "reward": 0.05020663142204285, | |
| "reward_std": 0.6165368407964706, | |
| "rewards/cosine_scaled_reward": -0.12073002755641937, | |
| "rewards/format_reward": 0.2916666815678279, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3191.055623372396, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.17002621293067932, | |
| "kl": 0.013763427734375, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0722, | |
| "reward": 0.05829016864299774, | |
| "reward_std": 0.5495459834734598, | |
| "rewards/cosine_scaled_reward": -0.09585491034279887, | |
| "rewards/format_reward": 0.250000008692344, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3142.055623372396, | |
| "epoch": 0.24514285714285713, | |
| "grad_norm": 0.17562943696975708, | |
| "kl": 0.012054443359375, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0285, | |
| "reward": 0.23804536399741968, | |
| "reward_std": 0.7772295872370402, | |
| "rewards/cosine_scaled_reward": -0.0754217728972435, | |
| "rewards/format_reward": 0.3888888992369175, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3351.9722493489585, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.19126754999160767, | |
| "kl": 0.013427734375, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0578, | |
| "reward": 0.0238612350076437, | |
| "reward_std": 0.7186805009841919, | |
| "rewards/cosine_scaled_reward": -0.10612493753433228, | |
| "rewards/format_reward": 0.23611111690600714, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2916.9583740234375, | |
| "epoch": 0.24857142857142858, | |
| "grad_norm": 0.14967964589595795, | |
| "kl": 0.012797037760416666, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": -0.0411, | |
| "reward": 0.3306312958399455, | |
| "reward_std": 0.54432645936807, | |
| "rewards/cosine_scaled_reward": -0.015239919225374857, | |
| "rewards/format_reward": 0.3611111218730609, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2608.0000813802085, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.19246625900268555, | |
| "kl": 0.014272054036458334, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": -0.0199, | |
| "reward": 0.39618437240521115, | |
| "reward_std": 0.7984492381413778, | |
| "rewards/cosine_scaled_reward": -0.03801893023774028, | |
| "rewards/format_reward": 0.4722222325702508, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3047.5000813802085, | |
| "epoch": 0.252, | |
| "grad_norm": 0.2678869664669037, | |
| "kl": 0.013824462890625, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.1068, | |
| "reward": 0.06674646337827046, | |
| "reward_std": 0.6082490384578705, | |
| "rewards/cosine_scaled_reward": -0.10551566754778226, | |
| "rewards/format_reward": 0.2777777810891469, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3211.8472493489585, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.1258208155632019, | |
| "kl": 0.009765625, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0396, | |
| "reward": 0.03818178673585256, | |
| "reward_std": 0.6221184581518173, | |
| "rewards/cosine_scaled_reward": -0.12674243996540704, | |
| "rewards/format_reward": 0.291666670391957, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2670.4584147135415, | |
| "epoch": 0.25542857142857145, | |
| "grad_norm": 0.18250980973243713, | |
| "kl": 0.012552897135416666, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0273, | |
| "reward": 0.35591835528612137, | |
| "reward_std": 0.8182683984438578, | |
| "rewards/cosine_scaled_reward": -0.07204083229104678, | |
| "rewards/format_reward": 0.5000000099341074, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2699.680623372396, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.2456682324409485, | |
| "kl": 0.012308756510416666, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0987, | |
| "reward": 0.7872514377037684, | |
| "reward_std": 0.7793236275513967, | |
| "rewards/cosine_scaled_reward": 0.09501460194587708, | |
| "rewards/format_reward": 0.5972222362955412, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2685.1112060546875, | |
| "epoch": 0.25885714285714284, | |
| "grad_norm": 0.2533908784389496, | |
| "kl": 0.018534342447916668, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0437, | |
| "reward": 0.30423689012726146, | |
| "reward_std": 0.6282614668210348, | |
| "rewards/cosine_scaled_reward": -0.09788154562314351, | |
| "rewards/format_reward": 0.5000000099341074, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3282.0972900390625, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.16320674121379852, | |
| "kl": 0.01776123046875, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0328, | |
| "reward": 0.10830122729142506, | |
| "reward_std": 0.6112055083115896, | |
| "rewards/cosine_scaled_reward": -0.07779383783539136, | |
| "rewards/format_reward": 0.2638888930281003, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2974.9444986979165, | |
| "epoch": 0.2622857142857143, | |
| "grad_norm": 0.1725301891565323, | |
| "kl": 0.013763427734375, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0329, | |
| "reward": 0.40149328112602234, | |
| "reward_std": 0.7482608755429586, | |
| "rewards/cosine_scaled_reward": 0.006302192031095426, | |
| "rewards/format_reward": 0.3888888992369175, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2940.4583536783853, | |
| "epoch": 0.264, | |
| "grad_norm": 0.1826322078704834, | |
| "kl": 0.014404296875, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0443, | |
| "reward": 0.17415902204811573, | |
| "reward_std": 0.7082438717285792, | |
| "rewards/cosine_scaled_reward": -0.07958716154098511, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3110.9861653645835, | |
| "epoch": 0.26571428571428574, | |
| "grad_norm": 0.1413702666759491, | |
| "kl": 0.016520182291666668, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.001, | |
| "reward": 0.42592448244492215, | |
| "reward_std": 0.9181863069534302, | |
| "rewards/cosine_scaled_reward": 0.01157334508995215, | |
| "rewards/format_reward": 0.4027777848144372, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2997.3750813802085, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.1732780486345291, | |
| "kl": 0.013163248697916666, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0075, | |
| "reward": 0.43662730790674686, | |
| "reward_std": 0.7612739453713099, | |
| "rewards/cosine_scaled_reward": 0.0030358731746673584, | |
| "rewards/format_reward": 0.4305555572112401, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2910.263956705729, | |
| "epoch": 0.26914285714285713, | |
| "grad_norm": 0.16774919629096985, | |
| "kl": 0.019602457682291668, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0488, | |
| "reward": 0.41491053874293965, | |
| "reward_std": 0.8844227890173594, | |
| "rewards/cosine_scaled_reward": -0.00782250085224708, | |
| "rewards/format_reward": 0.4305555621782939, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3001.0139363606772, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.14972104132175446, | |
| "kl": 0.018452962239583332, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": -0.0037, | |
| "reward": 0.14678718646367392, | |
| "reward_std": 0.6579625209172567, | |
| "rewards/cosine_scaled_reward": -0.10716194752603769, | |
| "rewards/format_reward": 0.3611111156642437, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3254.15283203125, | |
| "epoch": 0.2725714285714286, | |
| "grad_norm": 0.2082168012857437, | |
| "kl": 0.021748860677083332, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0576, | |
| "reward": 0.15007218966881433, | |
| "reward_std": 0.7303880155086517, | |
| "rewards/cosine_scaled_reward": -0.07079725050910686, | |
| "rewards/format_reward": 0.2916666803260644, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2968.6944986979165, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.17828086018562317, | |
| "kl": 0.01422119140625, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0301, | |
| "reward": 0.6818346430857977, | |
| "reward_std": 0.7203890879948934, | |
| "rewards/cosine_scaled_reward": 0.1256395454208056, | |
| "rewards/format_reward": 0.4305555621782939, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2831.6389973958335, | |
| "epoch": 0.276, | |
| "grad_norm": 0.22226989269256592, | |
| "kl": 0.024037679036458332, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0694, | |
| "reward": 0.2572607894738515, | |
| "reward_std": 0.8273610572020212, | |
| "rewards/cosine_scaled_reward": -0.07970294418434302, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2563.6389770507812, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.2706269323825836, | |
| "kl": 0.017873128255208332, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0858, | |
| "reward": 0.25896017338770133, | |
| "reward_std": 0.5756277690331141, | |
| "rewards/cosine_scaled_reward": -0.16913103560606638, | |
| "rewards/format_reward": 0.5972222437461218, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3107.013916015625, | |
| "epoch": 0.2794285714285714, | |
| "grad_norm": 0.18369509279727936, | |
| "kl": 0.020548502604166668, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0756, | |
| "reward": 0.12944546590248743, | |
| "reward_std": 0.759774794181188, | |
| "rewards/cosine_scaled_reward": -0.12277728024249275, | |
| "rewards/format_reward": 0.3750000223517418, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2707.791707356771, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.32584112882614136, | |
| "kl": 0.022633870442708332, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0821, | |
| "reward": 0.5376808034876982, | |
| "reward_std": 0.8403552174568176, | |
| "rewards/cosine_scaled_reward": 0.032729278629024826, | |
| "rewards/format_reward": 0.4722222313284874, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2882.2222900390625, | |
| "epoch": 0.28285714285714286, | |
| "grad_norm": 0.24923206865787506, | |
| "kl": 0.021067301432291668, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0886, | |
| "reward": 0.3673405672113101, | |
| "reward_std": 0.941634883483251, | |
| "rewards/cosine_scaled_reward": -0.0316074937582016, | |
| "rewards/format_reward": 0.4305555609365304, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2887.2778523763022, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.18646156787872314, | |
| "kl": 0.020243326822916668, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0607, | |
| "reward": 0.2368552734454473, | |
| "reward_std": 0.8638633986314138, | |
| "rewards/cosine_scaled_reward": -0.06212792297204336, | |
| "rewards/format_reward": 0.3611111293236415, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3185.77783203125, | |
| "epoch": 0.2862857142857143, | |
| "grad_norm": 0.19375282526016235, | |
| "kl": 0.022806803385416668, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0738, | |
| "reward": 0.588482570524017, | |
| "reward_std": 1.0810319185256958, | |
| "rewards/cosine_scaled_reward": 0.0859079472720623, | |
| "rewards/format_reward": 0.4166666865348816, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2992.90283203125, | |
| "epoch": 0.288, | |
| "grad_norm": 0.17105716466903687, | |
| "kl": 0.026295979817708332, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0348, | |
| "reward": 0.5344396332899729, | |
| "reward_std": 0.8491599460442861, | |
| "rewards/cosine_scaled_reward": 0.03805313538759947, | |
| "rewards/format_reward": 0.4583333532015483, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2991.138916015625, | |
| "epoch": 0.2897142857142857, | |
| "grad_norm": 0.18935738503932953, | |
| "kl": 0.02996826171875, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0164, | |
| "reward": 0.37488481728360057, | |
| "reward_std": 0.7196001211802164, | |
| "rewards/cosine_scaled_reward": -0.02089092880487442, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2747.666788736979, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.17408499121665955, | |
| "kl": 0.024434407552083332, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0, | |
| "reward": 0.5880009370545546, | |
| "reward_std": 0.6625326325496038, | |
| "rewards/cosine_scaled_reward": 0.07177824371804793, | |
| "rewards/format_reward": 0.4444444552063942, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2548.9167277018228, | |
| "epoch": 0.29314285714285715, | |
| "grad_norm": 2.4388060569763184, | |
| "kl": 0.2674967447916667, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.1006, | |
| "reward": 0.6461973956320435, | |
| "reward_std": 0.6500315368175507, | |
| "rewards/cosine_scaled_reward": 0.0036542738477389016, | |
| "rewards/format_reward": 0.638888900478681, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2787.805623372396, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 0.1982651799917221, | |
| "kl": 0.02239990234375, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0153, | |
| "reward": 0.5344789425532023, | |
| "reward_std": 0.9072269002596537, | |
| "rewards/cosine_scaled_reward": 0.017239479968945186, | |
| "rewards/format_reward": 0.5000000124176344, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2557.7223103841147, | |
| "epoch": 0.2965714285714286, | |
| "grad_norm": 0.18745917081832886, | |
| "kl": 0.030181884765625, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0142, | |
| "reward": 0.18471611042817435, | |
| "reward_std": 0.7463686764240265, | |
| "rewards/cosine_scaled_reward": -0.14375305672486624, | |
| "rewards/format_reward": 0.4722222238779068, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2586.0417277018228, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.2376364767551422, | |
| "kl": 0.030192057291666668, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.1002, | |
| "reward": 0.11324162781238556, | |
| "reward_std": 0.6646289924780527, | |
| "rewards/cosine_scaled_reward": -0.16560141742229462, | |
| "rewards/format_reward": 0.4444444527228673, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2886.7500813802085, | |
| "epoch": 0.3, | |
| "grad_norm": 0.1901567131280899, | |
| "kl": 0.032979329427083336, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0057, | |
| "reward": 0.17423932005961737, | |
| "reward_std": 0.6424900939067205, | |
| "rewards/cosine_scaled_reward": -0.10732479145129521, | |
| "rewards/format_reward": 0.3888889042039712, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2308.541707356771, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.2572733461856842, | |
| "kl": 0.02880859375, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0885, | |
| "reward": 0.5797970102479061, | |
| "reward_std": 0.8780222237110138, | |
| "rewards/cosine_scaled_reward": -0.01565706233183543, | |
| "rewards/format_reward": 0.6111111293236414, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3252.8194580078125, | |
| "epoch": 0.30342857142857144, | |
| "grad_norm": 0.15189853310585022, | |
| "kl": 0.027384440104166668, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0396, | |
| "reward": 0.027006535480419796, | |
| "reward_std": 0.6573801139990488, | |
| "rewards/cosine_scaled_reward": -0.10455229009191196, | |
| "rewards/format_reward": 0.2361111156642437, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2764.2083740234375, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.2380024492740631, | |
| "kl": 0.024790445963541668, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0174, | |
| "reward": 0.27436770498752594, | |
| "reward_std": 0.6253433674573898, | |
| "rewards/cosine_scaled_reward": -0.06420503463596106, | |
| "rewards/format_reward": 0.4027777910232544, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2766.9722900390625, | |
| "epoch": 0.3068571428571429, | |
| "grad_norm": 0.3094896972179413, | |
| "kl": 0.0311279296875, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.1226, | |
| "reward": 0.4755251506964366, | |
| "reward_std": 0.6630441000064214, | |
| "rewards/cosine_scaled_reward": -0.019181872407595318, | |
| "rewards/format_reward": 0.5138889029622078, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1620.138916015625, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.31113868951797485, | |
| "kl": 0.0299072265625, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": -0.0101, | |
| "reward": 0.7099573736389478, | |
| "reward_std": 0.7148088390628496, | |
| "rewards/cosine_scaled_reward": -0.06168798813208317, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3063.8055826822915, | |
| "epoch": 0.3102857142857143, | |
| "grad_norm": 0.20861417055130005, | |
| "kl": 0.03369140625, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.024, | |
| "reward": 0.4864314068108797, | |
| "reward_std": 0.7121660659710566, | |
| "rewards/cosine_scaled_reward": 0.041826844215393066, | |
| "rewards/format_reward": 0.4027777885397275, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3182.5972900390625, | |
| "epoch": 0.312, | |
| "grad_norm": 0.2217608392238617, | |
| "kl": 0.034993489583333336, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0615, | |
| "reward": -0.1495605477442344, | |
| "reward_std": 0.6062896152337393, | |
| "rewards/cosine_scaled_reward": -0.2067247157295545, | |
| "rewards/format_reward": 0.2638888955116272, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2671.1528930664062, | |
| "epoch": 0.3137142857142857, | |
| "grad_norm": 0.3118150234222412, | |
| "kl": 0.032002766927083336, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0933, | |
| "reward": 0.44948608179887134, | |
| "reward_std": 0.9109388391176859, | |
| "rewards/cosine_scaled_reward": -0.004423625146349271, | |
| "rewards/format_reward": 0.4583333532015483, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2496.1250610351562, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.5605117678642273, | |
| "kl": 0.03387451171875, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.1509, | |
| "reward": 0.6977702975273132, | |
| "reward_std": 0.9448588987191519, | |
| "rewards/cosine_scaled_reward": 0.043329599779099226, | |
| "rewards/format_reward": 0.6111111243565878, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2554.7222493489585, | |
| "epoch": 0.3171428571428571, | |
| "grad_norm": 0.3769248127937317, | |
| "kl": 0.038187662760416664, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0559, | |
| "reward": 0.4873435174425443, | |
| "reward_std": 0.8436052600542704, | |
| "rewards/cosine_scaled_reward": -0.027161574612061184, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2530.680623372396, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.2839488387107849, | |
| "kl": 0.036824544270833336, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0468, | |
| "reward": 0.1027833657960097, | |
| "reward_std": 0.6216810842355093, | |
| "rewards/cosine_scaled_reward": -0.1708305476543804, | |
| "rewards/format_reward": 0.4444444589316845, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2109.6666870117188, | |
| "epoch": 0.32057142857142856, | |
| "grad_norm": 0.5897616147994995, | |
| "kl": 0.040598551432291664, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0862, | |
| "reward": 1.1625737498203914, | |
| "reward_std": 0.8115918238957723, | |
| "rewards/cosine_scaled_reward": 0.19934244205554327, | |
| "rewards/format_reward": 0.7638889054457346, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2784.8472900390625, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.3380698263645172, | |
| "kl": 0.048095703125, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": -0.022, | |
| "reward": 0.11683306377381086, | |
| "reward_std": 0.6127427419026693, | |
| "rewards/cosine_scaled_reward": -0.15686125556627908, | |
| "rewards/format_reward": 0.4305555621782939, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2143.2778523763022, | |
| "epoch": 0.324, | |
| "grad_norm": 0.5302934050559998, | |
| "kl": 0.047220865885416664, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0614, | |
| "reward": 0.3594171529014905, | |
| "reward_std": 0.696213573217392, | |
| "rewards/cosine_scaled_reward": -0.11890254272536065, | |
| "rewards/format_reward": 0.5972222437461218, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2630.0694986979165, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.22001327574253082, | |
| "kl": 0.045145670572916664, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0881, | |
| "reward": 0.38299726011852425, | |
| "reward_std": 0.6745277245839437, | |
| "rewards/cosine_scaled_reward": -0.07933470886200666, | |
| "rewards/format_reward": 0.5416666803260645, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2416.819539388021, | |
| "epoch": 0.3274285714285714, | |
| "grad_norm": 0.4197448790073395, | |
| "kl": 0.037638346354166664, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0727, | |
| "reward": 0.7062414238850275, | |
| "reward_std": 0.9238099257151285, | |
| "rewards/cosine_scaled_reward": 0.03367625301082929, | |
| "rewards/format_reward": 0.6388889029622078, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2464.416748046875, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.37473064661026, | |
| "kl": 0.07063802083333333, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0332, | |
| "reward": 0.3312869320313136, | |
| "reward_std": 0.6453676869471868, | |
| "rewards/cosine_scaled_reward": -0.09130099043250084, | |
| "rewards/format_reward": 0.513888897995154, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2048.0694986979165, | |
| "epoch": 0.33085714285714285, | |
| "grad_norm": 0.4343617856502533, | |
| "kl": 0.056254069010416664, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": -0.0097, | |
| "reward": 0.7283353358507156, | |
| "reward_std": 0.7651784718036652, | |
| "rewards/cosine_scaled_reward": 0.016945424179236095, | |
| "rewards/format_reward": 0.6944444552063942, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2032.5694885253906, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.4706946909427643, | |
| "kl": 0.06571451822916667, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": -0.0261, | |
| "reward": 0.8320082649588585, | |
| "reward_std": 0.6376081357399622, | |
| "rewards/cosine_scaled_reward": 0.06878188418340869, | |
| "rewards/format_reward": 0.6944444477558136, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2661.2361653645835, | |
| "epoch": 0.3342857142857143, | |
| "grad_norm": 0.5797499418258667, | |
| "kl": 0.07354736328125, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0982, | |
| "reward": 0.033702409942634404, | |
| "reward_std": 0.5151704748471578, | |
| "rewards/cosine_scaled_reward": -0.1706487958629926, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2322.6250813802085, | |
| "epoch": 0.336, | |
| "grad_norm": 0.3754180371761322, | |
| "kl": 0.07417805989583333, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0346, | |
| "reward": 0.9288424551486969, | |
| "reward_std": 0.7033603191375732, | |
| "rewards/cosine_scaled_reward": 0.1102545199294885, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2660.2361450195312, | |
| "epoch": 0.3377142857142857, | |
| "grad_norm": 0.50594562292099, | |
| "kl": 0.07568359375, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0758, | |
| "reward": 0.2973423044507702, | |
| "reward_std": 0.9695365031560262, | |
| "rewards/cosine_scaled_reward": -0.0735510762509269, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2640.708455403646, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 0.6515359282493591, | |
| "kl": 0.094482421875, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.1125, | |
| "reward": 0.2253766544163227, | |
| "reward_std": 0.7898083130518595, | |
| "rewards/cosine_scaled_reward": -0.1303672380745411, | |
| "rewards/format_reward": 0.4861111268401146, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2910.3055826822915, | |
| "epoch": 0.34114285714285714, | |
| "grad_norm": 0.599124014377594, | |
| "kl": 0.10546875, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": -0.0532, | |
| "reward": 0.2923546185096105, | |
| "reward_std": 0.7325415760278702, | |
| "rewards/cosine_scaled_reward": -0.05521159991621971, | |
| "rewards/format_reward": 0.4027777848144372, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2131.2222696940103, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.3559664189815521, | |
| "kl": 0.076904296875, | |
| "learning_rate": 7.75e-07, | |
| "loss": -0.0047, | |
| "reward": 0.8175519158442816, | |
| "reward_std": 0.6491561482350031, | |
| "rewards/cosine_scaled_reward": 0.1032204049018522, | |
| "rewards/format_reward": 0.6111111231148243, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2403.5000203450522, | |
| "epoch": 0.3445714285714286, | |
| "grad_norm": 0.4335411787033081, | |
| "kl": 0.094970703125, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0362, | |
| "reward": 0.3435460871551186, | |
| "reward_std": 0.5741855899492899, | |
| "rewards/cosine_scaled_reward": -0.11294919004042943, | |
| "rewards/format_reward": 0.5694444626569748, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2334.0973103841147, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.6908948421478271, | |
| "kl": 0.12158203125, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.1256, | |
| "reward": 0.48156655083100003, | |
| "reward_std": 0.6996385753154755, | |
| "rewards/cosine_scaled_reward": -0.07171675112719338, | |
| "rewards/format_reward": 0.625000019868215, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1966.2222696940105, | |
| "epoch": 0.348, | |
| "grad_norm": 0.67049241065979, | |
| "kl": 0.08585611979166667, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0909, | |
| "reward": 1.0487131079037983, | |
| "reward_std": 0.9580865850051244, | |
| "rewards/cosine_scaled_reward": 0.13546766511475047, | |
| "rewards/format_reward": 0.7777777959903082, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2628.305623372396, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.40789034962654114, | |
| "kl": 0.14168294270833334, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0302, | |
| "reward": -0.012188049654165903, | |
| "reward_std": 0.6520945082108179, | |
| "rewards/cosine_scaled_reward": -0.17970513738691807, | |
| "rewards/format_reward": 0.3472222338120143, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2274.763956705729, | |
| "epoch": 0.3514285714285714, | |
| "grad_norm": 1.2571016550064087, | |
| "kl": 0.138916015625, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.1178, | |
| "reward": 0.7175190349419912, | |
| "reward_std": 1.1127092838287354, | |
| "rewards/cosine_scaled_reward": 0.05320395218829314, | |
| "rewards/format_reward": 0.6111111342906952, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2163.166717529297, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 0.8860762715339661, | |
| "kl": 0.13492838541666666, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0813, | |
| "reward": 0.441390501956145, | |
| "reward_std": 0.8139428297678629, | |
| "rewards/cosine_scaled_reward": -0.07097143121063709, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2281.513956705729, | |
| "epoch": 0.35485714285714287, | |
| "grad_norm": 1.0573821067810059, | |
| "kl": 0.17203776041666666, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.1539, | |
| "reward": 0.5699481119712194, | |
| "reward_std": 0.7771651248137156, | |
| "rewards/cosine_scaled_reward": 0.007196256270011266, | |
| "rewards/format_reward": 0.5555555659035841, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2529.2222493489585, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 0.869641900062561, | |
| "kl": 0.19580078125, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.1115, | |
| "reward": 0.402865959952275, | |
| "reward_std": 0.9499445905288061, | |
| "rewards/cosine_scaled_reward": -0.055511463433504105, | |
| "rewards/format_reward": 0.513888897995154, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2344.9444986979165, | |
| "epoch": 0.35828571428571426, | |
| "grad_norm": 0.8390803337097168, | |
| "kl": 0.20068359375, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.1081, | |
| "reward": 0.514536718527476, | |
| "reward_std": 0.7412205884853998, | |
| "rewards/cosine_scaled_reward": -0.04134275003646811, | |
| "rewards/format_reward": 0.5972222338120142, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2167.27783203125, | |
| "epoch": 0.36, | |
| "grad_norm": 0.8296210765838623, | |
| "kl": 0.23583984375, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0887, | |
| "reward": 0.4385714679956436, | |
| "reward_std": 0.578294982512792, | |
| "rewards/cosine_scaled_reward": -0.0862698298393904, | |
| "rewards/format_reward": 0.611111119389534, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3031.750040690104, | |
| "epoch": 0.3617142857142857, | |
| "grad_norm": 0.4978528320789337, | |
| "kl": 0.32958984375, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0405, | |
| "reward": -0.35640790810187656, | |
| "reward_std": 0.48471235235532123, | |
| "rewards/cosine_scaled_reward": -0.2615372935930888, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2329.7500203450522, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 1.4704159498214722, | |
| "kl": 0.29052734375, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.142, | |
| "reward": 0.38594985256592435, | |
| "reward_std": 0.9150921801726023, | |
| "rewards/cosine_scaled_reward": -0.05008065110693375, | |
| "rewards/format_reward": 0.4861111342906952, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2657.5972696940103, | |
| "epoch": 0.36514285714285716, | |
| "grad_norm": 0.8537811040878296, | |
| "kl": 0.33935546875, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0707, | |
| "reward": 0.3708499073982239, | |
| "reward_std": 0.8413863132397333, | |
| "rewards/cosine_scaled_reward": -0.08540838918027778, | |
| "rewards/format_reward": 0.5416666766007742, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2893.513956705729, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 0.7264823913574219, | |
| "kl": 0.4700520833333333, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0776, | |
| "reward": 0.2533894454439481, | |
| "reward_std": 0.8303240140279134, | |
| "rewards/cosine_scaled_reward": -0.07469417713582516, | |
| "rewards/format_reward": 0.4027777959903081, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2492.402872721354, | |
| "epoch": 0.36857142857142855, | |
| "grad_norm": 0.5665526390075684, | |
| "kl": 0.3577473958333333, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0456, | |
| "reward": 0.6144696623086929, | |
| "reward_std": 0.6529396076997122, | |
| "rewards/cosine_scaled_reward": 0.015568156183386842, | |
| "rewards/format_reward": 0.5833333482344946, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2781.777872721354, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 0.8222731351852417, | |
| "kl": 0.478515625, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.1093, | |
| "reward": 0.09490112960338593, | |
| "reward_std": 0.6362172613541285, | |
| "rewards/cosine_scaled_reward": -0.1469938817123572, | |
| "rewards/format_reward": 0.3888889004786809, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2547.2222900390625, | |
| "epoch": 0.372, | |
| "grad_norm": 1.1918792724609375, | |
| "kl": 0.4853515625, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.1031, | |
| "reward": 0.33123820275068283, | |
| "reward_std": 0.5770809849103292, | |
| "rewards/cosine_scaled_reward": -0.08438090638568004, | |
| "rewards/format_reward": 0.5000000012417635, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2428.7223205566406, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.742900013923645, | |
| "kl": 0.4697265625, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0601, | |
| "reward": 0.4406207002078493, | |
| "reward_std": 0.7823553284009298, | |
| "rewards/cosine_scaled_reward": -0.0644118661681811, | |
| "rewards/format_reward": 0.569444457689921, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2893.3194986979165, | |
| "epoch": 0.37542857142857144, | |
| "grad_norm": 1.2224547863006592, | |
| "kl": 0.5126953125, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.101, | |
| "reward": 0.097161748756965, | |
| "reward_std": 0.8393781532843908, | |
| "rewards/cosine_scaled_reward": -0.15280801647653183, | |
| "rewards/format_reward": 0.402777789781491, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2655.819539388021, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 0.6563175916671753, | |
| "kl": 0.537109375, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0872, | |
| "reward": 0.21256010606884956, | |
| "reward_std": 0.8274597724278768, | |
| "rewards/cosine_scaled_reward": -0.15066439720491567, | |
| "rewards/format_reward": 0.5138888992369175, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2961.6250813802085, | |
| "epoch": 0.37885714285714284, | |
| "grad_norm": 0.9792293310165405, | |
| "kl": 0.7268880208333334, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0844, | |
| "reward": 0.1597462377200524, | |
| "reward_std": 0.7385697315136591, | |
| "rewards/cosine_scaled_reward": -0.12846021602551141, | |
| "rewards/format_reward": 0.4166666741172473, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2461.9583943684897, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 2.1349949836730957, | |
| "kl": 0.5712890625, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0147, | |
| "reward": 0.517729893947641, | |
| "reward_std": 0.7363678812980652, | |
| "rewards/cosine_scaled_reward": -0.005023935188849767, | |
| "rewards/format_reward": 0.527777798473835, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3140.8472493489585, | |
| "epoch": 0.3822857142857143, | |
| "grad_norm": 1.151196002960205, | |
| "kl": 0.75390625, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.134, | |
| "reward": 0.03437145877008637, | |
| "reward_std": 0.6441821306943893, | |
| "rewards/cosine_scaled_reward": -0.15642539039254189, | |
| "rewards/format_reward": 0.3472222350537777, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3064.0972696940103, | |
| "epoch": 0.384, | |
| "grad_norm": 1.0468528270721436, | |
| "kl": 0.7708333333333334, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0504, | |
| "reward": -0.25790150215228397, | |
| "reward_std": 0.482844481865565, | |
| "rewards/cosine_scaled_reward": -0.3025618642568588, | |
| "rewards/format_reward": 0.3472222276031971, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2669.9444986979165, | |
| "epoch": 0.38571428571428573, | |
| "grad_norm": 0.921763002872467, | |
| "kl": 0.6188151041666666, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0955, | |
| "reward": 0.3052919792632262, | |
| "reward_std": 0.8954313198725382, | |
| "rewards/cosine_scaled_reward": -0.10429845812420051, | |
| "rewards/format_reward": 0.5138889079292616, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2575.0973103841147, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 1.6890478134155273, | |
| "kl": 0.4981282552083333, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": -0.0498, | |
| "reward": 0.06124853684256474, | |
| "reward_std": 0.597150057554245, | |
| "rewards/cosine_scaled_reward": -0.1777090662314246, | |
| "rewards/format_reward": 0.4166666741172473, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2854.527872721354, | |
| "epoch": 0.3891428571428571, | |
| "grad_norm": 1.6057039499282837, | |
| "kl": 0.5764973958333334, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0224, | |
| "reward": 0.31274983535210293, | |
| "reward_std": 0.8754556278387705, | |
| "rewards/cosine_scaled_reward": -0.06584731210023165, | |
| "rewards/format_reward": 0.4444444564481576, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2501.1667277018228, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 1.1685543060302734, | |
| "kl": 0.5205078125, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.1068, | |
| "reward": 0.6254536683360735, | |
| "reward_std": 0.8047119130690893, | |
| "rewards/cosine_scaled_reward": 0.007171270747979482, | |
| "rewards/format_reward": 0.6111111243565878, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2691.9861653645835, | |
| "epoch": 0.39257142857142857, | |
| "grad_norm": 1.0908252000808716, | |
| "kl": 0.5074869791666666, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0452, | |
| "reward": 0.4251248224948843, | |
| "reward_std": 0.8874272306760153, | |
| "rewards/cosine_scaled_reward": -0.09299315760533015, | |
| "rewards/format_reward": 0.6111111243565878, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2879.3194986979165, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 1.021728754043579, | |
| "kl": 0.6097005208333334, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.1111, | |
| "reward": 0.26515428101023036, | |
| "reward_std": 0.6410991052786509, | |
| "rewards/cosine_scaled_reward": -0.05492285639047623, | |
| "rewards/format_reward": 0.3750000099341075, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2719.8473307291665, | |
| "epoch": 0.396, | |
| "grad_norm": 1.2554185390472412, | |
| "kl": 0.4524739583333333, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0972, | |
| "reward": 0.32737448314825696, | |
| "reward_std": 0.6432693004608154, | |
| "rewards/cosine_scaled_reward": -0.09325722636034091, | |
| "rewards/format_reward": 0.5138888992369175, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2556.3473307291665, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 0.8875249624252319, | |
| "kl": 0.4931640625, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.082, | |
| "reward": 0.6148818656802177, | |
| "reward_std": 0.8026633958021799, | |
| "rewards/cosine_scaled_reward": 0.036607603232065834, | |
| "rewards/format_reward": 0.5416666803260645, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1979.5278727213542, | |
| "epoch": 0.3994285714285714, | |
| "grad_norm": 1.9907816648483276, | |
| "kl": 0.4049479166666667, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.1329, | |
| "reward": 0.8687802950541178, | |
| "reward_std": 0.9246738056341807, | |
| "rewards/cosine_scaled_reward": 0.06633459031581879, | |
| "rewards/format_reward": 0.7361111243565878, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2677.7639973958335, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 1.2829643487930298, | |
| "kl": 0.5794270833333334, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.1263, | |
| "reward": 0.45140206813812256, | |
| "reward_std": 0.6863954265912374, | |
| "rewards/cosine_scaled_reward": -0.05902118872230252, | |
| "rewards/format_reward": 0.5694444626569748, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2758.3472900390625, | |
| "epoch": 0.40285714285714286, | |
| "grad_norm": 1.9622496366500854, | |
| "kl": 0.7135416666666666, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0289, | |
| "reward": 0.22837330649296442, | |
| "reward_std": 0.6859817057847977, | |
| "rewards/cosine_scaled_reward": -0.14970224350690842, | |
| "rewards/format_reward": 0.5277777935067812, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2700.2084147135415, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 1.3918635845184326, | |
| "kl": 0.7838541666666666, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0739, | |
| "reward": 0.5495684078584114, | |
| "reward_std": 0.6962601939837137, | |
| "rewards/cosine_scaled_reward": -0.05160469561815262, | |
| "rewards/format_reward": 0.6527778059244156, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2417.166748046875, | |
| "epoch": 0.4062857142857143, | |
| "grad_norm": 1.6477667093276978, | |
| "kl": 0.7522786458333334, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0863, | |
| "reward": 0.37436943625410396, | |
| "reward_std": 0.5070767054955164, | |
| "rewards/cosine_scaled_reward": -0.11837084715565045, | |
| "rewards/format_reward": 0.6111111293236414, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2684.2361653645835, | |
| "epoch": 0.408, | |
| "grad_norm": 1.3266140222549438, | |
| "kl": 0.7845052083333334, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0796, | |
| "reward": 0.5846595851083597, | |
| "reward_std": 0.7575941383838654, | |
| "rewards/cosine_scaled_reward": -0.04794799474378427, | |
| "rewards/format_reward": 0.6805555721124014, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2514.9444986979165, | |
| "epoch": 0.4097142857142857, | |
| "grad_norm": 1.3039730787277222, | |
| "kl": 0.6119791666666666, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.099, | |
| "reward": 0.6142191051815947, | |
| "reward_std": 0.7915631234645844, | |
| "rewards/cosine_scaled_reward": 0.008498436460892359, | |
| "rewards/format_reward": 0.5972222338120142, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2022.0000813802083, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 2.0695018768310547, | |
| "kl": 0.5843098958333334, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": -0.0272, | |
| "reward": 0.8475765399634838, | |
| "reward_std": 0.7008786648511887, | |
| "rewards/cosine_scaled_reward": 0.09045492650087301, | |
| "rewards/format_reward": 0.6666666766007742, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2004.6528727213542, | |
| "epoch": 0.41314285714285715, | |
| "grad_norm": 0.8023363947868347, | |
| "kl": 0.4563802083333333, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0557, | |
| "reward": 0.6628641933202744, | |
| "reward_std": 0.7972640494505564, | |
| "rewards/cosine_scaled_reward": -0.008845687843859196, | |
| "rewards/format_reward": 0.6805555721124014, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2852.194539388021, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 1.4631015062332153, | |
| "kl": 0.724609375, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.137, | |
| "reward": 0.39045383781194687, | |
| "reward_std": 0.8659462332725525, | |
| "rewards/cosine_scaled_reward": -0.09643974993377924, | |
| "rewards/format_reward": 0.5833333532015482, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2759.6944986979165, | |
| "epoch": 0.4165714285714286, | |
| "grad_norm": 0.7083789706230164, | |
| "kl": 0.7708333333333334, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.1077, | |
| "reward": 0.2541711802283923, | |
| "reward_std": 0.8134296288092931, | |
| "rewards/cosine_scaled_reward": -0.1298588669548432, | |
| "rewards/format_reward": 0.5138889153798422, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2422.291707356771, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 0.9804867506027222, | |
| "kl": 0.5730794270833334, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0699, | |
| "reward": 0.3161549934496482, | |
| "reward_std": 0.6544978817303976, | |
| "rewards/cosine_scaled_reward": -0.09192252531647682, | |
| "rewards/format_reward": 0.5000000136593977, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1686.4444783528645, | |
| "epoch": 0.42, | |
| "grad_norm": 0.794727087020874, | |
| "kl": 0.4567057291666667, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0214, | |
| "reward": 0.7671606143315634, | |
| "reward_std": 0.6458114782969157, | |
| "rewards/cosine_scaled_reward": -0.012253028651078543, | |
| "rewards/format_reward": 0.7916666741172472, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2375.4444986979165, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 1.029865026473999, | |
| "kl": 0.5813802083333334, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.083, | |
| "reward": 0.8023618534207344, | |
| "reward_std": 0.6820149670044581, | |
| "rewards/cosine_scaled_reward": 0.033125363290309906, | |
| "rewards/format_reward": 0.7361111243565878, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2819.291788736979, | |
| "epoch": 0.42342857142857143, | |
| "grad_norm": 1.1775052547454834, | |
| "kl": 0.6790364583333334, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0957, | |
| "reward": 0.22988373103241125, | |
| "reward_std": 0.8021631240844727, | |
| "rewards/cosine_scaled_reward": -0.12811370752751827, | |
| "rewards/format_reward": 0.486111119389534, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2254.6389770507812, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 3.8226728439331055, | |
| "kl": 0.5494791666666666, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": -0.0441, | |
| "reward": 0.7102490166823069, | |
| "reward_std": 0.7107690672079722, | |
| "rewards/cosine_scaled_reward": -0.019875490417083103, | |
| "rewards/format_reward": 0.750000019868215, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2385.305623372396, | |
| "epoch": 0.4268571428571429, | |
| "grad_norm": 1.4578808546066284, | |
| "kl": 0.5494791666666666, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0833, | |
| "reward": 0.4758163373917341, | |
| "reward_std": 0.700827419757843, | |
| "rewards/cosine_scaled_reward": -0.08848073395589988, | |
| "rewards/format_reward": 0.6527777959903082, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2486.138956705729, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 1.2223151922225952, | |
| "kl": 0.5774739583333334, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0684, | |
| "reward": 0.44935302270945005, | |
| "reward_std": 0.8305058976014456, | |
| "rewards/cosine_scaled_reward": -0.08087907855709393, | |
| "rewards/format_reward": 0.6111111243565878, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2477.402872721354, | |
| "epoch": 0.43028571428571427, | |
| "grad_norm": 1.054969072341919, | |
| "kl": 0.5849609375, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.047, | |
| "reward": 0.8605522364377975, | |
| "reward_std": 0.9354294538497925, | |
| "rewards/cosine_scaled_reward": 0.048331660528977714, | |
| "rewards/format_reward": 0.7638889054457346, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2311.9861857096353, | |
| "epoch": 0.432, | |
| "grad_norm": 2.4419050216674805, | |
| "kl": 0.5869140625, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.1212, | |
| "reward": 0.4062855467200279, | |
| "reward_std": 0.7568367024262747, | |
| "rewards/cosine_scaled_reward": -0.16491279751062393, | |
| "rewards/format_reward": 0.7361111243565878, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2629.541748046875, | |
| "epoch": 0.4337142857142857, | |
| "grad_norm": 1.7984837293624878, | |
| "kl": 0.7083333333333334, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.1168, | |
| "reward": 0.5991527636845907, | |
| "reward_std": 1.061007301012675, | |
| "rewards/cosine_scaled_reward": -0.04764582713445028, | |
| "rewards/format_reward": 0.6944444477558136, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2593.430623372396, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 1.6819883584976196, | |
| "kl": 0.7115885416666666, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0679, | |
| "reward": 0.5450623606642088, | |
| "reward_std": 0.6844839950402578, | |
| "rewards/cosine_scaled_reward": -0.0885799415409565, | |
| "rewards/format_reward": 0.722222238779068, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2515.3750203450522, | |
| "epoch": 0.43714285714285717, | |
| "grad_norm": 2.012617826461792, | |
| "kl": 0.6770833333333334, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0077, | |
| "reward": 0.6203830689191818, | |
| "reward_std": 0.7368340541919073, | |
| "rewards/cosine_scaled_reward": -0.03703069780021906, | |
| "rewards/format_reward": 0.694444457689921, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2365.777791341146, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 1.526092767715454, | |
| "kl": 0.65234375, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0693, | |
| "reward": 0.2759497178097566, | |
| "reward_std": 0.665930817524592, | |
| "rewards/cosine_scaled_reward": -0.19535848374168077, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2113.3056437174478, | |
| "epoch": 0.44057142857142856, | |
| "grad_norm": 0.6899253726005554, | |
| "kl": 0.4892578125, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.065, | |
| "reward": 0.8705088198184967, | |
| "reward_std": 0.7922781060139338, | |
| "rewards/cosine_scaled_reward": 0.03942106167475382, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2093.138956705729, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 1.339942216873169, | |
| "kl": 0.4703776041666667, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.013, | |
| "reward": 0.44951390971740085, | |
| "reward_std": 0.6888295412063599, | |
| "rewards/cosine_scaled_reward": -0.1294097180167834, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1974.4306030273438, | |
| "epoch": 0.444, | |
| "grad_norm": 1.3252131938934326, | |
| "kl": 0.3956705729166667, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": -0.0225, | |
| "reward": 0.96558295438687, | |
| "reward_std": 0.6466954797506332, | |
| "rewards/cosine_scaled_reward": 0.08001367375254631, | |
| "rewards/format_reward": 0.8055555820465088, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2373.0417277018228, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 1.0619585514068604, | |
| "kl": 0.6057942708333334, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0403, | |
| "reward": 0.8011937191088995, | |
| "reward_std": 0.6582668324311575, | |
| "rewards/cosine_scaled_reward": -0.04384759394451976, | |
| "rewards/format_reward": 0.8888889054457346, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1771.138936360677, | |
| "epoch": 0.44742857142857145, | |
| "grad_norm": 1.7330862283706665, | |
| "kl": 0.2923177083333333, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0036, | |
| "reward": 0.6879289001226425, | |
| "reward_std": 0.7631189922491709, | |
| "rewards/cosine_scaled_reward": -0.051868884513775505, | |
| "rewards/format_reward": 0.791666696468989, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2371.4862060546875, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 1.004339575767517, | |
| "kl": 0.412109375, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0439, | |
| "reward": 0.8089919307579597, | |
| "reward_std": 0.9169580042362213, | |
| "rewards/cosine_scaled_reward": 0.05727372753123442, | |
| "rewards/format_reward": 0.6944444676240286, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2095.3472696940103, | |
| "epoch": 0.45085714285714285, | |
| "grad_norm": 2.7382376194000244, | |
| "kl": 0.4140625, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.1085, | |
| "reward": 0.41056395694613457, | |
| "reward_std": 0.6896774967511495, | |
| "rewards/cosine_scaled_reward": -0.13499580944577852, | |
| "rewards/format_reward": 0.6805555820465088, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2220.5000813802085, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 2.29777455329895, | |
| "kl": 0.4033203125, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": -0.0707, | |
| "reward": 0.4301049162944158, | |
| "reward_std": 0.7033511698246002, | |
| "rewards/cosine_scaled_reward": -0.10439199861139059, | |
| "rewards/format_reward": 0.6388889104127884, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2194.527872721354, | |
| "epoch": 0.4542857142857143, | |
| "grad_norm": 1.217092514038086, | |
| "kl": 0.3553059895833333, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0359, | |
| "reward": 1.044247900446256, | |
| "reward_std": 0.7928864806890488, | |
| "rewards/cosine_scaled_reward": 0.19573504539827505, | |
| "rewards/format_reward": 0.6527777910232544, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2299.4723307291665, | |
| "epoch": 0.456, | |
| "grad_norm": 2.3428361415863037, | |
| "kl": 0.3310546875, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.122, | |
| "reward": 0.5011805972705284, | |
| "reward_std": 0.8234343528747559, | |
| "rewards/cosine_scaled_reward": -0.041076372688015304, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2005.8195190429688, | |
| "epoch": 0.45771428571428574, | |
| "grad_norm": 0.8565515875816345, | |
| "kl": 0.33056640625, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0298, | |
| "reward": 0.48155983661611873, | |
| "reward_std": 0.66623854637146, | |
| "rewards/cosine_scaled_reward": -0.12033120046059291, | |
| "rewards/format_reward": 0.7222222487131754, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2255.541707356771, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 0.9378978610038757, | |
| "kl": 0.36962890625, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": -0.0187, | |
| "reward": 0.5376216843724251, | |
| "reward_std": 0.800332690278689, | |
| "rewards/cosine_scaled_reward": -0.057578048358360924, | |
| "rewards/format_reward": 0.6527777910232544, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2507.888956705729, | |
| "epoch": 0.46114285714285713, | |
| "grad_norm": 0.8739426136016846, | |
| "kl": 0.3658854166666667, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0227, | |
| "reward": 0.6940428738792738, | |
| "reward_std": 0.7719042052825292, | |
| "rewards/cosine_scaled_reward": 0.027576979249715805, | |
| "rewards/format_reward": 0.638888900478681, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2235.013916015625, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 0.9512042999267578, | |
| "kl": 0.27734375, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0205, | |
| "reward": 0.8053374737501144, | |
| "reward_std": 0.6894664441545805, | |
| "rewards/cosine_scaled_reward": 0.034613192081451416, | |
| "rewards/format_reward": 0.7361111243565878, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2215.3611857096353, | |
| "epoch": 0.4645714285714286, | |
| "grad_norm": 0.7835772037506104, | |
| "kl": 0.3251953125, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0337, | |
| "reward": 0.49012333899736404, | |
| "reward_std": 0.8560872872670492, | |
| "rewards/cosine_scaled_reward": -0.053549456099669136, | |
| "rewards/format_reward": 0.5972222288449606, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2071.6944783528647, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 0.9682870507240295, | |
| "kl": 0.278076171875, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.088, | |
| "reward": 0.604171751687924, | |
| "reward_std": 0.6147580544153849, | |
| "rewards/cosine_scaled_reward": -0.10069191455841064, | |
| "rewards/format_reward": 0.8055555721124014, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2155.8056030273438, | |
| "epoch": 0.468, | |
| "grad_norm": 0.7006252408027649, | |
| "kl": 0.24723307291666666, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.071, | |
| "reward": 0.6659345651666323, | |
| "reward_std": 0.6156023293733597, | |
| "rewards/cosine_scaled_reward": 0.04824505373835564, | |
| "rewards/format_reward": 0.5694444676240286, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2206.875040690104, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 1.4718269109725952, | |
| "kl": 0.29541015625, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": -0.0375, | |
| "reward": 0.17582272396733364, | |
| "reward_std": 0.6097265183925629, | |
| "rewards/cosine_scaled_reward": -0.2245886487265428, | |
| "rewards/format_reward": 0.6250000298023224, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2460.513916015625, | |
| "epoch": 0.4714285714285714, | |
| "grad_norm": 1.5263134241104126, | |
| "kl": 0.3785807291666667, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0947, | |
| "reward": 0.24748976714909077, | |
| "reward_std": 0.732678105433782, | |
| "rewards/cosine_scaled_reward": -0.13319957504669824, | |
| "rewards/format_reward": 0.5138889029622078, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2293.2361857096353, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 0.6164983510971069, | |
| "kl": 0.3359375, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.061, | |
| "reward": 0.3768885980049769, | |
| "reward_std": 0.6698301434516907, | |
| "rewards/cosine_scaled_reward": -0.09627792984247208, | |
| "rewards/format_reward": 0.569444457689921, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2532.8750610351562, | |
| "epoch": 0.47485714285714287, | |
| "grad_norm": 1.1975572109222412, | |
| "kl": 0.4314778645833333, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0882, | |
| "reward": 0.3943765697379907, | |
| "reward_std": 0.905136227607727, | |
| "rewards/cosine_scaled_reward": -0.045867277309298515, | |
| "rewards/format_reward": 0.4861111293236415, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2375.513956705729, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 0.937203586101532, | |
| "kl": 0.3907877604166667, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0572, | |
| "reward": 0.56180848646909, | |
| "reward_std": 0.7609256953001022, | |
| "rewards/cosine_scaled_reward": -0.059373569985230766, | |
| "rewards/format_reward": 0.6805555721124014, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2525.0417277018228, | |
| "epoch": 0.47828571428571426, | |
| "grad_norm": 2.170426607131958, | |
| "kl": 0.4606119791666667, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.1603, | |
| "reward": 0.3840874930222829, | |
| "reward_std": 0.8736404478549957, | |
| "rewards/cosine_scaled_reward": -0.12045624665915966, | |
| "rewards/format_reward": 0.6250000099341074, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2001.7223103841145, | |
| "epoch": 0.48, | |
| "grad_norm": 1.6350005865097046, | |
| "kl": 0.3879801432291667, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0682, | |
| "reward": 0.5671223948399226, | |
| "reward_std": 0.8623219728469849, | |
| "rewards/cosine_scaled_reward": -0.04977213963866234, | |
| "rewards/format_reward": 0.6666666766007742, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2128.166707356771, | |
| "epoch": 0.4817142857142857, | |
| "grad_norm": 1.9210960865020752, | |
| "kl": 0.5135091145833334, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.1009, | |
| "reward": 0.5354955531656742, | |
| "reward_std": 0.8274398694435755, | |
| "rewards/cosine_scaled_reward": -0.06558556606372197, | |
| "rewards/format_reward": 0.666666696468989, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2006.791748046875, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 1.6356080770492554, | |
| "kl": 0.4724934895833333, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.044, | |
| "reward": 0.5969029689828554, | |
| "reward_std": 0.6455038587252299, | |
| "rewards/cosine_scaled_reward": -0.07654852420091629, | |
| "rewards/format_reward": 0.7500000099341074, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2071.9584147135415, | |
| "epoch": 0.48514285714285715, | |
| "grad_norm": 3.145484447479248, | |
| "kl": 0.50390625, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.1914, | |
| "reward": 0.6117152844866117, | |
| "reward_std": 0.8820924858252207, | |
| "rewards/cosine_scaled_reward": -0.04136457946151495, | |
| "rewards/format_reward": 0.694444477558136, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1353.138916015625, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 0.7648276090621948, | |
| "kl": 0.19087727864583334, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": -0.0233, | |
| "reward": 0.7949296062191328, | |
| "reward_std": 0.8057575623194376, | |
| "rewards/cosine_scaled_reward": -0.00531297301252683, | |
| "rewards/format_reward": 0.8055555820465088, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2116.3334350585938, | |
| "epoch": 0.48857142857142855, | |
| "grad_norm": 1.6355130672454834, | |
| "kl": 0.6466471354166666, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0967, | |
| "reward": 0.5773124222954115, | |
| "reward_std": 0.6827881832917532, | |
| "rewards/cosine_scaled_reward": -0.0585660122645398, | |
| "rewards/format_reward": 0.694444457689921, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2211.1667277018228, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 2.118144989013672, | |
| "kl": 0.78515625, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0694, | |
| "reward": 0.5460888457794985, | |
| "reward_std": 0.7309039731820425, | |
| "rewards/cosine_scaled_reward": -0.046400028901795544, | |
| "rewards/format_reward": 0.6388889104127884, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2121.6528523763022, | |
| "epoch": 0.492, | |
| "grad_norm": 1.354079008102417, | |
| "kl": 0.7288411458333334, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.045, | |
| "reward": 0.4143553910156091, | |
| "reward_std": 0.6491135756174723, | |
| "rewards/cosine_scaled_reward": -0.1331000936528047, | |
| "rewards/format_reward": 0.6805555820465088, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1698.1944986979167, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 3.493518352508545, | |
| "kl": 0.5521647135416666, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0567, | |
| "reward": 0.40716485182444256, | |
| "reward_std": 0.5465908547242483, | |
| "rewards/cosine_scaled_reward": -0.1853064758082231, | |
| "rewards/format_reward": 0.777777781089147, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1824.027852376302, | |
| "epoch": 0.49542857142857144, | |
| "grad_norm": 4.566341876983643, | |
| "kl": 0.4425455729166667, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.2121, | |
| "reward": 0.8162123672664165, | |
| "reward_std": 0.9079096416632334, | |
| "rewards/cosine_scaled_reward": 0.033106171836455665, | |
| "rewards/format_reward": 0.750000019868215, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1446.777852376302, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 1.3814009428024292, | |
| "kl": 0.2623697916666667, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0802, | |
| "reward": 0.7263295451800028, | |
| "reward_std": 0.6949130694071451, | |
| "rewards/cosine_scaled_reward": -0.0812796950340271, | |
| "rewards/format_reward": 0.8888889054457346, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1938.4167378743489, | |
| "epoch": 0.49885714285714283, | |
| "grad_norm": 1.1442235708236694, | |
| "kl": 0.5257364908854166, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.1135, | |
| "reward": 0.7018542202810446, | |
| "reward_std": 0.8068165977795919, | |
| "rewards/cosine_scaled_reward": -0.010184012353420258, | |
| "rewards/format_reward": 0.722222238779068, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1429.7500508626301, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 0.8907753229141235, | |
| "kl": 0.2737630208333333, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0892, | |
| "reward": 0.7117127279440562, | |
| "reward_std": 0.7207831740379333, | |
| "rewards/cosine_scaled_reward": -0.08164365869015455, | |
| "rewards/format_reward": 0.8750000099341074, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1907.4861450195312, | |
| "epoch": 0.5022857142857143, | |
| "grad_norm": 2.139615058898926, | |
| "kl": 0.6721598307291666, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.0884, | |
| "reward": 0.510564868648847, | |
| "reward_std": 0.5789864038427671, | |
| "rewards/cosine_scaled_reward": -0.11277312971651554, | |
| "rewards/format_reward": 0.7361111243565878, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1574.3056030273438, | |
| "epoch": 0.504, | |
| "grad_norm": 1.177461862564087, | |
| "kl": 0.3779703776041667, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0378, | |
| "reward": 0.8885930180549622, | |
| "reward_std": 0.6348190978169441, | |
| "rewards/cosine_scaled_reward": 0.041518718004226685, | |
| "rewards/format_reward": 0.8055555820465088, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1597.4722696940105, | |
| "epoch": 0.5057142857142857, | |
| "grad_norm": 2.63672137260437, | |
| "kl": 0.3726806640625, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.1681, | |
| "reward": 0.7374154428641001, | |
| "reward_std": 0.8401837547620138, | |
| "rewards/cosine_scaled_reward": -0.034070080456634365, | |
| "rewards/format_reward": 0.8055555721124014, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2060.5972900390625, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 0.9306944608688354, | |
| "kl": 0.556640625, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.1311, | |
| "reward": 0.792794277270635, | |
| "reward_std": 0.8793422281742096, | |
| "rewards/cosine_scaled_reward": 0.04917491475741068, | |
| "rewards/format_reward": 0.694444457689921, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1878.6945190429688, | |
| "epoch": 0.5091428571428571, | |
| "grad_norm": 1.0184504985809326, | |
| "kl": 0.5145060221354166, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.1255, | |
| "reward": 0.4036409060160319, | |
| "reward_std": 0.6954380199313164, | |
| "rewards/cosine_scaled_reward": -0.15929067445298037, | |
| "rewards/format_reward": 0.7222222288449606, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1867.1667073567708, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 2.424537181854248, | |
| "kl": 0.61328125, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.1332, | |
| "reward": 0.5895284209400415, | |
| "reward_std": 0.8056660344203314, | |
| "rewards/cosine_scaled_reward": -0.0455135852098465, | |
| "rewards/format_reward": 0.680555579562982, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1496.2361653645833, | |
| "epoch": 0.5125714285714286, | |
| "grad_norm": 3.2494192123413086, | |
| "kl": 0.302734375, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.2394, | |
| "reward": 0.9463320672512054, | |
| "reward_std": 0.8579710175593694, | |
| "rewards/cosine_scaled_reward": 0.07038825005292892, | |
| "rewards/format_reward": 0.8055555820465088, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2018.8334045410156, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 2.1177589893341064, | |
| "kl": 0.728515625, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.1215, | |
| "reward": 0.42369461866716546, | |
| "reward_std": 0.626913865407308, | |
| "rewards/cosine_scaled_reward": -0.10065270067813496, | |
| "rewards/format_reward": 0.6250000099341074, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1759.1945190429688, | |
| "epoch": 0.516, | |
| "grad_norm": 1.3609451055526733, | |
| "kl": 0.6129557291666666, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.1594, | |
| "reward": 0.4863266460597515, | |
| "reward_std": 0.6535109877586365, | |
| "rewards/cosine_scaled_reward": -0.1526700264463822, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2371.40283203125, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 1.5224748849868774, | |
| "kl": 0.9853515625, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.1954, | |
| "reward": 0.3445633811255296, | |
| "reward_std": 0.8744351814190546, | |
| "rewards/cosine_scaled_reward": -0.1263294412444035, | |
| "rewards/format_reward": 0.5972222437461218, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1883.388916015625, | |
| "epoch": 0.5194285714285715, | |
| "grad_norm": 1.7124310731887817, | |
| "kl": 0.6731770833333334, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.201, | |
| "reward": 0.4631364569067955, | |
| "reward_std": 0.6874663432439169, | |
| "rewards/cosine_scaled_reward": -0.14343177201226354, | |
| "rewards/format_reward": 0.750000019868215, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1798.0139567057292, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 1.490600824356079, | |
| "kl": 0.5875651041666666, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0632, | |
| "reward": 0.8596306045850118, | |
| "reward_std": 0.6755616863568624, | |
| "rewards/cosine_scaled_reward": 0.04787082721789678, | |
| "rewards/format_reward": 0.7638889153798422, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1948.6250610351562, | |
| "epoch": 0.5228571428571429, | |
| "grad_norm": 2.2148995399475098, | |
| "kl": 0.7317708333333334, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0873, | |
| "reward": 0.6427298511068026, | |
| "reward_std": 0.7944980164368948, | |
| "rewards/cosine_scaled_reward": -0.06057953586181005, | |
| "rewards/format_reward": 0.7638889153798422, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1753.263916015625, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 1.741482138633728, | |
| "kl": 0.5782877604166666, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.14, | |
| "reward": 0.5987357745567957, | |
| "reward_std": 0.7464914421240488, | |
| "rewards/cosine_scaled_reward": -0.061743236457308136, | |
| "rewards/format_reward": 0.722222238779068, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2037.4862060546875, | |
| "epoch": 0.5262857142857142, | |
| "grad_norm": 1.1908149719238281, | |
| "kl": 0.64404296875, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.1031, | |
| "reward": 0.5286059752106667, | |
| "reward_std": 0.8457674185434977, | |
| "rewards/cosine_scaled_reward": -0.08986368278662364, | |
| "rewards/format_reward": 0.7083333532015482, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2182.7639363606772, | |
| "epoch": 0.528, | |
| "grad_norm": 1.169242024421692, | |
| "kl": 0.7589518229166666, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.1726, | |
| "reward": 0.6471676652630171, | |
| "reward_std": 0.8749262392520905, | |
| "rewards/cosine_scaled_reward": 0.004139383633931478, | |
| "rewards/format_reward": 0.6388889054457346, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1855.777852376302, | |
| "epoch": 0.5297142857142857, | |
| "grad_norm": 1.0205694437026978, | |
| "kl": 0.6122029622395834, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.1003, | |
| "reward": 0.7865540675508479, | |
| "reward_std": 0.8031655053297678, | |
| "rewards/cosine_scaled_reward": 0.03216590483983358, | |
| "rewards/format_reward": 0.7222222288449606, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2336.2500610351562, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 1.6022837162017822, | |
| "kl": 0.91796875, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.2071, | |
| "reward": 0.36028168102105457, | |
| "reward_std": 0.7865394751230875, | |
| "rewards/cosine_scaled_reward": -0.11152582863966624, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2217.2778523763022, | |
| "epoch": 0.5331428571428571, | |
| "grad_norm": 0.9660167098045349, | |
| "kl": 0.65087890625, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.1467, | |
| "reward": 0.4127930849790573, | |
| "reward_std": 0.748573382695516, | |
| "rewards/cosine_scaled_reward": -0.17554792222411683, | |
| "rewards/format_reward": 0.7638889054457346, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1980.2361653645833, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 1.4037264585494995, | |
| "kl": 0.650390625, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.1445, | |
| "reward": 0.7002697885036469, | |
| "reward_std": 0.622652659813563, | |
| "rewards/cosine_scaled_reward": 0.002912662923336029, | |
| "rewards/format_reward": 0.694444457689921, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1648.4306030273438, | |
| "epoch": 0.5365714285714286, | |
| "grad_norm": 2.3723721504211426, | |
| "kl": 0.6311848958333334, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.1003, | |
| "reward": 0.7444769740104675, | |
| "reward_std": 0.694429432352384, | |
| "rewards/cosine_scaled_reward": -0.044428194562594094, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2126.819539388021, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 2.8545262813568115, | |
| "kl": 0.7340494791666666, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.102, | |
| "reward": 0.5754300641516844, | |
| "reward_std": 0.9160548051198324, | |
| "rewards/cosine_scaled_reward": -0.05256276298314333, | |
| "rewards/format_reward": 0.6805555721124014, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1566.000020345052, | |
| "epoch": 0.54, | |
| "grad_norm": 2.3709523677825928, | |
| "kl": 0.6233723958333334, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.1876, | |
| "reward": 0.6666001776854197, | |
| "reward_std": 0.7325516641139984, | |
| "rewards/cosine_scaled_reward": -0.09031105739995837, | |
| "rewards/format_reward": 0.847222238779068, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1738.7361246744792, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 2.186328887939453, | |
| "kl": 0.6171061197916666, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0583, | |
| "reward": 0.5522626154124737, | |
| "reward_std": 0.6918301979700724, | |
| "rewards/cosine_scaled_reward": -0.1266464803678294, | |
| "rewards/format_reward": 0.8055555621782938, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2365.263956705729, | |
| "epoch": 0.5434285714285715, | |
| "grad_norm": 2.4845731258392334, | |
| "kl": 0.8971354166666666, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.1195, | |
| "reward": 0.354046537540853, | |
| "reward_std": 0.5743257055679957, | |
| "rewards/cosine_scaled_reward": -0.13547672952214876, | |
| "rewards/format_reward": 0.6250000037252903, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1331.9722595214844, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 1.1837691068649292, | |
| "kl": 0.324951171875, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0795, | |
| "reward": 1.3221057256062825, | |
| "reward_std": 0.6699743270874023, | |
| "rewards/cosine_scaled_reward": 0.19577506929636002, | |
| "rewards/format_reward": 0.9305555621782938, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1687.416727701823, | |
| "epoch": 0.5468571428571428, | |
| "grad_norm": 1.5313892364501953, | |
| "kl": 0.423828125, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0828, | |
| "reward": 0.7049875284234682, | |
| "reward_std": 0.5341855933268865, | |
| "rewards/cosine_scaled_reward": -0.050284020602703094, | |
| "rewards/format_reward": 0.8055555721124014, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1963.3333638509114, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 0.8704672455787659, | |
| "kl": 0.6151529947916666, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.155, | |
| "reward": 0.9678831398487091, | |
| "reward_std": 0.8379487742980322, | |
| "rewards/cosine_scaled_reward": 0.10199710850914319, | |
| "rewards/format_reward": 0.7638889153798422, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2094.180623372396, | |
| "epoch": 0.5502857142857143, | |
| "grad_norm": 1.842789888381958, | |
| "kl": 0.5999348958333334, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.2119, | |
| "reward": 0.24729357163111368, | |
| "reward_std": 0.6844222048918406, | |
| "rewards/cosine_scaled_reward": -0.2166309952735901, | |
| "rewards/format_reward": 0.6805555721124014, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1753.6250610351562, | |
| "epoch": 0.552, | |
| "grad_norm": 0.6552063226699829, | |
| "kl": 0.4476725260416667, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0657, | |
| "reward": 0.8655827442804972, | |
| "reward_std": 0.7572722683350245, | |
| "rewards/cosine_scaled_reward": 0.0022358112037181854, | |
| "rewards/format_reward": 0.8611111243565878, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2107.2084147135415, | |
| "epoch": 0.5537142857142857, | |
| "grad_norm": 1.9173928499221802, | |
| "kl": 0.5352376302083334, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.087, | |
| "reward": 0.7162150360333422, | |
| "reward_std": 0.4351879407962163, | |
| "rewards/cosine_scaled_reward": 0.003940838078657786, | |
| "rewards/format_reward": 0.7083333482344946, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1855.9306538899739, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 0.974723756313324, | |
| "kl": 0.5054524739583334, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.133, | |
| "reward": 0.8169851501782736, | |
| "reward_std": 0.8269292811552683, | |
| "rewards/cosine_scaled_reward": 0.012659239039445916, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2389.291748046875, | |
| "epoch": 0.5571428571428572, | |
| "grad_norm": 0.9530824422836304, | |
| "kl": 0.7034505208333334, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.1551, | |
| "reward": 0.3352165271838506, | |
| "reward_std": 0.7478228956460953, | |
| "rewards/cosine_scaled_reward": -0.1379473035534223, | |
| "rewards/format_reward": 0.6111111293236414, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1728.5555826822917, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 0.780616819858551, | |
| "kl": 0.3742879231770833, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0722, | |
| "reward": 0.8215913027524948, | |
| "reward_std": 0.6323897987604141, | |
| "rewards/cosine_scaled_reward": 0.021906748414039612, | |
| "rewards/format_reward": 0.7777778009573618, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2125.402872721354, | |
| "epoch": 0.5605714285714286, | |
| "grad_norm": 1.645381212234497, | |
| "kl": 0.56689453125, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0979, | |
| "reward": 0.5315917382637659, | |
| "reward_std": 0.7190234859784445, | |
| "rewards/cosine_scaled_reward": -0.05364859973390897, | |
| "rewards/format_reward": 0.6388889054457346, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1913.6250406901042, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 2.42509126663208, | |
| "kl": 0.3966471354166667, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.1815, | |
| "reward": 0.7333748464783033, | |
| "reward_std": 0.7171944777170817, | |
| "rewards/cosine_scaled_reward": -0.029145926237106323, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1638.4722696940105, | |
| "epoch": 0.564, | |
| "grad_norm": 0.3869437575340271, | |
| "kl": 0.2791951497395833, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0902, | |
| "reward": 0.697057361404101, | |
| "reward_std": 0.7673922777175903, | |
| "rewards/cosine_scaled_reward": -0.08202686998993158, | |
| "rewards/format_reward": 0.8611111243565878, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2337.4583943684897, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 2.004044532775879, | |
| "kl": 0.5452473958333334, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.1357, | |
| "reward": 0.58099132279555, | |
| "reward_std": 0.7395560791095098, | |
| "rewards/cosine_scaled_reward": -0.056726557513078056, | |
| "rewards/format_reward": 0.694444477558136, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2154.1111857096353, | |
| "epoch": 0.5674285714285714, | |
| "grad_norm": 1.2939128875732422, | |
| "kl": 0.5686848958333334, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.1441, | |
| "reward": 0.5753989368677139, | |
| "reward_std": 0.8985026180744171, | |
| "rewards/cosine_scaled_reward": -0.031744986617316805, | |
| "rewards/format_reward": 0.6388889054457346, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2247.1111857096353, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 1.6428916454315186, | |
| "kl": 0.5973307291666666, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.1237, | |
| "reward": 0.5965733677148819, | |
| "reward_std": 0.8030048410097758, | |
| "rewards/cosine_scaled_reward": -0.06976890688141187, | |
| "rewards/format_reward": 0.7361111342906952, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2405.555623372396, | |
| "epoch": 0.5708571428571428, | |
| "grad_norm": 1.464685082435608, | |
| "kl": 0.5660807291666666, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0918, | |
| "reward": 0.24896016468604407, | |
| "reward_std": 0.5915202548106512, | |
| "rewards/cosine_scaled_reward": -0.1602421539525191, | |
| "rewards/format_reward": 0.5694444552063942, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2515.0833943684897, | |
| "epoch": 0.5725714285714286, | |
| "grad_norm": 2.0023365020751953, | |
| "kl": 0.638671875, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0715, | |
| "reward": 0.1347294623653094, | |
| "reward_std": 0.6633831461270651, | |
| "rewards/cosine_scaled_reward": -0.1965241671229402, | |
| "rewards/format_reward": 0.5277777959903082, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2115.5000610351562, | |
| "epoch": 0.5742857142857143, | |
| "grad_norm": 1.5139273405075073, | |
| "kl": 0.5081380208333334, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.1252, | |
| "reward": 0.5351596586406231, | |
| "reward_std": 0.6959919532140096, | |
| "rewards/cosine_scaled_reward": -0.10047572727004687, | |
| "rewards/format_reward": 0.7361111342906952, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1951.02783203125, | |
| "epoch": 0.576, | |
| "grad_norm": 1.7731704711914062, | |
| "kl": 0.3974609375, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0831, | |
| "reward": 0.2905444505934914, | |
| "reward_std": 0.5701153924067816, | |
| "rewards/cosine_scaled_reward": -0.22278334324558577, | |
| "rewards/format_reward": 0.7361111243565878, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1892.0833841959636, | |
| "epoch": 0.5777142857142857, | |
| "grad_norm": 1.0699217319488525, | |
| "kl": 0.3992513020833333, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0297, | |
| "reward": 0.6083886424700419, | |
| "reward_std": 0.7866451044877371, | |
| "rewards/cosine_scaled_reward": -0.0708056998749574, | |
| "rewards/format_reward": 0.750000019868215, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1777.625020345052, | |
| "epoch": 0.5794285714285714, | |
| "grad_norm": 0.6811065673828125, | |
| "kl": 0.22542317708333334, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0332, | |
| "reward": 0.8551764388879141, | |
| "reward_std": 0.878966490427653, | |
| "rewards/cosine_scaled_reward": 0.00397710253794988, | |
| "rewards/format_reward": 0.847222238779068, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1847.9305826822917, | |
| "epoch": 0.5811428571428572, | |
| "grad_norm": 2.3545331954956055, | |
| "kl": 0.24153645833333334, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.1222, | |
| "reward": 0.5180872877438863, | |
| "reward_std": 0.6465301861365637, | |
| "rewards/cosine_scaled_reward": -0.12984526778260866, | |
| "rewards/format_reward": 0.7777778108914694, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2060.6250813802085, | |
| "epoch": 0.5828571428571429, | |
| "grad_norm": 1.6732137203216553, | |
| "kl": 0.2791341145833333, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.1809, | |
| "reward": 0.3110894759496053, | |
| "reward_std": 0.6754192064205805, | |
| "rewards/cosine_scaled_reward": -0.16389971474806467, | |
| "rewards/format_reward": 0.6388889029622078, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1794.138936360677, | |
| "epoch": 0.5845714285714285, | |
| "grad_norm": 0.6482177972793579, | |
| "kl": 0.22782389322916666, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0242, | |
| "reward": 0.22059367721279463, | |
| "reward_std": 0.4378715306520462, | |
| "rewards/cosine_scaled_reward": -0.2647031719485919, | |
| "rewards/format_reward": 0.7500000099341074, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2373.555623372396, | |
| "epoch": 0.5862857142857143, | |
| "grad_norm": 1.4188843965530396, | |
| "kl": 0.3212890625, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.1317, | |
| "reward": 0.6352181024849415, | |
| "reward_std": 0.8267830014228821, | |
| "rewards/cosine_scaled_reward": -0.036557632188002266, | |
| "rewards/format_reward": 0.7083333631356558, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1830.777811686198, | |
| "epoch": 0.588, | |
| "grad_norm": 1.6553319692611694, | |
| "kl": 0.242431640625, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.1346, | |
| "reward": 0.703708291053772, | |
| "reward_std": 0.7348743031422297, | |
| "rewards/cosine_scaled_reward": -0.0370347515369455, | |
| "rewards/format_reward": 0.7777777910232544, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1918.3472696940105, | |
| "epoch": 0.5897142857142857, | |
| "grad_norm": 1.0326299667358398, | |
| "kl": 0.3053385416666667, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0747, | |
| "reward": 0.3245452443758647, | |
| "reward_std": 0.674353172381719, | |
| "rewards/cosine_scaled_reward": -0.20578294371565184, | |
| "rewards/format_reward": 0.7361111342906952, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1848.3194986979167, | |
| "epoch": 0.5914285714285714, | |
| "grad_norm": 1.297278881072998, | |
| "kl": 0.248291015625, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0945, | |
| "reward": 0.963769598553578, | |
| "reward_std": 0.9214291075865427, | |
| "rewards/cosine_scaled_reward": 0.09299589941898982, | |
| "rewards/format_reward": 0.7777777910232544, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1926.916727701823, | |
| "epoch": 0.5931428571428572, | |
| "grad_norm": 1.1500418186187744, | |
| "kl": 0.2668863932291667, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0482, | |
| "reward": 0.8292954713106155, | |
| "reward_std": 0.7426730096340179, | |
| "rewards/cosine_scaled_reward": -0.002018950879573822, | |
| "rewards/format_reward": 0.8333333532015482, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1988.8611653645833, | |
| "epoch": 0.5948571428571429, | |
| "grad_norm": 1.0159980058670044, | |
| "kl": 0.3473307291666667, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.1357, | |
| "reward": 0.42399795974294346, | |
| "reward_std": 0.7303314308325449, | |
| "rewards/cosine_scaled_reward": -0.1630010406176249, | |
| "rewards/format_reward": 0.750000019868215, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1968.5139770507812, | |
| "epoch": 0.5965714285714285, | |
| "grad_norm": 1.025205373764038, | |
| "kl": 0.3509928385416667, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.1001, | |
| "reward": 0.44912690420945484, | |
| "reward_std": 0.48930580417315167, | |
| "rewards/cosine_scaled_reward": -0.17126989364624023, | |
| "rewards/format_reward": 0.7916666766007742, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2184.4583740234375, | |
| "epoch": 0.5982857142857143, | |
| "grad_norm": 0.42495182156562805, | |
| "kl": 0.3214518229166667, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.078, | |
| "reward": 0.7055053611596426, | |
| "reward_std": 0.676128163933754, | |
| "rewards/cosine_scaled_reward": -0.008358433842658997, | |
| "rewards/format_reward": 0.7222222362955412, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2091.7916870117188, | |
| "epoch": 0.6, | |
| "grad_norm": 3.5313339233398438, | |
| "kl": 0.3573404947916667, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.2022, | |
| "reward": 0.6713175723950068, | |
| "reward_std": 1.0513839721679688, | |
| "rewards/cosine_scaled_reward": -0.04628566776712736, | |
| "rewards/format_reward": 0.7638889054457346, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1881.5000406901042, | |
| "epoch": 0.6017142857142858, | |
| "grad_norm": 1.7608799934387207, | |
| "kl": 0.3059895833333333, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.1153, | |
| "reward": 0.8349494735399882, | |
| "reward_std": 0.7551977684100469, | |
| "rewards/cosine_scaled_reward": 0.007752507925033569, | |
| "rewards/format_reward": 0.8194444676240286, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2189.8750610351562, | |
| "epoch": 0.6034285714285714, | |
| "grad_norm": 1.3282064199447632, | |
| "kl": 0.576171875, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.1313, | |
| "reward": 0.2600073926150799, | |
| "reward_std": 0.668842022617658, | |
| "rewards/cosine_scaled_reward": -0.21027409036954245, | |
| "rewards/format_reward": 0.6805555721124014, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2166.291748046875, | |
| "epoch": 0.6051428571428571, | |
| "grad_norm": 1.6542304754257202, | |
| "kl": 0.4603678385416667, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.0418, | |
| "reward": 0.4779660537218054, | |
| "reward_std": 0.8410133322079977, | |
| "rewards/cosine_scaled_reward": -0.12907253950834274, | |
| "rewards/format_reward": 0.7361111293236414, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1462.4166971842449, | |
| "epoch": 0.6068571428571429, | |
| "grad_norm": 1.2300294637680054, | |
| "kl": 0.23640950520833334, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0818, | |
| "reward": 1.0592672030131023, | |
| "reward_std": 0.8312759300072988, | |
| "rewards/cosine_scaled_reward": 0.06435581048329671, | |
| "rewards/format_reward": 0.9305555721124014, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1911.52783203125, | |
| "epoch": 0.6085714285714285, | |
| "grad_norm": 167.4272003173828, | |
| "kl": 1.8644205729166667, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.2472, | |
| "reward": 0.5436144371827444, | |
| "reward_std": 0.5198759684960047, | |
| "rewards/cosine_scaled_reward": -0.11708168437083562, | |
| "rewards/format_reward": 0.7777778009573618, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1726.8611653645833, | |
| "epoch": 0.6102857142857143, | |
| "grad_norm": 1.9463508129119873, | |
| "kl": 0.3705240885416667, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0962, | |
| "reward": 0.47158247729142505, | |
| "reward_std": 0.6186495224634806, | |
| "rewards/cosine_scaled_reward": -0.1947643185655276, | |
| "rewards/format_reward": 0.8611111243565878, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1617.7916870117188, | |
| "epoch": 0.612, | |
| "grad_norm": 1.844290018081665, | |
| "kl": 0.3195393880208333, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": -0.033, | |
| "reward": 0.9479092061519623, | |
| "reward_std": 0.6967554092407227, | |
| "rewards/cosine_scaled_reward": 0.015621266948680082, | |
| "rewards/format_reward": 0.9166666766007742, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1857.1111653645833, | |
| "epoch": 0.6137142857142858, | |
| "grad_norm": 0.8043171167373657, | |
| "kl": 0.3922932942708333, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.079, | |
| "reward": 0.6832286963860194, | |
| "reward_std": 0.7118935982386271, | |
| "rewards/cosine_scaled_reward": -0.026441220194101334, | |
| "rewards/format_reward": 0.7361111342906952, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2079.4445597330728, | |
| "epoch": 0.6154285714285714, | |
| "grad_norm": 2.380119562149048, | |
| "kl": 0.4624837239583333, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.1016, | |
| "reward": 0.4707499146461487, | |
| "reward_std": 0.7072432239850363, | |
| "rewards/cosine_scaled_reward": -0.11879172051946323, | |
| "rewards/format_reward": 0.7083333482344946, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1659.2639465332031, | |
| "epoch": 0.6171428571428571, | |
| "grad_norm": 1.4301224946975708, | |
| "kl": 0.3580729166666667, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.1301, | |
| "reward": 0.6515451321999232, | |
| "reward_std": 0.6147226591904958, | |
| "rewards/cosine_scaled_reward": -0.09089410801728566, | |
| "rewards/format_reward": 0.8333333532015482, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1458.8750406901042, | |
| "epoch": 0.6188571428571429, | |
| "grad_norm": 0.6013662219047546, | |
| "kl": 0.2197265625, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0238, | |
| "reward": 0.8060784737269083, | |
| "reward_std": 0.762891560792923, | |
| "rewards/cosine_scaled_reward": -0.04140521648029486, | |
| "rewards/format_reward": 0.8888889153798422, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2134.1667683919272, | |
| "epoch": 0.6205714285714286, | |
| "grad_norm": 1343.1533203125, | |
| "kl": 6.80322265625, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.4339, | |
| "reward": 0.4417159979542096, | |
| "reward_std": 0.6921458492676417, | |
| "rewards/cosine_scaled_reward": -0.140253125767534, | |
| "rewards/format_reward": 0.7222222338120142, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2177.1250610351562, | |
| "epoch": 0.6222857142857143, | |
| "grad_norm": 1.3377630710601807, | |
| "kl": 0.443359375, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0332, | |
| "reward": 0.7354927112658819, | |
| "reward_std": 0.9386909107367197, | |
| "rewards/cosine_scaled_reward": -0.028086995395521324, | |
| "rewards/format_reward": 0.7916666766007742, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1922.2083943684895, | |
| "epoch": 0.624, | |
| "grad_norm": 1.409002423286438, | |
| "kl": 0.4161783854166667, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0728, | |
| "reward": 0.5048904443780581, | |
| "reward_std": 0.9239566624164581, | |
| "rewards/cosine_scaled_reward": -0.12255478355412681, | |
| "rewards/format_reward": 0.750000019868215, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1996.4583943684895, | |
| "epoch": 0.6257142857142857, | |
| "grad_norm": 22.69778823852539, | |
| "kl": 0.4385579427083333, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0747, | |
| "reward": 0.49555062254269916, | |
| "reward_std": 0.6983461529016495, | |
| "rewards/cosine_scaled_reward": -0.1550024850293994, | |
| "rewards/format_reward": 0.8055555621782938, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2252.138956705729, | |
| "epoch": 0.6274285714285714, | |
| "grad_norm": 1.1316742897033691, | |
| "kl": 0.5764973958333334, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.1063, | |
| "reward": 0.6296257488429546, | |
| "reward_std": 0.6832745472590128, | |
| "rewards/cosine_scaled_reward": -0.04629825303951899, | |
| "rewards/format_reward": 0.722222238779068, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1691.9861653645833, | |
| "epoch": 0.6291428571428571, | |
| "grad_norm": 2.4533822536468506, | |
| "kl": 0.2789713541666667, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.181, | |
| "reward": 0.8881241927544276, | |
| "reward_std": 0.8296632667382559, | |
| "rewards/cosine_scaled_reward": 0.0273954135676225, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2092.166748046875, | |
| "epoch": 0.6308571428571429, | |
| "grad_norm": 1.5362274646759033, | |
| "kl": 0.3756510416666667, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0588, | |
| "reward": 0.13919082408150038, | |
| "reward_std": 0.4670659353335698, | |
| "rewards/cosine_scaled_reward": -0.2706823796033859, | |
| "rewards/format_reward": 0.6805555721124014, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1440.6944986979167, | |
| "epoch": 0.6325714285714286, | |
| "grad_norm": 1.7778924703598022, | |
| "kl": 0.212646484375, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.04, | |
| "reward": 0.926197330156962, | |
| "reward_std": 0.6758472969134649, | |
| "rewards/cosine_scaled_reward": 0.0047653187066316605, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1670.2917175292969, | |
| "epoch": 0.6342857142857142, | |
| "grad_norm": 0.7548136711120605, | |
| "kl": 0.331787109375, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0612, | |
| "reward": 0.8845050384600958, | |
| "reward_std": 0.5586157590150833, | |
| "rewards/cosine_scaled_reward": 0.04641916851202647, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2030.9167073567708, | |
| "epoch": 0.636, | |
| "grad_norm": 1.3891069889068604, | |
| "kl": 0.288818359375, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0543, | |
| "reward": 0.6472903043031693, | |
| "reward_std": 0.6831158598264059, | |
| "rewards/cosine_scaled_reward": -0.04441040630141894, | |
| "rewards/format_reward": 0.7361111342906952, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1828.388916015625, | |
| "epoch": 0.6377142857142857, | |
| "grad_norm": 1.0828273296356201, | |
| "kl": 0.3238932291666667, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0689, | |
| "reward": 0.8317237993081411, | |
| "reward_std": 0.8715166846911112, | |
| "rewards/cosine_scaled_reward": 0.02697300041715304, | |
| "rewards/format_reward": 0.7777778009573618, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1877.555643717448, | |
| "epoch": 0.6394285714285715, | |
| "grad_norm": 0.45865127444267273, | |
| "kl": 0.3331197102864583, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.1121, | |
| "reward": 0.5179976920286814, | |
| "reward_std": 0.7481978138287863, | |
| "rewards/cosine_scaled_reward": -0.13683448607722917, | |
| "rewards/format_reward": 0.7916666766007742, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2111.5972900390625, | |
| "epoch": 0.6411428571428571, | |
| "grad_norm": 1.6562620401382446, | |
| "kl": 0.3701171875, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0879, | |
| "reward": 0.42803238332271576, | |
| "reward_std": 0.5924511949221293, | |
| "rewards/cosine_scaled_reward": -0.16792825407659015, | |
| "rewards/format_reward": 0.7638888955116272, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1977.513936360677, | |
| "epoch": 0.6428571428571429, | |
| "grad_norm": 1.7319481372833252, | |
| "kl": 0.36328125, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0465, | |
| "reward": 0.48163893694678944, | |
| "reward_std": 0.8631226420402527, | |
| "rewards/cosine_scaled_reward": -0.1480694351096948, | |
| "rewards/format_reward": 0.7777778009573618, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1789.1389567057292, | |
| "epoch": 0.6445714285714286, | |
| "grad_norm": 1.1992692947387695, | |
| "kl": 0.37255859375, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0782, | |
| "reward": 0.743633359670639, | |
| "reward_std": 0.6923061857620875, | |
| "rewards/cosine_scaled_reward": -0.030961106220881145, | |
| "rewards/format_reward": 0.8055555721124014, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1978.77783203125, | |
| "epoch": 0.6462857142857142, | |
| "grad_norm": 1.6979951858520508, | |
| "kl": 0.3738606770833333, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.1507, | |
| "reward": 0.6574459498127302, | |
| "reward_std": 0.6552928984165192, | |
| "rewards/cosine_scaled_reward": -0.05322146819283565, | |
| "rewards/format_reward": 0.7638889054457346, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1818.0694986979167, | |
| "epoch": 0.648, | |
| "grad_norm": 1.8327480554580688, | |
| "kl": 0.29296875, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.1536, | |
| "reward": 0.3741883759697278, | |
| "reward_std": 0.6344276517629623, | |
| "rewards/cosine_scaled_reward": -0.2226280445853869, | |
| "rewards/format_reward": 0.8194444676240286, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1630.0972696940105, | |
| "epoch": 0.6497142857142857, | |
| "grad_norm": 1.1835570335388184, | |
| "kl": 0.3250325520833333, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.1165, | |
| "reward": 0.8845698088407516, | |
| "reward_std": 0.7334518581628799, | |
| "rewards/cosine_scaled_reward": -0.002159561961889267, | |
| "rewards/format_reward": 0.8888889054457346, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1613.1250813802083, | |
| "epoch": 0.6514285714285715, | |
| "grad_norm": 1.1109741926193237, | |
| "kl": 0.3570963541666667, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.1097, | |
| "reward": 1.1600602467854817, | |
| "reward_std": 0.9610305726528168, | |
| "rewards/cosine_scaled_reward": 0.13558567812045416, | |
| "rewards/format_reward": 0.8888889054457346, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2391.3611450195312, | |
| "epoch": 0.6531428571428571, | |
| "grad_norm": 1.6611226797103882, | |
| "kl": 0.4931640625, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.183, | |
| "reward": 0.3325663444896539, | |
| "reward_std": 0.7603594561417898, | |
| "rewards/cosine_scaled_reward": -0.16010572264591852, | |
| "rewards/format_reward": 0.6527778009573618, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1949.638936360677, | |
| "epoch": 0.6548571428571428, | |
| "grad_norm": 1.2779847383499146, | |
| "kl": 0.4043782552083333, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.1131, | |
| "reward": 0.7218312869469324, | |
| "reward_std": 0.8889608283837637, | |
| "rewards/cosine_scaled_reward": -0.021028820425271988, | |
| "rewards/format_reward": 0.7638889153798422, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2023.3750813802083, | |
| "epoch": 0.6565714285714286, | |
| "grad_norm": 1.4803309440612793, | |
| "kl": 0.4348958333333333, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0389, | |
| "reward": 0.692891001701355, | |
| "reward_std": 0.796034554640452, | |
| "rewards/cosine_scaled_reward": -0.04244339714447657, | |
| "rewards/format_reward": 0.7777778009573618, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2347.9444986979165, | |
| "epoch": 0.6582857142857143, | |
| "grad_norm": 1.2414591312408447, | |
| "kl": 0.576171875, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0537, | |
| "reward": 0.31488546387602884, | |
| "reward_std": 0.559194877743721, | |
| "rewards/cosine_scaled_reward": -0.1967239367465178, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2299.7222696940103, | |
| "epoch": 0.66, | |
| "grad_norm": 2.55776309967041, | |
| "kl": 0.5263671875, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0029, | |
| "reward": 0.4423376147945722, | |
| "reward_std": 0.6057499647140503, | |
| "rewards/cosine_scaled_reward": -0.1191089612742265, | |
| "rewards/format_reward": 0.6805555721124014, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2466.6528523763022, | |
| "epoch": 0.6617142857142857, | |
| "grad_norm": 1.4592732191085815, | |
| "kl": 0.4833984375, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.155, | |
| "reward": 0.14802361528078714, | |
| "reward_std": 0.5865804652372996, | |
| "rewards/cosine_scaled_reward": -0.2107104236880938, | |
| "rewards/format_reward": 0.5694444626569748, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2103.9583943684897, | |
| "epoch": 0.6634285714285715, | |
| "grad_norm": 0.7224381566047668, | |
| "kl": 0.3951822916666667, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0729, | |
| "reward": 0.7492095430692037, | |
| "reward_std": 0.681890199581782, | |
| "rewards/cosine_scaled_reward": 0.027382536480824154, | |
| "rewards/format_reward": 0.6944444676240286, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1462.2361348470051, | |
| "epoch": 0.6651428571428571, | |
| "grad_norm": 0.9537621736526489, | |
| "kl": 0.18473307291666666, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0658, | |
| "reward": 0.9828118880589803, | |
| "reward_std": 0.7033060888449351, | |
| "rewards/cosine_scaled_reward": 0.04001703610022863, | |
| "rewards/format_reward": 0.902777781089147, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2040.6667582194011, | |
| "epoch": 0.6668571428571428, | |
| "grad_norm": 0.7157368659973145, | |
| "kl": 0.4547526041666667, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.1088, | |
| "reward": 0.6306080569823583, | |
| "reward_std": 0.5829611023267111, | |
| "rewards/cosine_scaled_reward": -0.024973766257365543, | |
| "rewards/format_reward": 0.6805555721124014, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1828.0972696940105, | |
| "epoch": 0.6685714285714286, | |
| "grad_norm": 0.7172399759292603, | |
| "kl": 0.3055013020833333, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0354, | |
| "reward": 0.8313513994216919, | |
| "reward_std": 0.7012214362621307, | |
| "rewards/cosine_scaled_reward": -0.021824313948551815, | |
| "rewards/format_reward": 0.875000019868215, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2106.8472493489585, | |
| "epoch": 0.6702857142857143, | |
| "grad_norm": 1.3844515085220337, | |
| "kl": 0.4137369791666667, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.1326, | |
| "reward": 0.68645707766215, | |
| "reward_std": 0.8559178411960602, | |
| "rewards/cosine_scaled_reward": -0.03177145775407553, | |
| "rewards/format_reward": 0.750000019868215, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1971.6111450195312, | |
| "epoch": 0.672, | |
| "grad_norm": 2.4108059406280518, | |
| "kl": 0.3716634114583333, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0187, | |
| "reward": 0.5283236006895701, | |
| "reward_std": 0.6501945902903875, | |
| "rewards/cosine_scaled_reward": -0.09000487873951594, | |
| "rewards/format_reward": 0.7083333532015482, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2265.7639363606772, | |
| "epoch": 0.6737142857142857, | |
| "grad_norm": 0.7517369985580444, | |
| "kl": 0.3922526041666667, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.073, | |
| "reward": 0.5178258121013641, | |
| "reward_std": 0.6695823272069296, | |
| "rewards/cosine_scaled_reward": -0.06747599689212318, | |
| "rewards/format_reward": 0.6527777910232544, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1601.4583638509114, | |
| "epoch": 0.6754285714285714, | |
| "grad_norm": 0.5745236277580261, | |
| "kl": 0.2665608723958333, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.122, | |
| "reward": 0.735952208439509, | |
| "reward_std": 0.7679448922475179, | |
| "rewards/cosine_scaled_reward": -0.0556350282082955, | |
| "rewards/format_reward": 0.847222238779068, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1680.8333943684895, | |
| "epoch": 0.6771428571428572, | |
| "grad_norm": 0.8152760863304138, | |
| "kl": 0.24751790364583334, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0047, | |
| "reward": 0.6032729422052702, | |
| "reward_std": 0.5637368957201639, | |
| "rewards/cosine_scaled_reward": -0.11503020425637563, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2343.0833740234375, | |
| "epoch": 0.6788571428571428, | |
| "grad_norm": 1.2189439535140991, | |
| "kl": 0.3642578125, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0563, | |
| "reward": 0.4998194696381688, | |
| "reward_std": 0.7090245336294174, | |
| "rewards/cosine_scaled_reward": -0.08342361201842625, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1882.5833943684895, | |
| "epoch": 0.6805714285714286, | |
| "grad_norm": 0.7444446682929993, | |
| "kl": 0.2833658854166667, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0719, | |
| "reward": 0.6090759175519148, | |
| "reward_std": 0.7299742499987284, | |
| "rewards/cosine_scaled_reward": -0.049628703544537224, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2406.555623372396, | |
| "epoch": 0.6822857142857143, | |
| "grad_norm": 1.209504246711731, | |
| "kl": 0.3766276041666667, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.1522, | |
| "reward": 0.34390421211719513, | |
| "reward_std": 0.773917888601621, | |
| "rewards/cosine_scaled_reward": -0.1336034582927823, | |
| "rewards/format_reward": 0.6111111243565878, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1526.3889261881511, | |
| "epoch": 0.684, | |
| "grad_norm": 1.2556852102279663, | |
| "kl": 0.241455078125, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0574, | |
| "reward": 1.074718529979388, | |
| "reward_std": 0.8735250234603882, | |
| "rewards/cosine_scaled_reward": 0.1068036916355292, | |
| "rewards/format_reward": 0.8611111342906952, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2166.5694986979165, | |
| "epoch": 0.6857142857142857, | |
| "grad_norm": 0.8293429613113403, | |
| "kl": 0.2623697916666667, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.1125, | |
| "reward": 0.5970514553288618, | |
| "reward_std": 0.8870691955089569, | |
| "rewards/cosine_scaled_reward": -0.06258538489540418, | |
| "rewards/format_reward": 0.7222222487131754, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1920.2361450195312, | |
| "epoch": 0.6874285714285714, | |
| "grad_norm": 2.201414108276367, | |
| "kl": 0.23396809895833334, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.2032, | |
| "reward": 0.5355731310943762, | |
| "reward_std": 0.7314170847336451, | |
| "rewards/cosine_scaled_reward": -0.1349912267178297, | |
| "rewards/format_reward": 0.8055555721124014, | |
| "step": 401 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2077.263916015625, | |
| "epoch": 0.6891428571428572, | |
| "grad_norm": 2.413097381591797, | |
| "kl": 0.3064778645833333, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.1142, | |
| "reward": 0.1725784676770369, | |
| "reward_std": 0.5135719130436579, | |
| "rewards/cosine_scaled_reward": -0.26787744959195453, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1997.1945190429688, | |
| "epoch": 0.6908571428571428, | |
| "grad_norm": 5.780155658721924, | |
| "kl": 0.24544270833333334, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.1367, | |
| "reward": 0.6813353076577187, | |
| "reward_std": 0.6979800562063853, | |
| "rewards/cosine_scaled_reward": -0.034332344929377236, | |
| "rewards/format_reward": 0.750000019868215, | |
| "step": 403 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2016.6666870117188, | |
| "epoch": 0.6925714285714286, | |
| "grad_norm": 3.2379355430603027, | |
| "kl": 0.2771809895833333, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.1152, | |
| "reward": 0.4048723876476288, | |
| "reward_std": 0.49137820800145465, | |
| "rewards/cosine_scaled_reward": -0.1517304852604866, | |
| "rewards/format_reward": 0.7083333631356558, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2269.0973103841147, | |
| "epoch": 0.6942857142857143, | |
| "grad_norm": 4.353272914886475, | |
| "kl": 0.3338216145833333, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.1358, | |
| "reward": 0.3609397957722346, | |
| "reward_std": 0.7847002049287161, | |
| "rewards/cosine_scaled_reward": -0.1598078909640511, | |
| "rewards/format_reward": 0.6805555721124014, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1751.541727701823, | |
| "epoch": 0.696, | |
| "grad_norm": 3.1557979583740234, | |
| "kl": 0.24625651041666666, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": -0.0139, | |
| "reward": 1.0194816986719768, | |
| "reward_std": 0.855416273077329, | |
| "rewards/cosine_scaled_reward": 0.07918529336651166, | |
| "rewards/format_reward": 0.8611111144224802, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1928.805643717448, | |
| "epoch": 0.6977142857142857, | |
| "grad_norm": 5.042448997497559, | |
| "kl": 0.24934895833333334, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0138, | |
| "reward": 0.555758461356163, | |
| "reward_std": 0.8657824893792471, | |
| "rewards/cosine_scaled_reward": -0.07628745750601713, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 407 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1522.152811686198, | |
| "epoch": 0.6994285714285714, | |
| "grad_norm": 1.781553864479065, | |
| "kl": 0.21044921875, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0591, | |
| "reward": 1.13477690021197, | |
| "reward_std": 0.6563934882481893, | |
| "rewards/cosine_scaled_reward": 0.1159995732208093, | |
| "rewards/format_reward": 0.9027777910232544, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1659.1250813802083, | |
| "epoch": 0.7011428571428572, | |
| "grad_norm": 96.76517486572266, | |
| "kl": 0.711181640625, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.1656, | |
| "reward": 0.9782565534114838, | |
| "reward_std": 0.890614777803421, | |
| "rewards/cosine_scaled_reward": 0.07246161034951608, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 409 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1625.138916015625, | |
| "epoch": 0.7028571428571428, | |
| "grad_norm": 16.717042922973633, | |
| "kl": 0.3666178385416667, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0587, | |
| "reward": 0.5817512522141138, | |
| "reward_std": 0.7319327294826508, | |
| "rewards/cosine_scaled_reward": -0.12579103155682483, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1689.541727701823, | |
| "epoch": 0.7045714285714286, | |
| "grad_norm": 218.95281982421875, | |
| "kl": 1.294921875, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.1322, | |
| "reward": 0.5819959590832392, | |
| "reward_std": 0.557060182094574, | |
| "rewards/cosine_scaled_reward": -0.13955759505430856, | |
| "rewards/format_reward": 0.8611111243565878, | |
| "step": 411 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1785.652811686198, | |
| "epoch": 0.7062857142857143, | |
| "grad_norm": 344.584228515625, | |
| "kl": 4.371419270833333, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.1837, | |
| "reward": 0.506547600030899, | |
| "reward_std": 0.5975389977296194, | |
| "rewards/cosine_scaled_reward": -0.15644842634598413, | |
| "rewards/format_reward": 0.819444457689921, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1614.0000406901042, | |
| "epoch": 0.708, | |
| "grad_norm": 319.2081298828125, | |
| "kl": 2.5314127604166665, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.2332, | |
| "reward": 0.7054447730382284, | |
| "reward_std": 0.8039397050937017, | |
| "rewards/cosine_scaled_reward": -0.08477763210733731, | |
| "rewards/format_reward": 0.8750000099341074, | |
| "step": 413 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1835.5833740234375, | |
| "epoch": 0.7097142857142857, | |
| "grad_norm": 205.16404724121094, | |
| "kl": 1.4137369791666667, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.1652, | |
| "reward": 0.5560694153731068, | |
| "reward_std": 0.6995243926843008, | |
| "rewards/cosine_scaled_reward": -0.09696529805660248, | |
| "rewards/format_reward": 0.750000019868215, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1755.3611857096355, | |
| "epoch": 0.7114285714285714, | |
| "grad_norm": 5.430647373199463, | |
| "kl": 0.3307291666666667, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0615, | |
| "reward": 0.4476342163980007, | |
| "reward_std": 0.6154121855894724, | |
| "rewards/cosine_scaled_reward": -0.16507180035114288, | |
| "rewards/format_reward": 0.7777778009573618, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2070.5278523763022, | |
| "epoch": 0.7131428571428572, | |
| "grad_norm": 4.579702854156494, | |
| "kl": 0.4453125, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.1439, | |
| "reward": 0.7116053154071172, | |
| "reward_std": 0.6810421248277029, | |
| "rewards/cosine_scaled_reward": -0.03308624650041262, | |
| "rewards/format_reward": 0.7777778009573618, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1932.1527913411458, | |
| "epoch": 0.7148571428571429, | |
| "grad_norm": 479.8846435546875, | |
| "kl": 3.73779296875, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.2702, | |
| "reward": 0.4647614856561025, | |
| "reward_std": 0.6491421411434809, | |
| "rewards/cosine_scaled_reward": -0.14956370865305266, | |
| "rewards/format_reward": 0.7638889054457346, | |
| "step": 417 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1700.541727701823, | |
| "epoch": 0.7165714285714285, | |
| "grad_norm": 5.443795680999756, | |
| "kl": 0.4032999674479167, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0519, | |
| "reward": 0.1600899597009023, | |
| "reward_std": 0.45748260617256165, | |
| "rewards/cosine_scaled_reward": -0.3088439305623372, | |
| "rewards/format_reward": 0.777777781089147, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1669.9722696940105, | |
| "epoch": 0.7182857142857143, | |
| "grad_norm": 117.56401824951172, | |
| "kl": 1.25830078125, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.1109, | |
| "reward": 0.4580128379166126, | |
| "reward_std": 0.6389024009307226, | |
| "rewards/cosine_scaled_reward": -0.1668269212047259, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 419 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1804.3750610351562, | |
| "epoch": 0.72, | |
| "grad_norm": 3.484607458114624, | |
| "kl": 0.22444661458333334, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.1056, | |
| "reward": 0.5213985095421473, | |
| "reward_std": 0.5480342656373978, | |
| "rewards/cosine_scaled_reward": -0.19068964570760727, | |
| "rewards/format_reward": 0.9027777910232544, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1636.291727701823, | |
| "epoch": 0.7217142857142858, | |
| "grad_norm": 3.8642489910125732, | |
| "kl": 0.24129231770833334, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0585, | |
| "reward": 0.9361005599300066, | |
| "reward_std": 0.6863935093084971, | |
| "rewards/cosine_scaled_reward": 0.05832804015759999, | |
| "rewards/format_reward": 0.819444457689921, | |
| "step": 421 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1438.3750712076824, | |
| "epoch": 0.7234285714285714, | |
| "grad_norm": 4.483888626098633, | |
| "kl": 0.3055826822916667, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.178, | |
| "reward": 0.5787845104932785, | |
| "reward_std": 0.5376511911551157, | |
| "rewards/cosine_scaled_reward": -0.161996653303504, | |
| "rewards/format_reward": 0.9027777910232544, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1796.5555928548176, | |
| "epoch": 0.7251428571428571, | |
| "grad_norm": 2.4398016929626465, | |
| "kl": 0.25048828125, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.1271, | |
| "reward": 0.6371371994415919, | |
| "reward_std": 0.6434228718280792, | |
| "rewards/cosine_scaled_reward": -0.084209187887609, | |
| "rewards/format_reward": 0.8055555621782938, | |
| "step": 423 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1877.5000610351562, | |
| "epoch": 0.7268571428571429, | |
| "grad_norm": 1.8450111150741577, | |
| "kl": 0.2255859375, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.1473, | |
| "reward": 0.5857030351956686, | |
| "reward_std": 0.8147616585095724, | |
| "rewards/cosine_scaled_reward": -0.14464849730332693, | |
| "rewards/format_reward": 0.875000019868215, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1878.8472696940105, | |
| "epoch": 0.7285714285714285, | |
| "grad_norm": 2.389019012451172, | |
| "kl": 0.3597005208333333, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0622, | |
| "reward": 0.8244462410608927, | |
| "reward_std": 0.9060603678226471, | |
| "rewards/cosine_scaled_reward": 0.016389766087134678, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1429.5000610351562, | |
| "epoch": 0.7302857142857143, | |
| "grad_norm": 96.92652893066406, | |
| "kl": 1.4789225260416667, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0266, | |
| "reward": 1.0368034442265828, | |
| "reward_std": 0.7116367469231287, | |
| "rewards/cosine_scaled_reward": 0.060068391263484955, | |
| "rewards/format_reward": 0.9166666766007742, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2004.152852376302, | |
| "epoch": 0.732, | |
| "grad_norm": 2.5690948963165283, | |
| "kl": 0.3536783854166667, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0126, | |
| "reward": 0.5105084627866745, | |
| "reward_std": 0.7548707127571106, | |
| "rewards/cosine_scaled_reward": -0.1336346616347631, | |
| "rewards/format_reward": 0.7777778009573618, | |
| "step": 427 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1632.7361551920574, | |
| "epoch": 0.7337142857142858, | |
| "grad_norm": 1.8503578901290894, | |
| "kl": 0.15877278645833334, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0097, | |
| "reward": 1.0049935777982075, | |
| "reward_std": 0.6189515888690948, | |
| "rewards/cosine_scaled_reward": 0.05110790518422922, | |
| "rewards/format_reward": 0.9027777910232544, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1933.6389567057292, | |
| "epoch": 0.7354285714285714, | |
| "grad_norm": 4.138815879821777, | |
| "kl": 0.2600911458333333, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": -0.0049, | |
| "reward": 0.5391122822960218, | |
| "reward_std": 0.6431433210770289, | |
| "rewards/cosine_scaled_reward": -0.14711054414510727, | |
| "rewards/format_reward": 0.8333333532015482, | |
| "step": 429 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1679.250020345052, | |
| "epoch": 0.7371428571428571, | |
| "grad_norm": 2.0407519340515137, | |
| "kl": 0.21101888020833334, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.1197, | |
| "reward": 0.7227591512103876, | |
| "reward_std": 0.6118214925130209, | |
| "rewards/cosine_scaled_reward": -0.0483426662782828, | |
| "rewards/format_reward": 0.819444457689921, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1924.3056233723958, | |
| "epoch": 0.7388571428571429, | |
| "grad_norm": 9.671761512756348, | |
| "kl": 0.657958984375, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0715, | |
| "reward": 0.6376418769359589, | |
| "reward_std": 0.7228594521681467, | |
| "rewards/cosine_scaled_reward": -0.06312351549665134, | |
| "rewards/format_reward": 0.7638889054457346, | |
| "step": 431 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1668.4583536783855, | |
| "epoch": 0.7405714285714285, | |
| "grad_norm": 5.288298606872559, | |
| "kl": 0.21541341145833334, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.1874, | |
| "reward": 0.7793505688508352, | |
| "reward_std": 0.8537674943606058, | |
| "rewards/cosine_scaled_reward": -0.026991385345657665, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1944.0417582194011, | |
| "epoch": 0.7422857142857143, | |
| "grad_norm": 1.1715089082717896, | |
| "kl": 0.25537109375, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0625, | |
| "reward": 0.6889009128014246, | |
| "reward_std": 0.6274515042702357, | |
| "rewards/cosine_scaled_reward": -0.07916065181295077, | |
| "rewards/format_reward": 0.847222238779068, | |
| "step": 433 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1384.777811686198, | |
| "epoch": 0.744, | |
| "grad_norm": 1.3403633832931519, | |
| "kl": 0.1846923828125, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0381, | |
| "reward": 0.4820244958003362, | |
| "reward_std": 0.546803817152977, | |
| "rewards/cosine_scaled_reward": -0.21037665382027626, | |
| "rewards/format_reward": 0.9027777910232544, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2111.3056131998696, | |
| "epoch": 0.7457142857142857, | |
| "grad_norm": 3.8649866580963135, | |
| "kl": 0.3348795572916667, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.057, | |
| "reward": 0.6068959900488456, | |
| "reward_std": 0.7177309989929199, | |
| "rewards/cosine_scaled_reward": -0.04377423319965601, | |
| "rewards/format_reward": 0.6944444676240286, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1758.0833638509114, | |
| "epoch": 0.7474285714285714, | |
| "grad_norm": 3.089583158493042, | |
| "kl": 0.2522989908854167, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0443, | |
| "reward": 0.7974027544260025, | |
| "reward_std": 0.7327479670445124, | |
| "rewards/cosine_scaled_reward": -0.031854174410303436, | |
| "rewards/format_reward": 0.8611111243565878, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1980.6389567057292, | |
| "epoch": 0.7491428571428571, | |
| "grad_norm": 18.65666961669922, | |
| "kl": 0.695556640625, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0852, | |
| "reward": 0.4337080344557762, | |
| "reward_std": 0.6298949966828028, | |
| "rewards/cosine_scaled_reward": -0.18592377689977488, | |
| "rewards/format_reward": 0.8055555721124014, | |
| "step": 437 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1867.9305928548176, | |
| "epoch": 0.7508571428571429, | |
| "grad_norm": 72.61431121826172, | |
| "kl": 1.0759684244791667, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0965, | |
| "reward": 0.9961026844878992, | |
| "reward_std": 0.660401776432991, | |
| "rewards/cosine_scaled_reward": 0.12305134286483128, | |
| "rewards/format_reward": 0.7500000099341074, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1852.02783203125, | |
| "epoch": 0.7525714285714286, | |
| "grad_norm": 3.921339273452759, | |
| "kl": 0.4046223958333333, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0221, | |
| "reward": 0.5507546067237854, | |
| "reward_std": 0.5439638768633207, | |
| "rewards/cosine_scaled_reward": -0.14823382099469504, | |
| "rewards/format_reward": 0.847222238779068, | |
| "step": 439 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1705.0000813802083, | |
| "epoch": 0.7542857142857143, | |
| "grad_norm": 17.273841857910156, | |
| "kl": 0.8013102213541666, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.1405, | |
| "reward": 0.9204966376225153, | |
| "reward_std": 0.5747697303692499, | |
| "rewards/cosine_scaled_reward": 0.06441497895866632, | |
| "rewards/format_reward": 0.7916666766007742, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1872.4722696940105, | |
| "epoch": 0.756, | |
| "grad_norm": 3.2660210132598877, | |
| "kl": 0.3829752604166667, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0423, | |
| "reward": 0.6346297065416971, | |
| "reward_std": 0.687380443016688, | |
| "rewards/cosine_scaled_reward": -0.12018515511105458, | |
| "rewards/format_reward": 0.875000019868215, | |
| "step": 441 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1443.638921101888, | |
| "epoch": 0.7577142857142857, | |
| "grad_norm": 2.511837959289551, | |
| "kl": 0.26324462890625, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0736, | |
| "reward": 0.7940039783716202, | |
| "reward_std": 0.7034708857536316, | |
| "rewards/cosine_scaled_reward": -0.04049801888565222, | |
| "rewards/format_reward": 0.875000019868215, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1466.9167073567708, | |
| "epoch": 0.7594285714285715, | |
| "grad_norm": 2.3003296852111816, | |
| "kl": 0.1939697265625, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": -0.0187, | |
| "reward": 0.9426108794286847, | |
| "reward_std": 0.6795027256011963, | |
| "rewards/cosine_scaled_reward": 0.0338054308667779, | |
| "rewards/format_reward": 0.875000019868215, | |
| "step": 443 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1648.513936360677, | |
| "epoch": 0.7611428571428571, | |
| "grad_norm": 797.2700805664062, | |
| "kl": 4.786539713541667, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.4395, | |
| "reward": 0.6396536206205686, | |
| "reward_std": 0.5960048884153366, | |
| "rewards/cosine_scaled_reward": -0.14545098366215825, | |
| "rewards/format_reward": 0.9305555621782938, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2130.7778116861978, | |
| "epoch": 0.7628571428571429, | |
| "grad_norm": 3.519359827041626, | |
| "kl": 0.2513020833333333, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0925, | |
| "reward": 0.6113192215561867, | |
| "reward_std": 0.8141778856515884, | |
| "rewards/cosine_scaled_reward": -0.06934040101865928, | |
| "rewards/format_reward": 0.750000019868215, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1757.9861450195312, | |
| "epoch": 0.7645714285714286, | |
| "grad_norm": 1.927884578704834, | |
| "kl": 0.2086181640625, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0041, | |
| "reward": 0.8222450017929077, | |
| "reward_std": 0.7467124362786611, | |
| "rewards/cosine_scaled_reward": -0.026377519592642784, | |
| "rewards/format_reward": 0.8750000099341074, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1813.4305826822917, | |
| "epoch": 0.7662857142857142, | |
| "grad_norm": 5.447101593017578, | |
| "kl": 0.20865885416666666, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0173, | |
| "reward": 0.9355805069208145, | |
| "reward_std": 0.5909775694211324, | |
| "rewards/cosine_scaled_reward": 0.030290252218643825, | |
| "rewards/format_reward": 0.875000019868215, | |
| "step": 447 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1639.1111246744792, | |
| "epoch": 0.768, | |
| "grad_norm": 3.9583089351654053, | |
| "kl": 0.20918782552083334, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.1673, | |
| "reward": 0.8595867802699407, | |
| "reward_std": 0.6513732473055521, | |
| "rewards/cosine_scaled_reward": 0.006182260811328888, | |
| "rewards/format_reward": 0.847222238779068, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2084.9722900390625, | |
| "epoch": 0.7697142857142857, | |
| "grad_norm": 7.27739953994751, | |
| "kl": 0.3561197916666667, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0568, | |
| "reward": 0.4823018138607343, | |
| "reward_std": 0.5893634855747223, | |
| "rewards/cosine_scaled_reward": -0.11996020376682281, | |
| "rewards/format_reward": 0.7222222487131754, | |
| "step": 449 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1505.3611755371094, | |
| "epoch": 0.7714285714285715, | |
| "grad_norm": 53.90168380737305, | |
| "kl": 0.8046875, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": -0.001, | |
| "reward": 0.5860073566436768, | |
| "reward_std": 0.5414268672466278, | |
| "rewards/cosine_scaled_reward": -0.15838521718978882, | |
| "rewards/format_reward": 0.9027777910232544, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2174.0833943684897, | |
| "epoch": 0.7731428571428571, | |
| "grad_norm": 5.571269512176514, | |
| "kl": 0.6153971354166666, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.1584, | |
| "reward": 0.4900339717666308, | |
| "reward_std": 0.7578103343645731, | |
| "rewards/cosine_scaled_reward": -0.12303857877850533, | |
| "rewards/format_reward": 0.7361111342906952, | |
| "step": 451 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1577.4583943684895, | |
| "epoch": 0.7748571428571429, | |
| "grad_norm": 1.526145339012146, | |
| "kl": 0.16603597005208334, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0292, | |
| "reward": 0.7762962381045023, | |
| "reward_std": 0.8878602981567383, | |
| "rewards/cosine_scaled_reward": -0.08407412593563397, | |
| "rewards/format_reward": 0.944444457689921, | |
| "step": 452 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1635.6944986979167, | |
| "epoch": 0.7765714285714286, | |
| "grad_norm": 7.803213596343994, | |
| "kl": 0.3418782552083333, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0271, | |
| "reward": 0.5607372919718424, | |
| "reward_std": 0.5661151061455408, | |
| "rewards/cosine_scaled_reward": -0.15018692115942636, | |
| "rewards/format_reward": 0.8611111243565878, | |
| "step": 453 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1415.7778422037761, | |
| "epoch": 0.7782857142857142, | |
| "grad_norm": 2.2922298908233643, | |
| "kl": 0.15354410807291666, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": -0.0128, | |
| "reward": 1.2998482882976532, | |
| "reward_std": 0.9970080256462097, | |
| "rewards/cosine_scaled_reward": 0.21242412373734018, | |
| "rewards/format_reward": 0.875000019868215, | |
| "step": 454 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1734.6944986979167, | |
| "epoch": 0.78, | |
| "grad_norm": 1.3200474977493286, | |
| "kl": 0.21321614583333334, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0444, | |
| "reward": 0.6947739571332932, | |
| "reward_std": 0.6087273160616556, | |
| "rewards/cosine_scaled_reward": -0.08316858857870102, | |
| "rewards/format_reward": 0.8611111243565878, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1789.3611450195312, | |
| "epoch": 0.7817142857142857, | |
| "grad_norm": 14.790578842163086, | |
| "kl": 0.3724772135416667, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0723, | |
| "reward": 0.9045670529206594, | |
| "reward_std": 0.7356864313284556, | |
| "rewards/cosine_scaled_reward": 0.014783527702093124, | |
| "rewards/format_reward": 0.875000019868215, | |
| "step": 456 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1688.6111857096355, | |
| "epoch": 0.7834285714285715, | |
| "grad_norm": 2.2542686462402344, | |
| "kl": 0.24503580729166666, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.1015, | |
| "reward": 0.8334505259990692, | |
| "reward_std": 0.57002954185009, | |
| "rewards/cosine_scaled_reward": -0.03466361885269483, | |
| "rewards/format_reward": 0.9027777910232544, | |
| "step": 457 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1607.9722493489583, | |
| "epoch": 0.7851428571428571, | |
| "grad_norm": 3.5625736713409424, | |
| "kl": 0.26171875, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0333, | |
| "reward": 0.6640358219544092, | |
| "reward_std": 0.6666950782140096, | |
| "rewards/cosine_scaled_reward": -0.09159320111696918, | |
| "rewards/format_reward": 0.8472222288449606, | |
| "step": 458 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1716.3472900390625, | |
| "epoch": 0.7868571428571428, | |
| "grad_norm": 4.804265022277832, | |
| "kl": 0.2941080729166667, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.1308, | |
| "reward": 0.8736765384674072, | |
| "reward_std": 0.6609072983264923, | |
| "rewards/cosine_scaled_reward": 0.013227129975954691, | |
| "rewards/format_reward": 0.847222238779068, | |
| "step": 459 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1981.27783203125, | |
| "epoch": 0.7885714285714286, | |
| "grad_norm": 4.590798377990723, | |
| "kl": 0.451171875, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0783, | |
| "reward": 0.5861403544743856, | |
| "reward_std": 0.6631839474042257, | |
| "rewards/cosine_scaled_reward": -0.13748539076186717, | |
| "rewards/format_reward": 0.8611111243565878, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1629.5000610351562, | |
| "epoch": 0.7902857142857143, | |
| "grad_norm": 3.229027271270752, | |
| "kl": 0.3135579427083333, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0158, | |
| "reward": 0.6585318843523661, | |
| "reward_std": 0.6985587080319723, | |
| "rewards/cosine_scaled_reward": -0.10823406899968784, | |
| "rewards/format_reward": 0.8750000099341074, | |
| "step": 461 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1465.2222900390625, | |
| "epoch": 0.792, | |
| "grad_norm": 1.0727124214172363, | |
| "kl": 0.1905517578125, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0638, | |
| "reward": 0.7913508663574854, | |
| "reward_std": 0.815668652455012, | |
| "rewards/cosine_scaled_reward": -0.04876901054133972, | |
| "rewards/format_reward": 0.8888889054457346, | |
| "step": 462 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1590.6944885253906, | |
| "epoch": 0.7937142857142857, | |
| "grad_norm": 0.9051072597503662, | |
| "kl": 0.1478271484375, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": -0.0103, | |
| "reward": 0.8648321181535721, | |
| "reward_std": 0.7620634287595749, | |
| "rewards/cosine_scaled_reward": -0.018972843885421753, | |
| "rewards/format_reward": 0.9027778009573618, | |
| "step": 463 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1846.3611450195312, | |
| "epoch": 0.7954285714285714, | |
| "grad_norm": 4.749053001403809, | |
| "kl": 0.41650390625, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0878, | |
| "reward": 0.6989162738124529, | |
| "reward_std": 0.6048727780580521, | |
| "rewards/cosine_scaled_reward": -0.06026409255961577, | |
| "rewards/format_reward": 0.819444477558136, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2484.4305826822915, | |
| "epoch": 0.7971428571428572, | |
| "grad_norm": 3.1575112342834473, | |
| "kl": 0.3430989583333333, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.1322, | |
| "reward": 0.5418286422888438, | |
| "reward_std": 0.8770113388697306, | |
| "rewards/cosine_scaled_reward": -0.09714125220974286, | |
| "rewards/format_reward": 0.7361111243565878, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1908.8333943684895, | |
| "epoch": 0.7988571428571428, | |
| "grad_norm": 3.4126105308532715, | |
| "kl": 0.2589518229166667, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.064, | |
| "reward": 0.8162196526924769, | |
| "reward_std": 0.8852183620134989, | |
| "rewards/cosine_scaled_reward": -0.0016124049822489421, | |
| "rewards/format_reward": 0.819444457689921, | |
| "step": 466 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2231.416748046875, | |
| "epoch": 0.8005714285714286, | |
| "grad_norm": 16.11284637451172, | |
| "kl": 0.5618489583333334, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.1891, | |
| "reward": 0.5065229311585426, | |
| "reward_std": 0.5748920440673828, | |
| "rewards/cosine_scaled_reward": -0.12173853317896526, | |
| "rewards/format_reward": 0.7500000099341074, | |
| "step": 467 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1403.4722493489583, | |
| "epoch": 0.8022857142857143, | |
| "grad_norm": 3.2317402362823486, | |
| "kl": 0.16902669270833334, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": -0.0056, | |
| "reward": 0.9127690196037292, | |
| "reward_std": 0.6132321407397588, | |
| "rewards/cosine_scaled_reward": -0.008893255144357681, | |
| "rewards/format_reward": 0.9305555721124014, | |
| "step": 468 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1686.402852376302, | |
| "epoch": 0.804, | |
| "grad_norm": 96.09001922607422, | |
| "kl": 0.8639322916666666, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.1148, | |
| "reward": 0.8764675855636597, | |
| "reward_std": 0.8609135548273722, | |
| "rewards/cosine_scaled_reward": -0.006210653732220332, | |
| "rewards/format_reward": 0.8888889054457346, | |
| "step": 469 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1467.5694986979167, | |
| "epoch": 0.8057142857142857, | |
| "grad_norm": 2.5543787479400635, | |
| "kl": 0.20747884114583334, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": -0.0014, | |
| "reward": 0.897119422753652, | |
| "reward_std": 0.7088388601938883, | |
| "rewards/cosine_scaled_reward": -0.030606964603066444, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1904.888916015625, | |
| "epoch": 0.8074285714285714, | |
| "grad_norm": 28.1047420501709, | |
| "kl": 0.46728515625, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.079, | |
| "reward": 0.5574485460917155, | |
| "reward_std": 0.7020039608081182, | |
| "rewards/cosine_scaled_reward": -0.13794240107138953, | |
| "rewards/format_reward": 0.8333333532015482, | |
| "step": 471 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1954.9583943684895, | |
| "epoch": 0.8091428571428572, | |
| "grad_norm": 3.3789680004119873, | |
| "kl": 0.3369140625, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0878, | |
| "reward": 0.6358366658290228, | |
| "reward_std": 0.6697615136702856, | |
| "rewards/cosine_scaled_reward": -0.057081678261359535, | |
| "rewards/format_reward": 0.750000019868215, | |
| "step": 472 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1655.9306030273438, | |
| "epoch": 0.8108571428571428, | |
| "grad_norm": 1.9385606050491333, | |
| "kl": 0.262451171875, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": -0.0096, | |
| "reward": 0.8060556352138519, | |
| "reward_std": 0.6168865412473679, | |
| "rewards/cosine_scaled_reward": -0.05530552690227827, | |
| "rewards/format_reward": 0.9166666766007742, | |
| "step": 473 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1728.291727701823, | |
| "epoch": 0.8125714285714286, | |
| "grad_norm": 6136.6171875, | |
| "kl": 71.012939453125, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 3.9357, | |
| "reward": 0.916077122092247, | |
| "reward_std": 0.5927824179331461, | |
| "rewards/cosine_scaled_reward": 0.02053854987025261, | |
| "rewards/format_reward": 0.875000019868215, | |
| "step": 474 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1824.5139770507812, | |
| "epoch": 0.8142857142857143, | |
| "grad_norm": 2.5220632553100586, | |
| "kl": 0.243408203125, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0095, | |
| "reward": 0.6059877965599298, | |
| "reward_std": 0.7628039022286733, | |
| "rewards/cosine_scaled_reward": -0.10672833397984505, | |
| "rewards/format_reward": 0.819444457689921, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1742.0694783528645, | |
| "epoch": 0.816, | |
| "grad_norm": 4.123868465423584, | |
| "kl": 0.31689453125, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.1004, | |
| "reward": 0.4524025088176131, | |
| "reward_std": 0.5874565740426382, | |
| "rewards/cosine_scaled_reward": -0.16268765678008398, | |
| "rewards/format_reward": 0.7777777910232544, | |
| "step": 476 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1744.6806233723958, | |
| "epoch": 0.8177142857142857, | |
| "grad_norm": 5.533082008361816, | |
| "kl": 0.20548502604166666, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.1574, | |
| "reward": 0.775577150285244, | |
| "reward_std": 0.6082544376452764, | |
| "rewards/cosine_scaled_reward": -0.028878106425205868, | |
| "rewards/format_reward": 0.8333333532015482, | |
| "step": 477 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1754.7222798665364, | |
| "epoch": 0.8194285714285714, | |
| "grad_norm": 2.4026851654052734, | |
| "kl": 0.3175455729166667, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.044, | |
| "reward": 0.5808839599291483, | |
| "reward_std": 0.8224713504314423, | |
| "rewards/cosine_scaled_reward": -0.11928024825950463, | |
| "rewards/format_reward": 0.8194444676240286, | |
| "step": 478 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1709.3750406901042, | |
| "epoch": 0.8211428571428572, | |
| "grad_norm": 3.1133224964141846, | |
| "kl": 0.3463948567708333, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.038, | |
| "reward": 1.2287188371022542, | |
| "reward_std": 0.6503394469618797, | |
| "rewards/cosine_scaled_reward": 0.1699149707953135, | |
| "rewards/format_reward": 0.8888888955116272, | |
| "step": 479 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1664.8056131998699, | |
| "epoch": 0.8228571428571428, | |
| "grad_norm": 1.5718989372253418, | |
| "kl": 0.3487548828125, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0253, | |
| "reward": 0.7876748417814573, | |
| "reward_std": 0.6895910153786341, | |
| "rewards/cosine_scaled_reward": -0.015884815404812496, | |
| "rewards/format_reward": 0.819444457689921, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1771.3055826822917, | |
| "epoch": 0.8245714285714286, | |
| "grad_norm": 55.6639404296875, | |
| "kl": 1.0530598958333333, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0457, | |
| "reward": 0.8772253592809042, | |
| "reward_std": 0.6274547676245371, | |
| "rewards/cosine_scaled_reward": 0.0011126622557640076, | |
| "rewards/format_reward": 0.8750000099341074, | |
| "step": 481 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1903.8750610351562, | |
| "epoch": 0.8262857142857143, | |
| "grad_norm": 2.326770544052124, | |
| "kl": 0.19234212239583334, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.046, | |
| "reward": 0.5817524641752243, | |
| "reward_std": 0.6069566210110983, | |
| "rewards/cosine_scaled_reward": -0.1327348860601584, | |
| "rewards/format_reward": 0.8472222288449606, | |
| "step": 482 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1471.1944986979167, | |
| "epoch": 0.828, | |
| "grad_norm": 5.376038551330566, | |
| "kl": 0.2686360677083333, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.1529, | |
| "reward": 0.7287046511967977, | |
| "reward_std": 0.6021015048027039, | |
| "rewards/cosine_scaled_reward": -0.06620323099195957, | |
| "rewards/format_reward": 0.8611111243565878, | |
| "step": 483 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1324.1805826822917, | |
| "epoch": 0.8297142857142857, | |
| "grad_norm": 0.5980741381645203, | |
| "kl": 0.09529622395833333, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": -0.0071, | |
| "reward": 0.8527243783076605, | |
| "reward_std": 0.7082930107911428, | |
| "rewards/cosine_scaled_reward": -0.03891559566060702, | |
| "rewards/format_reward": 0.9305555621782938, | |
| "step": 484 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1476.013916015625, | |
| "epoch": 0.8314285714285714, | |
| "grad_norm": 1.3439418077468872, | |
| "kl": 0.12544759114583334, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": -0.0842, | |
| "reward": 0.76032821337382, | |
| "reward_std": 0.6948419213294983, | |
| "rewards/cosine_scaled_reward": -0.07816922292113304, | |
| "rewards/format_reward": 0.9166666666666666, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1530.0972493489583, | |
| "epoch": 0.8331428571428572, | |
| "grad_norm": 1.4637749195098877, | |
| "kl": 0.13285319010416666, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0431, | |
| "reward": 1.017515196154515, | |
| "reward_std": 0.60954583187898, | |
| "rewards/cosine_scaled_reward": 0.0643131285905838, | |
| "rewards/format_reward": 0.8888889054457346, | |
| "step": 486 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1948.013936360677, | |
| "epoch": 0.8348571428571429, | |
| "grad_norm": 1.7914323806762695, | |
| "kl": 0.302978515625, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0142, | |
| "reward": 0.517247294386228, | |
| "reward_std": 0.6710758606592814, | |
| "rewards/cosine_scaled_reward": -0.13720969607432684, | |
| "rewards/format_reward": 0.7916666766007742, | |
| "step": 487 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1410.1805623372395, | |
| "epoch": 0.8365714285714285, | |
| "grad_norm": 1.1928882598876953, | |
| "kl": 0.26220703125, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.084, | |
| "reward": 1.2000691179806988, | |
| "reward_std": 0.7926478882630666, | |
| "rewards/cosine_scaled_reward": 0.14170121401548386, | |
| "rewards/format_reward": 0.9166666766007742, | |
| "step": 488 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1674.0555623372395, | |
| "epoch": 0.8382857142857143, | |
| "grad_norm": 3.095184087753296, | |
| "kl": 0.20222981770833334, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0929, | |
| "reward": 0.5767870992422104, | |
| "reward_std": 0.7787861227989197, | |
| "rewards/cosine_scaled_reward": -0.15605091055234274, | |
| "rewards/format_reward": 0.8888889153798422, | |
| "step": 489 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1790.4028727213542, | |
| "epoch": 0.84, | |
| "grad_norm": 4.162797927856445, | |
| "kl": 0.3492838541666667, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": -0.0185, | |
| "reward": 0.8680712406833967, | |
| "reward_std": 0.6225861012935638, | |
| "rewards/cosine_scaled_reward": 0.031257815969487034, | |
| "rewards/format_reward": 0.8055555721124014, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1826.9444986979167, | |
| "epoch": 0.8417142857142857, | |
| "grad_norm": 2.3164162635803223, | |
| "kl": 0.14998372395833334, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0348, | |
| "reward": 0.46461812655131024, | |
| "reward_std": 0.5297000010808309, | |
| "rewards/cosine_scaled_reward": -0.20519095162550607, | |
| "rewards/format_reward": 0.875000019868215, | |
| "step": 491 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2010.1806233723958, | |
| "epoch": 0.8434285714285714, | |
| "grad_norm": 6.063572406768799, | |
| "kl": 0.2660319010416667, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": -0.0337, | |
| "reward": 0.8101351310809454, | |
| "reward_std": 0.6275952160358429, | |
| "rewards/cosine_scaled_reward": -0.053265770276387535, | |
| "rewards/format_reward": 0.9166666766007742, | |
| "step": 492 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1724.0416971842449, | |
| "epoch": 0.8451428571428572, | |
| "grad_norm": 51.09119415283203, | |
| "kl": 0.8478800455729166, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0196, | |
| "reward": 0.4897388517856598, | |
| "reward_std": 0.4803101494908333, | |
| "rewards/cosine_scaled_reward": -0.1717972457408905, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 493 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1520.638936360677, | |
| "epoch": 0.8468571428571429, | |
| "grad_norm": 3.4709181785583496, | |
| "kl": 0.2751057942708333, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": -0.0017, | |
| "reward": 0.677379826704661, | |
| "reward_std": 0.530278280377388, | |
| "rewards/cosine_scaled_reward": -0.11964342991511027, | |
| "rewards/format_reward": 0.9166666766007742, | |
| "step": 494 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1459.0694783528645, | |
| "epoch": 0.8485714285714285, | |
| "grad_norm": 1.3744151592254639, | |
| "kl": 0.1253662109375, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0702, | |
| "reward": 0.8713287462790807, | |
| "reward_std": 0.5445962051550547, | |
| "rewards/cosine_scaled_reward": -0.03655784406388799, | |
| "rewards/format_reward": 0.9444444477558136, | |
| "step": 495 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1768.3611450195312, | |
| "epoch": 0.8502857142857143, | |
| "grad_norm": 2.5198593139648438, | |
| "kl": 0.17313639322916666, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0236, | |
| "reward": 1.1546540260314941, | |
| "reward_std": 0.7593556443850199, | |
| "rewards/cosine_scaled_reward": 0.12593810757001242, | |
| "rewards/format_reward": 0.9027777910232544, | |
| "step": 496 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1813.916727701823, | |
| "epoch": 0.852, | |
| "grad_norm": 5.687215805053711, | |
| "kl": 0.2569173177083333, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.1046, | |
| "reward": 0.8248144760727882, | |
| "reward_std": 0.9260699351628622, | |
| "rewards/cosine_scaled_reward": -0.018148329108953476, | |
| "rewards/format_reward": 0.8611111243565878, | |
| "step": 497 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1948.7917073567708, | |
| "epoch": 0.8537142857142858, | |
| "grad_norm": 1.560611605644226, | |
| "kl": 0.4242350260416667, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0562, | |
| "reward": 0.47958845893541974, | |
| "reward_std": 0.6168194462855657, | |
| "rewards/cosine_scaled_reward": -0.20465023008485636, | |
| "rewards/format_reward": 0.8888889054457346, | |
| "step": 498 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1600.5972798665364, | |
| "epoch": 0.8554285714285714, | |
| "grad_norm": 12.422072410583496, | |
| "kl": 0.3626708984375, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.1101, | |
| "reward": 1.0055039525032043, | |
| "reward_std": 0.7366370757420858, | |
| "rewards/cosine_scaled_reward": 0.05830751188720266, | |
| "rewards/format_reward": 0.8888889054457346, | |
| "step": 499 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1162.1250305175781, | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 1.4971429109573364, | |
| "kl": 0.08601888020833333, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0439, | |
| "reward": 1.1824093063672383, | |
| "reward_std": 0.5583150386810303, | |
| "rewards/cosine_scaled_reward": 0.11203796043992043, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8571428571428571, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.07676086095208302, | |
| "train_runtime": 243727.0611, | |
| "train_samples_per_second": 0.148, | |
| "train_steps_per_second": 0.002 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |