{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.5714285714285714,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 2878.5416870117188,
      "epoch": 0.001142857142857143,
      "grad_norm": 0.2171134501695633,
      "kl": 0.0,
      "learning_rate": 2e-08,
      "loss": -0.0109,
      "num_tokens": 146240.0,
      "reward": 0.09066538512706757,
      "reward_std": 0.19429835677146912,
      "rewards/cosine_scaled_reward": -0.14216730743646622,
      "rewards/format_reward": 0.375,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2564.8333435058594,
      "epoch": 0.002285714285714286,
      "grad_norm": 0.27501821517944336,
      "kl": 0.0,
      "learning_rate": 4e-08,
      "loss": 0.0684,
      "num_tokens": 277074.0,
      "reward": 0.5692815706133842,
      "reward_std": 0.6707231402397156,
      "rewards/cosine_scaled_reward": 0.013807429000735283,
      "rewards/format_reward": 0.5416666772216558,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2944.854217529297,
      "epoch": 0.0034285714285714284,
      "grad_norm": 0.2801656723022461,
      "kl": 0.0006022453308105469,
      "learning_rate": 6e-08,
      "loss": 0.0777,
      "num_tokens": 426995.0,
      "reward": -0.32244681287556887,
      "reward_std": 0.3240165375173092,
      "rewards/cosine_scaled_reward": -0.2862233966588974,
      "rewards/format_reward": 0.24999999813735485,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2979.8125,
      "epoch": 0.004571428571428572,
      "grad_norm": 0.2610093951225281,
      "kl": 0.000518798828125,
      "learning_rate": 8e-08,
      "loss": 0.1125,
      "num_tokens": 577748.0,
      "reward": -0.16339990682899952,
      "reward_std": 0.43585263565182686,
      "rewards/cosine_scaled_reward": -0.23794995370553806,
      "rewards/format_reward": 0.3125,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2759.1458740234375,
      "epoch": 0.005714285714285714,
      "grad_norm": 0.2045336216688156,
      "kl": 0.00048828125,
      "learning_rate": 1e-07,
      "loss": -0.0202,
      "num_tokens": 717849.0,
      "reward": 0.541385006159544,
      "reward_std": 0.8971522003412247,
      "rewards/cosine_scaled_reward": 0.010275840759277344,
      "rewards/format_reward": 0.5208333358168602,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2596.8959350585938,
      "epoch": 0.006857142857142857,
      "grad_norm": 0.28218525648117065,
      "kl": 0.0006213188171386719,
      "learning_rate": 1.2e-07,
      "loss": 0.0694,
      "num_tokens": 850060.0,
      "reward": 0.18730801343917847,
      "reward_std": 0.7535099536180496,
      "rewards/cosine_scaled_reward": -0.1667626677080989,
      "rewards/format_reward": 0.5208333358168602,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2559.14599609375,
      "epoch": 0.008,
      "grad_norm": 0.27962586283683777,
      "kl": 0.0004992485046386719,
      "learning_rate": 1.4e-07,
      "loss": 0.0313,
      "num_tokens": 980555.0,
      "reward": 0.7750245705246925,
      "reward_std": 0.49819300323724747,
      "rewards/cosine_scaled_reward": 0.10626226477324963,
      "rewards/format_reward": 0.5624999925494194,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2791.5625610351562,
      "epoch": 0.009142857142857144,
      "grad_norm": 0.2271980494260788,
      "kl": 0.0005779266357421875,
      "learning_rate": 1.6e-07,
      "loss": 0.1349,
      "num_tokens": 1122434.0,
      "reward": 0.28142981859855354,
      "reward_std": 0.6572991460561752,
      "rewards/cosine_scaled_reward": -0.10928510129451752,
      "rewards/format_reward": 0.4999999925494194,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3071.0625610351562,
      "epoch": 0.010285714285714285,
      "grad_norm": 0.21379557251930237,
      "kl": 0.0006103515625,
      "learning_rate": 1.8e-07,
      "loss": 0.0952,
      "num_tokens": 1277807.0,
      "reward": -0.12088486924767494,
      "reward_std": 0.6939665377140045,
      "rewards/cosine_scaled_reward": -0.19585910439491272,
      "rewards/format_reward": 0.2708333283662796,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2429.5000610351562,
      "epoch": 0.011428571428571429,
      "grad_norm": 0.27965009212493896,
      "kl": 0.0006093978881835938,
      "learning_rate": 2e-07,
      "loss": 0.0617,
      "num_tokens": 1402259.0,
      "reward": 0.2807231955230236,
      "reward_std": 0.5721964091062546,
      "rewards/cosine_scaled_reward": -0.1617217343300581,
      "rewards/format_reward": 0.604166679084301,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2379.6250610351562,
      "epoch": 0.012571428571428572,
      "grad_norm": 0.389694482088089,
      "kl": 0.0004887580871582031,
      "learning_rate": 2.1999999999999998e-07,
      "loss": 0.1212,
      "num_tokens": 1524605.0,
      "reward": 0.8620961308479309,
      "reward_std": 0.8407739326357841,
      "rewards/cosine_scaled_reward": 0.09771470725536346,
      "rewards/format_reward": 0.6666666716337204,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2639.5625610351562,
      "epoch": 0.013714285714285714,
      "grad_norm": 0.20471037924289703,
      "kl": 0.0005512237548828125,
      "learning_rate": 2.4e-07,
      "loss": 0.0464,
      "num_tokens": 1659272.0,
      "reward": 0.5478887595236301,
      "reward_std": 0.8274511396884918,
      "rewards/cosine_scaled_reward": -0.017722302465699613,
      "rewards/format_reward": 0.5833333283662796,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2278.9375610351562,
      "epoch": 0.014857142857142857,
      "grad_norm": 0.30582067370414734,
      "kl": 0.0005240440368652344,
      "learning_rate": 2.6e-07,
      "loss": 0.0729,
      "num_tokens": 1776629.0,
      "reward": 0.45567256212234497,
      "reward_std": 0.7415833473205566,
      "rewards/cosine_scaled_reward": -0.07424705754965544,
      "rewards/format_reward": 0.6041666716337204,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2861.3958740234375,
      "epoch": 0.016,
      "grad_norm": 0.25264662504196167,
      "kl": 0.0006380081176757812,
      "learning_rate": 2.8e-07,
      "loss": 0.0145,
      "num_tokens": 1922556.0,
      "reward": -0.11359720956534147,
      "reward_std": 0.47907784581184387,
      "rewards/cosine_scaled_reward": -0.22346526756882668,
      "rewards/format_reward": 0.3333333283662796,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3137.7083740234375,
      "epoch": 0.017142857142857144,
      "grad_norm": 0.218108668923378,
      "kl": 0.0006160736083984375,
      "learning_rate": 3e-07,
      "loss": 0.0771,
      "num_tokens": 2081608.0,
      "reward": 0.05102095380425453,
      "reward_std": 0.49165723100304604,
      "rewards/cosine_scaled_reward": -0.07865619286894798,
      "rewards/format_reward": 0.2083333395421505,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2212.479248046875,
      "epoch": 0.018285714285714287,
      "grad_norm": 0.3894731402397156,
      "kl": 0.0005140304565429688,
      "learning_rate": 3.2e-07,
      "loss": 0.1743,
      "num_tokens": 2195313.0,
      "reward": 0.7095662355422974,
      "reward_std": 0.6108940467238426,
      "rewards/cosine_scaled_reward": 0.08394978567957878,
      "rewards/format_reward": 0.5416666641831398,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3214.9583740234375,
      "epoch": 0.019428571428571427,
      "grad_norm": 0.22356842458248138,
      "kl": 0.0006380081176757812,
      "learning_rate": 3.4000000000000003e-07,
      "loss": -0.0182,
      "num_tokens": 2357947.0,
      "reward": -0.35805825144052505,
      "reward_std": 0.27831941843032837,
      "rewards/cosine_scaled_reward": -0.29361244291067123,
      "rewards/format_reward": 0.22916666232049465,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2965.0208740234375,
      "epoch": 0.02057142857142857,
      "grad_norm": 0.2580890357494354,
      "kl": 0.0005941390991210938,
      "learning_rate": 3.6e-07,
      "loss": 0.1942,
      "num_tokens": 2508662.0,
      "reward": -0.20507963374257088,
      "reward_std": 0.4416894242167473,
      "rewards/cosine_scaled_reward": -0.23795648105442524,
      "rewards/format_reward": 0.2708333395421505,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3202.1875610351562,
      "epoch": 0.021714285714285714,
      "grad_norm": 0.19615532457828522,
      "kl": 0.0005750656127929688,
      "learning_rate": 3.7999999999999996e-07,
      "loss": 0.0641,
      "num_tokens": 2670995.0,
      "reward": 0.053066179156303406,
      "reward_std": 0.7212408185005188,
      "rewards/cosine_scaled_reward": -0.11930023087188601,
      "rewards/format_reward": 0.2916666679084301,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2715.6666870117188,
      "epoch": 0.022857142857142857,
      "grad_norm": 0.24433307349681854,
      "kl": 0.0006351470947265625,
      "learning_rate": 4e-07,
      "loss": 0.0429,
      "num_tokens": 2808961.0,
      "reward": 0.023024218156933784,
      "reward_std": 0.6361246034502983,
      "rewards/cosine_scaled_reward": -0.1968212267383933,
      "rewards/format_reward": 0.4166666679084301,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2765.8958740234375,
      "epoch": 0.024,
      "grad_norm": 0.24205811321735382,
      "kl": 0.0005364418029785156,
      "learning_rate": 4.1999999999999995e-07,
      "loss": 0.066,
      "num_tokens": 2950058.0,
      "reward": 0.03793077915906906,
      "reward_std": 0.43773240596055984,
      "rewards/cosine_scaled_reward": -0.14770127274096012,
      "rewards/format_reward": 0.33333333395421505,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3380.0833740234375,
      "epoch": 0.025142857142857144,
      "grad_norm": 0.19774502515792847,
      "kl": 0.0005664825439453125,
      "learning_rate": 4.3999999999999997e-07,
      "loss": 0.0911,
      "num_tokens": 3120072.0,
      "reward": 0.15241558849811554,
      "reward_std": 0.8311697989702225,
      "rewards/cosine_scaled_reward": -0.02795886993408203,
      "rewards/format_reward": 0.20833333767950535,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3330.9375610351562,
      "epoch": 0.026285714285714287,
      "grad_norm": 0.218012735247612,
      "kl": 0.0006666183471679688,
      "learning_rate": 4.6e-07,
      "loss": 0.0349,
      "num_tokens": 3287889.0,
      "reward": 0.24341265857219696,
      "reward_std": 0.7591935321688652,
      "rewards/cosine_scaled_reward": -0.09704366815276444,
      "rewards/format_reward": 0.4375,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2302.3959045410156,
      "epoch": 0.027428571428571427,
      "grad_norm": 0.27331000566482544,
      "kl": 0.00044727325439453125,
      "learning_rate": 4.8e-07,
      "loss": 0.0858,
      "num_tokens": 3406318.0,
      "reward": 0.6318932324647903,
      "reward_std": 0.7281809970736504,
      "rewards/cosine_scaled_reward": 0.02427995391190052,
      "rewards/format_reward": 0.5833333395421505,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2765.9375610351562,
      "epoch": 0.02857142857142857,
      "grad_norm": 0.28522786498069763,
      "kl": 0.000560760498046875,
      "learning_rate": 5e-07,
      "loss": 0.1322,
      "num_tokens": 3546859.0,
      "reward": 0.09269634401425719,
      "reward_std": 0.6214342266321182,
      "rewards/cosine_scaled_reward": -0.12031849287450314,
      "rewards/format_reward": 0.3333333395421505,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3093.6876220703125,
      "epoch": 0.029714285714285714,
      "grad_norm": 0.21539199352264404,
      "kl": 0.00049591064453125,
      "learning_rate": 5.2e-07,
      "loss": 0.0489,
      "num_tokens": 3704476.0,
      "reward": 0.1484052948653698,
      "reward_std": 0.8102314993739128,
      "rewards/cosine_scaled_reward": -0.07163068139925599,
      "rewards/format_reward": 0.29166666604578495,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2976.229248046875,
      "epoch": 0.030857142857142857,
      "grad_norm": 0.25912725925445557,
      "kl": 0.0005688667297363281,
      "learning_rate": 5.4e-07,
      "loss": -0.0511,
      "num_tokens": 3856233.0,
      "reward": -0.04554035887122154,
      "reward_std": 0.3555384576320648,
      "rewards/cosine_scaled_reward": -0.15818685293197632,
      "rewards/format_reward": 0.27083333395421505,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3116.479248046875,
      "epoch": 0.032,
      "grad_norm": 0.2779461145401001,
      "kl": 0.0006647109985351562,
      "learning_rate": 5.6e-07,
      "loss": 0.0937,
      "num_tokens": 4014002.0,
      "reward": -0.06345795840024948,
      "reward_std": 0.6914098784327507,
      "rewards/cosine_scaled_reward": -0.17756231129169464,
      "rewards/format_reward": 0.2916666679084301,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3091.8125,
      "epoch": 0.03314285714285714,
      "grad_norm": 0.20903803408145905,
      "kl": 0.0004973411560058594,
      "learning_rate": 5.8e-07,
      "loss": 0.0822,
      "num_tokens": 4170833.0,
      "reward": -0.17993240803480148,
      "reward_std": 0.6661744937300682,
      "rewards/cosine_scaled_reward": -0.2357995305210352,
      "rewards/format_reward": 0.29166667349636555,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3212.3125610351562,
      "epoch": 0.03428571428571429,
      "grad_norm": 0.19359447062015533,
      "kl": 0.0006041526794433594,
      "learning_rate": 6e-07,
      "loss": 0.125,
      "num_tokens": 4333220.0,
      "reward": 0.13911130279302597,
      "reward_std": 0.826167568564415,
      "rewards/cosine_scaled_reward": -0.09711101395078003,
      "rewards/format_reward": 0.3333333320915699,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2465.5208740234375,
      "epoch": 0.03542857142857143,
      "grad_norm": 0.3267292380332947,
      "kl": 0.0005950927734375,
      "learning_rate": 6.2e-07,
      "loss": 0.0618,
      "num_tokens": 4459791.0,
      "reward": 0.28785821609199047,
      "reward_std": 0.5182768851518631,
      "rewards/cosine_scaled_reward": -0.08523756638169289,
      "rewards/format_reward": 0.45833333395421505,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3584.0,
      "epoch": 0.036571428571428574,
      "grad_norm": 0.18462678790092468,
      "kl": 0.0006098747253417969,
      "learning_rate": 6.4e-07,
      "loss": 0.0,
      "num_tokens": 4640043.0,
      "reward": -0.45041289925575256,
      "reward_std": 0.18651413917541504,
      "rewards/cosine_scaled_reward": -0.22520644962787628,
      "rewards/format_reward": 0.0,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3056.166748046875,
      "epoch": 0.037714285714285714,
      "grad_norm": 0.22060233354568481,
      "kl": 0.0005612373352050781,
      "learning_rate": 6.6e-07,
      "loss": 0.1453,
      "num_tokens": 4794857.0,
      "reward": 0.029838480055332184,
      "reward_std": 0.7281420417129993,
      "rewards/cosine_scaled_reward": -0.14133075810968876,
      "rewards/format_reward": 0.31249999441206455,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3175.4791870117188,
      "epoch": 0.038857142857142854,
      "grad_norm": 0.24089233577251434,
      "kl": 0.0005521774291992188,
      "learning_rate": 6.800000000000001e-07,
      "loss": 0.0075,
      "num_tokens": 4955170.0,
      "reward": -0.21627317368984222,
      "reward_std": 0.49830519035458565,
      "rewards/cosine_scaled_reward": -0.1914699161425233,
      "rewards/format_reward": 0.1666666679084301,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2963.02099609375,
      "epoch": 0.04,
      "grad_norm": 0.22746875882148743,
      "kl": 0.0005502700805664062,
      "learning_rate": 7e-07,
      "loss": 0.0903,
      "num_tokens": 5105531.0,
      "reward": 0.07940403372049332,
      "reward_std": 0.8007240146398544,
      "rewards/cosine_scaled_reward": -0.16863130778074265,
      "rewards/format_reward": 0.4166666679084301,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2587.0000610351562,
      "epoch": 0.04114285714285714,
      "grad_norm": 0.24526968598365784,
      "kl": 0.0005397796630859375,
      "learning_rate": 7.2e-07,
      "loss": 0.0504,
      "num_tokens": 5237645.0,
      "reward": 0.5731585621833801,
      "reward_std": 0.4250886049121618,
      "rewards/cosine_scaled_reward": 0.05741261690855026,
      "rewards/format_reward": 0.4583333283662796,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2604.7500610351562,
      "epoch": 0.04228571428571429,
      "grad_norm": 0.3840998709201813,
      "kl": 0.00054931640625,
      "learning_rate": 7.4e-07,
      "loss": 0.1006,
      "num_tokens": 5370077.0,
      "reward": 0.44232267513871193,
      "reward_std": 0.6607204154133797,
      "rewards/cosine_scaled_reward": -0.049671996384859085,
      "rewards/format_reward": 0.541666654869914,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2919.1250610351562,
      "epoch": 0.04342857142857143,
      "grad_norm": 0.2132943570613861,
      "kl": 0.000682830810546875,
      "learning_rate": 7.599999999999999e-07,
      "loss": 0.1065,
      "num_tokens": 5517587.0,
      "reward": -0.07279382459819317,
      "reward_std": 0.5536654070019722,
      "rewards/cosine_scaled_reward": -0.2551469076424837,
      "rewards/format_reward": 0.4375000074505806,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2926.6459350585938,
      "epoch": 0.044571428571428574,
      "grad_norm": 0.23606236279010773,
      "kl": 0.0006608963012695312,
      "learning_rate": 7.799999999999999e-07,
      "loss": 0.113,
      "num_tokens": 5665770.0,
      "reward": 0.4562120959162712,
      "reward_std": 0.876225158572197,
      "rewards/cosine_scaled_reward": -0.011477291118353605,
      "rewards/format_reward": 0.4791666641831398,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3004.5625915527344,
      "epoch": 0.045714285714285714,
      "grad_norm": 0.2187039852142334,
      "kl": 0.0005340576171875,
      "learning_rate": 8e-07,
      "loss": 0.068,
      "num_tokens": 5818485.0,
      "reward": 0.15439531083393376,
      "reward_std": 0.6592165231704712,
      "rewards/cosine_scaled_reward": -0.11030234955251217,
      "rewards/format_reward": 0.3750000037252903,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3318.166748046875,
      "epoch": 0.046857142857142854,
      "grad_norm": 0.20403259992599487,
      "kl": 0.0006113052368164062,
      "learning_rate": 8.199999999999999e-07,
      "loss": 0.0189,
      "num_tokens": 5985731.0,
      "reward": 0.24213121831417084,
      "reward_std": 0.8019094243645668,
      "rewards/cosine_scaled_reward": -0.03518439130857587,
      "rewards/format_reward": 0.3124999925494194,
      "step": 41
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2081.0834350585938,
      "epoch": 0.048,
      "grad_norm": 0.3065926134586334,
      "kl": 0.0006427764892578125,
      "learning_rate": 8.399999999999999e-07,
      "loss": 0.0694,
      "num_tokens": 6092841.0,
      "reward": 0.7219225168228149,
      "reward_std": 0.7827914208173752,
      "rewards/cosine_scaled_reward": 0.006794577464461327,
      "rewards/format_reward": 0.7083333358168602,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3077.604248046875,
      "epoch": 0.04914285714285714,
      "grad_norm": 0.25333571434020996,
      "kl": 0.0007266998291015625,
      "learning_rate": 8.599999999999999e-07,
      "loss": 0.0159,
      "num_tokens": 6248312.0,
      "reward": 0.6068893522024155,
      "reward_std": 1.1225253641605377,
      "rewards/cosine_scaled_reward": 0.04302799212746322,
      "rewards/format_reward": 0.5208333283662796,
      "step": 43
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2814.9583740234375,
      "epoch": 0.05028571428571429,
      "grad_norm": 0.2492004930973053,
      "kl": 0.0008058547973632812,
      "learning_rate": 8.799999999999999e-07,
      "loss": 0.0075,
      "num_tokens": 6391476.0,
      "reward": 0.25283733755350113,
      "reward_std": 0.5932779908180237,
      "rewards/cosine_scaled_reward": -0.08191467449069023,
      "rewards/format_reward": 0.41666666232049465,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2648.3959350585938,
      "epoch": 0.05142857142857143,
      "grad_norm": 0.2790423333644867,
      "kl": 0.0006084442138671875,
      "learning_rate": 9e-07,
      "loss": -0.0224,
      "num_tokens": 6526459.0,
      "reward": 0.6963641820475459,
      "reward_std": 0.8004543036222458,
      "rewards/cosine_scaled_reward": 0.05651540495455265,
      "rewards/format_reward": 0.5833333283662796,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2959.2916870117188,
      "epoch": 0.052571428571428575,
      "grad_norm": 0.27206096053123474,
      "kl": 0.000598907470703125,
      "learning_rate": 9.2e-07,
      "loss": 0.1131,
      "num_tokens": 6676827.0,
      "reward": 0.2336385459639132,
      "reward_std": 0.44995977729558945,
      "rewards/cosine_scaled_reward": -0.029014069586992264,
      "rewards/format_reward": 0.2916666716337204,
      "step": 46
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2453.0625915527344,
      "epoch": 0.053714285714285714,
      "grad_norm": 0.28164342045783997,
      "kl": 0.00048160552978515625,
      "learning_rate": 9.399999999999999e-07,
      "loss": 0.0803,
      "num_tokens": 6802206.0,
      "reward": 0.4458533003926277,
      "reward_std": 0.5642239525914192,
      "rewards/cosine_scaled_reward": -0.06874001771211624,
      "rewards/format_reward": 0.5833333283662796,
      "step": 47
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2640.7916870117188,
      "epoch": 0.054857142857142854,
      "grad_norm": 0.3081265687942505,
      "kl": 0.0006432533264160156,
      "learning_rate": 9.6e-07,
      "loss": 0.0203,
      "num_tokens": 6936848.0,
      "reward": 0.4199897248763591,
      "reward_std": 0.5818230472505093,
      "rewards/cosine_scaled_reward": -0.05042180512100458,
      "rewards/format_reward": 0.5208333414047956,
      "step": 48
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2285.7916870117188,
      "epoch": 0.056,
      "grad_norm": 0.3449012339115143,
      "kl": 0.0005960464477539062,
      "learning_rate": 9.8e-07,
      "loss": -0.0279,
      "num_tokens": 7054330.0,
      "reward": 0.6692525297403336,
      "reward_std": 0.6822149157524109,
      "rewards/cosine_scaled_reward": 0.032542891800403595,
      "rewards/format_reward": 0.6041666716337204,
      "step": 49
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2134.8334045410156,
      "epoch": 0.05714285714285714,
      "grad_norm": 0.387600839138031,
      "kl": 0.0008153915405273438,
      "learning_rate": 1e-06,
      "loss": 0.1332,
      "num_tokens": 7163978.0,
      "reward": 0.8850179463624954,
      "reward_std": 0.8217868953943253,
      "rewards/cosine_scaled_reward": 0.1300089694559574,
      "rewards/format_reward": 0.6249999925494194,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3059.0834045410156,
      "epoch": 0.05828571428571429,
      "grad_norm": 0.23220700025558472,
      "kl": 0.0007076263427734375,
      "learning_rate": 9.999890338174275e-07,
      "loss": 0.1301,
      "num_tokens": 7319304.0,
      "reward": -0.1438809223473072,
      "reward_std": 0.772916778922081,
      "rewards/cosine_scaled_reward": -0.2386071290820837,
      "rewards/format_reward": 0.33333333022892475,
      "step": 51
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3285.854248046875,
      "epoch": 0.05942857142857143,
      "grad_norm": 0.19926899671554565,
      "kl": 0.0007266998291015625,
      "learning_rate": 9.999561358041868e-07,
      "loss": 0.0235,
      "num_tokens": 7485797.0,
      "reward": -0.1713619939982891,
      "reward_std": 0.6841993480920792,
      "rewards/cosine_scaled_reward": -0.20026432862505317,
      "rewards/format_reward": 0.22916666232049465,
      "step": 52
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2906.2500610351562,
      "epoch": 0.060571428571428575,
      "grad_norm": 0.293755441904068,
      "kl": 0.0007658004760742188,
      "learning_rate": 9.999013075636804e-07,
      "loss": 0.0641,
      "num_tokens": 7633685.0,
      "reward": -0.10038524121046066,
      "reward_std": 0.548088788986206,
      "rewards/cosine_scaled_reward": -0.19602595455944538,
      "rewards/format_reward": 0.2916666604578495,
      "step": 53
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2585.937530517578,
      "epoch": 0.061714285714285715,
      "grad_norm": 0.29792076349258423,
      "kl": 0.0007724761962890625,
      "learning_rate": 9.998245517681593e-07,
      "loss": 0.065,
      "num_tokens": 7765328.0,
      "reward": -0.13627923466265202,
      "reward_std": 0.2906072996556759,
      "rewards/cosine_scaled_reward": -0.25563961640000343,
      "rewards/format_reward": 0.3749999925494194,
      "step": 54
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3107.7916870117188,
      "epoch": 0.06285714285714286,
      "grad_norm": 0.22162412106990814,
      "kl": 0.0007085800170898438,
      "learning_rate": 9.997258721585931e-07,
      "loss": 0.0752,
      "num_tokens": 7922746.0,
      "reward": -0.2543004397302866,
      "reward_std": 0.6761848628520966,
      "rewards/cosine_scaled_reward": -0.2625668868422508,
      "rewards/format_reward": 0.27083333022892475,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2845.8958740234375,
      "epoch": 0.064,
      "grad_norm": 0.2844100296497345,
      "kl": 0.0008029937744140625,
      "learning_rate": 9.996052735444862e-07,
      "loss": 0.0857,
      "num_tokens": 8068109.0,
      "reward": 0.17015837877988815,
      "reward_std": 0.6355826109647751,
      "rewards/cosine_scaled_reward": -0.060754150777938776,
      "rewards/format_reward": 0.2916666679084301,
      "step": 56
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2969.6250610351562,
      "epoch": 0.06514285714285714,
      "grad_norm": 0.3783540725708008,
      "kl": 0.0008459091186523438,
      "learning_rate": 9.994627618036452e-07,
      "loss": 0.0707,
      "num_tokens": 8218943.0,
      "reward": 0.123802050948143,
      "reward_std": 0.569492757320404,
      "rewards/cosine_scaled_reward": -0.12559896823950112,
      "rewards/format_reward": 0.375,
      "step": 57
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2966.0209350585938,
      "epoch": 0.06628571428571428,
      "grad_norm": 0.24569377303123474,
      "kl": 0.0008134841918945312,
      "learning_rate": 9.992983438818915e-07,
      "loss": 0.0928,
      "num_tokens": 8369862.0,
      "reward": -0.38002127036452293,
      "reward_std": 0.40066082403063774,
      "rewards/cosine_scaled_reward": -0.3358439467847347,
      "rewards/format_reward": 0.2916666679084301,
      "step": 58
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2924.1251220703125,
      "epoch": 0.06742857142857143,
      "grad_norm": 0.22264137864112854,
      "kl": 0.00084686279296875,
      "learning_rate": 9.991120277927223e-07,
      "loss": -0.0067,
      "num_tokens": 8518362.0,
      "reward": 0.3800372362602502,
      "reward_std": 0.7081842869520187,
      "rewards/cosine_scaled_reward": 0.0025186067214235663,
      "rewards/format_reward": 0.3750000037252903,
      "step": 59
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2843.4375610351562,
      "epoch": 0.06857142857142857,
      "grad_norm": 0.26855766773223877,
      "kl": 0.0009984970092773438,
      "learning_rate": 9.989038226169207e-07,
      "loss": 0.0308,
      "num_tokens": 8663217.0,
      "reward": 0.23737115785479546,
      "reward_std": 0.5052844993770123,
      "rewards/cosine_scaled_reward": -0.04798109957482666,
      "rewards/format_reward": 0.3333333358168602,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2237.854217529297,
      "epoch": 0.06971428571428571,
      "grad_norm": 0.26261472702026367,
      "kl": 0.0006732940673828125,
      "learning_rate": 9.98673738502114e-07,
      "loss": 0.0594,
      "num_tokens": 8778356.0,
      "reward": 0.7166529446840286,
      "reward_std": 0.5268924571573734,
      "rewards/cosine_scaled_reward": 0.06665980257093906,
      "rewards/format_reward": 0.5833333283662796,
      "step": 61
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3172.4376220703125,
      "epoch": 0.07085714285714285,
      "grad_norm": 0.19514356553554535,
      "kl": 0.0007085800170898438,
      "learning_rate": 9.98421786662277e-07,
      "loss": 0.0541,
      "num_tokens": 8938649.0,
      "reward": 0.18832573667168617,
      "reward_std": 0.6950011849403381,
      "rewards/cosine_scaled_reward": -0.06208712235093117,
      "rewards/format_reward": 0.3125,
      "step": 62
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1910.0833587646484,
      "epoch": 0.072,
      "grad_norm": 0.3044058680534363,
      "kl": 0.001178741455078125,
      "learning_rate": 9.981479793771866e-07,
      "loss": 0.0997,
      "num_tokens": 9037839.0,
      "reward": 0.692467100918293,
      "reward_std": 0.7802318185567856,
      "rewards/cosine_scaled_reward": 0.012900200905278325,
      "rewards/format_reward": 0.6666666567325592,
      "step": 63
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3039.916748046875,
      "epoch": 0.07314285714285715,
      "grad_norm": 0.23920981585979462,
      "kl": 0.0007276535034179688,
      "learning_rate": 9.97852329991824e-07,
      "loss": 0.0027,
      "num_tokens": 9191927.0,
      "reward": 0.4938540682196617,
      "reward_std": 0.7745917662978172,
      "rewards/cosine_scaled_reward": 0.05942701664753258,
      "rewards/format_reward": 0.3750000074505806,
      "step": 64
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2283.8334350585938,
      "epoch": 0.07428571428571429,
      "grad_norm": 0.23445375263690948,
      "kl": 0.0009222030639648438,
      "learning_rate": 9.975348529157229e-07,
      "loss": 0.1023,
      "num_tokens": 9309057.0,
      "reward": 0.34917649751878344,
      "reward_std": 0.7536975219845772,
      "rewards/cosine_scaled_reward": -0.11707842443138361,
      "rewards/format_reward": 0.5833333432674408,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 3020.8333740234375,
      "epoch": 0.07542857142857143,
      "grad_norm": 0.23693493008613586,
      "kl": 0.0007867813110351562,
      "learning_rate": 9.971955636222684e-07,
      "loss": 0.0562,
      "num_tokens": 9461935.0,
      "reward": -0.07892957702279091,
      "reward_std": 0.5825114250183105,
      "rewards/cosine_scaled_reward": -0.2165481224656105,
      "rewards/format_reward": 0.3541666641831398,
      "step": 66
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2902.0208435058594,
      "epoch": 0.07657142857142857,
      "grad_norm": 0.23581241071224213,
      "kl": 0.00127410888671875,
      "learning_rate": 9.968344786479415e-07,
      "loss": 0.1158,
      "num_tokens": 9609506.0,
      "reward": 0.5483582876622677,
      "reward_std": 0.5557361207902431,
      "rewards/cosine_scaled_reward": 0.055429140105843544,
      "rewards/format_reward": 0.4375000111758709,
      "step": 67
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2495.6458740234375,
      "epoch": 0.07771428571428571,
      "grad_norm": 0.3460405170917511,
      "kl": 0.0016241073608398438,
      "learning_rate": 9.964516155915151e-07,
      "loss": -0.0339,
      "num_tokens": 9737079.0,
      "reward": -0.16728203371167183,
      "reward_std": 0.2556677311658859,
      "rewards/cosine_scaled_reward": -0.27114101499319077,
      "rewards/format_reward": 0.375,
      "step": 68
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 2326.791748046875,
      "epoch": 0.07885714285714286,
      "grad_norm": 0.2886251211166382,
      "kl": 0.0007181167602539062,
      "learning_rate": 9.960469931131936e-07,
      "loss": 0.0408,
      "num_tokens": 9856457.0,
      "reward": 0.2764681279659271,
      "reward_std": 0.4927753880620003,
| "rewards/cosine_scaled_reward": -0.1221826063701883, | |
| "rewards/format_reward": 0.5208333283662796, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2659.9584045410156, | |
| "epoch": 0.08, | |
| "grad_norm": 0.18427890539169312, | |
| "kl": 0.0009355545043945312, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0742, | |
| "num_tokens": 9992241.0, | |
| "reward": 0.3243530666222796, | |
| "reward_std": 0.5141221769154072, | |
| "rewards/cosine_scaled_reward": -0.11907346919178963, | |
| "rewards/format_reward": 0.5624999981373549, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2051.4375610351562, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.37901344895362854, | |
| "kl": 0.001514434814453125, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.1337, | |
| "num_tokens": 10098330.0, | |
| "reward": 0.2932474911212921, | |
| "reward_std": 0.562146857380867, | |
| "rewards/cosine_scaled_reward": -0.16587623208761215, | |
| "rewards/format_reward": 0.6249999925494194, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2446.187530517578, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.2612290680408478, | |
| "kl": 0.0010986328125, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.063, | |
| "num_tokens": 10223667.0, | |
| "reward": 0.843310259282589, | |
| "reward_std": 0.9821145087480545, | |
| "rewards/cosine_scaled_reward": 0.10915513057261705, | |
| "rewards/format_reward": 0.6249999925494194, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1752.6875762939453, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.32139480113983154, | |
| "kl": 0.001506805419921875, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0949, | |
| "num_tokens": 10314594.0, | |
| "reward": 0.7357300966978073, | |
| "reward_std": 0.713388629257679, | |
| "rewards/cosine_scaled_reward": 0.00328170508146286, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2221.375030517578, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.26274731755256653, | |
| "kl": 0.00138092041015625, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.142, | |
| "num_tokens": 10429110.0, | |
| "reward": 0.44233171858650167, | |
| "reward_std": 0.6651621311903, | |
| "rewards/cosine_scaled_reward": -0.08091748412698507, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2533.6459350585938, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.3130059540271759, | |
| "kl": 0.0022735595703125, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.1731, | |
| "num_tokens": 10558639.0, | |
| "reward": 0.012795105576515198, | |
| "reward_std": 0.6212242320179939, | |
| "rewards/cosine_scaled_reward": -0.21235244907438755, | |
| "rewards/format_reward": 0.4375000037252903, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2811.0001220703125, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.2659956216812134, | |
| "kl": 0.0014276504516601562, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.1683, | |
| "num_tokens": 10701949.0, | |
| "reward": 0.43999071419239044, | |
| "reward_std": 0.5641827136278152, | |
| "rewards/cosine_scaled_reward": 0.011662017554044724, | |
| "rewards/format_reward": 0.4166666641831398, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3142.1458740234375, | |
| "epoch": 0.088, | |
| "grad_norm": 0.2155051827430725, | |
| "kl": 0.0018138885498046875, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0556, | |
| "num_tokens": 10860824.0, | |
| "reward": 0.007742300629615784, | |
| "reward_std": 0.5252507999539375, | |
| "rewards/cosine_scaled_reward": -0.1107122041285038, | |
| "rewards/format_reward": 0.2291666641831398, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2719.3333740234375, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.27928948402404785, | |
| "kl": 0.0011129379272460938, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0509, | |
| "num_tokens": 10998852.0, | |
| "reward": 0.4107997752726078, | |
| "reward_std": 0.7809228599071503, | |
| "rewards/cosine_scaled_reward": -0.002933473326265812, | |
| "rewards/format_reward": 0.41666666232049465, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2861.6875610351562, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.24252408742904663, | |
| "kl": 0.0010776519775390625, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.167, | |
| "num_tokens": 11144973.0, | |
| "reward": 0.06531679630279541, | |
| "reward_std": 0.5613357946276665, | |
| "rewards/cosine_scaled_reward": -0.17567493673413992, | |
| "rewards/format_reward": 0.4166666641831398, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3362.5416870117188, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.1729951947927475, | |
| "kl": 0.0009174346923828125, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0921, | |
| "num_tokens": 11314865.0, | |
| "reward": -0.06127368565648794, | |
| "reward_std": 0.6489161625504494, | |
| "rewards/cosine_scaled_reward": -0.1348035205155611, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2769.0625610351562, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.20924705266952515, | |
| "kl": 0.0013446807861328125, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.1263, | |
| "num_tokens": 11455040.0, | |
| "reward": 0.23385965824127197, | |
| "reward_std": 0.7129553332924843, | |
| "rewards/cosine_scaled_reward": -0.08098683506250381, | |
| "rewards/format_reward": 0.3958333283662796, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2727.9584350585938, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.26771607995033264, | |
| "kl": 0.0012149810791015625, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.2245, | |
| "num_tokens": 11593944.0, | |
| "reward": 0.02113605911290506, | |
| "reward_std": 0.5952873006463051, | |
| "rewards/cosine_scaled_reward": -0.21859865076839924, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3496.9375610351562, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.20277726650238037, | |
| "kl": 0.0009927749633789062, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0322, | |
| "num_tokens": 11770755.0, | |
| "reward": -0.40173853002488613, | |
| "reward_std": 0.4993053339421749, | |
| "rewards/cosine_scaled_reward": -0.28420259058475494, | |
| "rewards/format_reward": 0.16666666604578495, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2602.8959350585938, | |
| "epoch": 0.096, | |
| "grad_norm": 0.24264362454414368, | |
| "kl": 0.0011615753173828125, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.1134, | |
| "num_tokens": 11903482.0, | |
| "reward": 0.8407080993056297, | |
| "reward_std": 0.7610376700758934, | |
| "rewards/cosine_scaled_reward": 0.11827070452272892, | |
| "rewards/format_reward": 0.6041666567325592, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3080.4375915527344, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.1991293877363205, | |
| "kl": 0.001140594482421875, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.008, | |
| "num_tokens": 12058927.0, | |
| "reward": 0.20353421010077, | |
| "reward_std": 0.8020459171384573, | |
| "rewards/cosine_scaled_reward": -0.06489956192672253, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2708.541748046875, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.24375155568122864, | |
| "kl": 0.001434326171875, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0966, | |
| "num_tokens": 12197073.0, | |
| "reward": 0.2201586167793721, | |
| "reward_std": 0.3715602122247219, | |
| "rewards/cosine_scaled_reward": -0.08783736452460289, | |
| "rewards/format_reward": 0.3958333283662796, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2833.2500610351562, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.2276199460029602, | |
| "kl": 0.001941680908203125, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0738, | |
| "num_tokens": 12340527.0, | |
| "reward": -0.10197540372610092, | |
| "reward_std": 0.6884967442601919, | |
| "rewards/cosine_scaled_reward": -0.2384876972064376, | |
| "rewards/format_reward": 0.3749999925494194, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2349.5833740234375, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.2580890357494354, | |
| "kl": 0.0018215179443359375, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0471, | |
| "num_tokens": 12461275.0, | |
| "reward": 0.8556020110845566, | |
| "reward_std": 0.8413996547460556, | |
| "rewards/cosine_scaled_reward": 0.13613433949649334, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3218.3125610351562, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.19695505499839783, | |
| "kl": 0.00116729736328125, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0728, | |
| "num_tokens": 12623878.0, | |
| "reward": 0.16969604790210724, | |
| "reward_std": 0.7643514573574066, | |
| "rewards/cosine_scaled_reward": -0.050568655133247375, | |
| "rewards/format_reward": 0.27083333395421505, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2965.916748046875, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.23769812285900116, | |
| "kl": 0.0013370513916015625, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.1051, | |
| "num_tokens": 12774504.0, | |
| "reward": -0.07332007400691509, | |
| "reward_std": 0.7234293296933174, | |
| "rewards/cosine_scaled_reward": -0.19291004166007042, | |
| "rewards/format_reward": 0.3124999962747097, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3094.291748046875, | |
| "epoch": 0.104, | |
| "grad_norm": 0.23373101651668549, | |
| "kl": 0.0011157989501953125, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0653, | |
| "num_tokens": 12931448.0, | |
| "reward": -0.2179771214723587, | |
| "reward_std": 0.5946699306368828, | |
| "rewards/cosine_scaled_reward": -0.2652385588735342, | |
| "rewards/format_reward": 0.31249999813735485, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2507.500030517578, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.26020577549934387, | |
| "kl": 0.001468658447265625, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.1937, | |
| "num_tokens": 13060088.0, | |
| "reward": 0.3611624091863632, | |
| "reward_std": 0.5020711049437523, | |
| "rewards/cosine_scaled_reward": -0.07983547076582909, | |
| "rewards/format_reward": 0.5208333283662796, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2233.666717529297, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.3289438486099243, | |
| "kl": 0.001186370849609375, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.1828, | |
| "num_tokens": 13175656.0, | |
| "reward": 0.38644066639244556, | |
| "reward_std": 0.6418244391679764, | |
| "rewards/cosine_scaled_reward": -0.09844633843749762, | |
| "rewards/format_reward": 0.583333320915699, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3342.6666870117188, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.22657889127731323, | |
| "kl": 0.0014677047729492188, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0397, | |
| "num_tokens": 13344348.0, | |
| "reward": -0.197869211435318, | |
| "reward_std": 0.6262447759509087, | |
| "rewards/cosine_scaled_reward": -0.1926846019923687, | |
| "rewards/format_reward": 0.1875, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3180.7084350585938, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.18802064657211304, | |
| "kl": 0.0009918212890625, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": -0.0051, | |
| "num_tokens": 13506052.0, | |
| "reward": -0.047770393546670675, | |
| "reward_std": 0.6212666258215904, | |
| "rewards/cosine_scaled_reward": -0.16971853002905846, | |
| "rewards/format_reward": 0.29166666232049465, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2359.6459350585938, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.3318856656551361, | |
| "kl": 0.0027332305908203125, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0385, | |
| "num_tokens": 13626977.0, | |
| "reward": 0.3386218100786209, | |
| "reward_std": 0.5042477250099182, | |
| "rewards/cosine_scaled_reward": -0.10152245499193668, | |
| "rewards/format_reward": 0.5416666641831398, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3064.625030517578, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.2832983434200287, | |
| "kl": 0.0015583038330078125, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.046, | |
| "num_tokens": 13782221.0, | |
| "reward": 0.006381474435329437, | |
| "reward_std": 0.5968942120671272, | |
| "rewards/cosine_scaled_reward": -0.12180926650762558, | |
| "rewards/format_reward": 0.2499999962747097, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2421.041748046875, | |
| "epoch": 0.112, | |
| "grad_norm": 0.24057374894618988, | |
| "kl": 0.001895904541015625, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0459, | |
| "num_tokens": 13906513.0, | |
| "reward": 0.23158405721187592, | |
| "reward_std": 0.46807075664401054, | |
| "rewards/cosine_scaled_reward": -0.12379130208864808, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2506.291748046875, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.2858019769191742, | |
| "kl": 0.0012254714965820312, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0971, | |
| "num_tokens": 14034993.0, | |
| "reward": 0.28322335705161095, | |
| "reward_std": 0.4190603345632553, | |
| "rewards/cosine_scaled_reward": -0.11880499497056007, | |
| "rewards/format_reward": 0.5208333320915699, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2948.5625610351562, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.2643907070159912, | |
| "kl": 0.00148773193359375, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": -0.0572, | |
| "num_tokens": 14184588.0, | |
| "reward": 0.147983580827713, | |
| "reward_std": 0.6056301072239876, | |
| "rewards/cosine_scaled_reward": -0.09267487563192844, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2414.666732788086, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.2964082658290863, | |
| "kl": 0.001434326171875, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0541, | |
| "num_tokens": 14308580.0, | |
| "reward": 0.07409980893135071, | |
| "reward_std": 0.4281482808291912, | |
| "rewards/cosine_scaled_reward": -0.18170008901506662, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2445.604217529297, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.25845882296562195, | |
| "kl": 0.0014829635620117188, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.1582, | |
| "num_tokens": 14433931.0, | |
| "reward": 0.17498280853033066, | |
| "reward_std": 0.7513352856040001, | |
| "rewards/cosine_scaled_reward": -0.18334193527698517, | |
| "rewards/format_reward": 0.5416666567325592, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2614.7291870117188, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.20373135805130005, | |
| "kl": 0.0013742446899414062, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0359, | |
| "num_tokens": 14567022.0, | |
| "reward": 0.029875081032514572, | |
| "reward_std": 0.6425531953573227, | |
| "rewards/cosine_scaled_reward": -0.19339579716324806, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2409.0001220703125, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.23686547577381134, | |
| "kl": 0.0011281967163085938, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.106, | |
| "num_tokens": 14690484.0, | |
| "reward": 0.8152372092008591, | |
| "reward_std": 0.6015111114829779, | |
| "rewards/cosine_scaled_reward": 0.13678526505827904, | |
| "rewards/format_reward": 0.5416666641831398, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2311.3125610351562, | |
| "epoch": 0.12, | |
| "grad_norm": 0.2990974187850952, | |
| "kl": 0.00154876708984375, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.2331, | |
| "num_tokens": 14809947.0, | |
| "reward": 0.3171987719833851, | |
| "reward_std": 0.437034510076046, | |
| "rewards/cosine_scaled_reward": -0.11223394051194191, | |
| "rewards/format_reward": 0.5416666641831398, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2736.7500610351562, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.26455891132354736, | |
| "kl": 0.002010345458984375, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.1562, | |
| "num_tokens": 14948811.0, | |
| "reward": 0.34779771137982607, | |
| "reward_std": 0.6852256283164024, | |
| "rewards/cosine_scaled_reward": -0.024017807096242905, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2184.2291870117188, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.33748024702072144, | |
| "kl": 0.0018157958984375, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": -0.0665, | |
| "num_tokens": 15061676.0, | |
| "reward": 0.24546424997970462, | |
| "reward_std": 0.5336676575243473, | |
| "rewards/cosine_scaled_reward": -0.16893455758690834, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2026.3126068115234, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.34937456250190735, | |
| "kl": 0.0019092559814453125, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.1239, | |
| "num_tokens": 15166709.0, | |
| "reward": 0.8228390365839005, | |
| "reward_std": 0.5794718265533447, | |
| "rewards/cosine_scaled_reward": 0.10933613404631615, | |
| "rewards/format_reward": 0.6041666623204947, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2952.7916870117188, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.3034088611602783, | |
| "kl": 0.0014963150024414062, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": -0.0444, | |
| "num_tokens": 15316837.0, | |
| "reward": -0.007000848650932312, | |
| "reward_std": 0.5003913566470146, | |
| "rewards/cosine_scaled_reward": -0.15975042153149843, | |
| "rewards/format_reward": 0.31249999813735485, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2271.3125610351562, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.2639235854148865, | |
| "kl": 0.002208709716796875, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.1034, | |
| "num_tokens": 15433678.0, | |
| "reward": 1.0950873866677284, | |
| "reward_std": 0.6892717778682709, | |
| "rewards/cosine_scaled_reward": 0.17254368215799332, | |
| "rewards/format_reward": 0.7499999850988388, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1646.6458740234375, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.3545893728733063, | |
| "kl": 0.002468109130859375, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.1439, | |
| "num_tokens": 15519989.0, | |
| "reward": 0.6878980733454227, | |
| "reward_std": 0.6134084016084671, | |
| "rewards/cosine_scaled_reward": -0.02063431334681809, | |
| "rewards/format_reward": 0.7291666641831398, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2729.0209045410156, | |
| "epoch": 0.128, | |
| "grad_norm": 0.22189046442508698, | |
| "kl": 0.001827239990234375, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.067, | |
| "num_tokens": 15658938.0, | |
| "reward": 0.4110586680471897, | |
| "reward_std": 0.5899315737187862, | |
| "rewards/cosine_scaled_reward": -0.02363734319806099, | |
| "rewards/format_reward": 0.4583333283662796, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1823.4375915527344, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.2697267532348633, | |
| "kl": 0.0023956298828125, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0572, | |
| "num_tokens": 15754773.0, | |
| "reward": 0.7300843407865614, | |
| "reward_std": 0.41797127947211266, | |
| "rewards/cosine_scaled_reward": 0.02129216119647026, | |
| "rewards/format_reward": 0.6875, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1685.7500610351562, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.3274898827075958, | |
| "kl": 0.0018157958984375, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.1355, | |
| "num_tokens": 15843075.0, | |
| "reward": 0.9840904623270035, | |
| "reward_std": 0.6420945823192596, | |
| "rewards/cosine_scaled_reward": 0.13787856203271076, | |
| "rewards/format_reward": 0.708333320915699, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2409.3750610351562, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.21988999843597412, | |
| "kl": 0.0018787384033203125, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.1401, | |
| "num_tokens": 15967587.0, | |
| "reward": 0.5942223705351353, | |
| "reward_std": 0.9110212624073029, | |
| "rewards/cosine_scaled_reward": -0.03622216871008277, | |
| "rewards/format_reward": 0.6666666567325592, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2412.041748046875, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.2862938940525055, | |
| "kl": 0.0075397491455078125, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.068, | |
| "num_tokens": 16090931.0, | |
| "reward": 0.35588742792606354, | |
| "reward_std": 0.5233044624328613, | |
| "rewards/cosine_scaled_reward": -0.07205628603696823, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3117.5208740234375, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.17605924606323242, | |
| "kl": 0.0014972686767578125, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.127, | |
| "num_tokens": 16249248.0, | |
| "reward": -0.08803762402385473, | |
| "reward_std": 0.6678096652030945, | |
| "rewards/cosine_scaled_reward": -0.18985214456915855, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1823.6042175292969, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.3110818564891815, | |
| "kl": 0.0024127960205078125, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0435, | |
| "num_tokens": 16344899.0, | |
| "reward": 0.8677586033008993, | |
| "reward_std": 0.6636749655008316, | |
| "rewards/cosine_scaled_reward": 0.058879293501377106, | |
| "rewards/format_reward": 0.75, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1695.0000457763672, | |
| "epoch": 0.136, | |
| "grad_norm": 0.4480116665363312, | |
| "kl": 0.002857208251953125, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.2014, | |
| "num_tokens": 16434299.0, | |
| "reward": 0.3477291911840439, | |
| "reward_std": 0.4658740572631359, | |
| "rewards/cosine_scaled_reward": -0.16988542396575212, | |
| "rewards/format_reward": 0.6874999925494194, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2670.5625610351562, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.3484485149383545, | |
| "kl": 0.0051021575927734375, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.1218, | |
| "num_tokens": 16570394.0, | |
| "reward": 0.19900877276086248, | |
| "reward_std": 0.7815569303929806, | |
| "rewards/cosine_scaled_reward": -0.17132895928807557, | |
| "rewards/format_reward": 0.5416666641831398, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2795.354278564453, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.26137691736221313, | |
| "kl": 0.0018291473388671875, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.1001, | |
| "num_tokens": 16712407.0, | |
| "reward": 0.32655623741447926, | |
| "reward_std": 0.5866650827229023, | |
| "rewards/cosine_scaled_reward": -0.055471885949373245, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2151.520965576172, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.23685228824615479, | |
| "kl": 0.0031890869140625, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0582, | |
| "num_tokens": 16823474.0, | |
| "reward": 0.03241742588579655, | |
| "reward_std": 0.5829479023814201, | |
| "rewards/cosine_scaled_reward": -0.28587461821734905, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2567.7084350585938, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.2046615034341812, | |
| "kl": 0.0016498565673828125, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.1672, | |
| "num_tokens": 16954332.0, | |
| "reward": 0.25633008778095245, | |
| "reward_std": 0.868492528796196, | |
| "rewards/cosine_scaled_reward": -0.13225161656737328, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2511.541748046875, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.3218223750591278, | |
| "kl": 0.0021820068359375, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0675, | |
| "num_tokens": 17083004.0, | |
| "reward": 0.7707539834082127, | |
| "reward_std": 0.6489643305540085, | |
| "rewards/cosine_scaled_reward": 0.11454363912343979, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2831.3959350585938, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.20118391513824463, | |
| "kl": 0.0013217926025390625, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.1636, | |
| "num_tokens": 17226939.0, | |
| "reward": 0.01697085052728653, | |
| "reward_std": 0.6512814313173294, | |
| "rewards/cosine_scaled_reward": -0.18943122308701277, | |
| "rewards/format_reward": 0.3958333283662796, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1912.6875457763672, | |
| "epoch": 0.144, | |
| "grad_norm": 0.3025752902030945, | |
| "kl": 0.00395965576171875, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0732, | |
| "num_tokens": 17326566.0, | |
| "reward": 0.29770641401410103, | |
| "reward_std": 0.6499512940645218, | |
| "rewards/cosine_scaled_reward": -0.1948967999778688, | |
| "rewards/format_reward": 0.6875, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2401.1875610351562, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.2488543838262558, | |
| "kl": 0.001705169677734375, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.1578, | |
| "num_tokens": 17449599.0, | |
| "reward": 0.6098210737109184, | |
| "reward_std": 0.6302774548530579, | |
| "rewards/cosine_scaled_reward": 0.02366053406149149, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2305.854248046875, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.24932162463665009, | |
| "kl": 0.00275421142578125, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.1021, | |
| "num_tokens": 17568806.0, | |
| "reward": 0.21157516352832317, | |
| "reward_std": 0.688478484749794, | |
| "rewards/cosine_scaled_reward": -0.16504575312137604, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1629.6250915527344, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.31823137402534485, | |
| "kl": 0.0034008026123046875, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0721, | |
| "num_tokens": 17654918.0, | |
| "reward": 0.7311810962855816, | |
| "reward_std": 0.48786818608641624, | |
| "rewards/cosine_scaled_reward": -0.009409455582499504, | |
| "rewards/format_reward": 0.75, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2351.6250610351562, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.22688433527946472, | |
| "kl": 0.002315521240234375, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.1654, | |
| "num_tokens": 17775866.0, | |
| "reward": 0.1337121445685625, | |
| "reward_std": 0.489741962403059, | |
| "rewards/cosine_scaled_reward": -0.19356059283018112, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2512.2083740234375, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.2089494913816452, | |
| "kl": 0.002399444580078125, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0292, | |
| "num_tokens": 17904126.0, | |
| "reward": 0.10406213253736496, | |
| "reward_std": 0.556065134704113, | |
| "rewards/cosine_scaled_reward": -0.22921893745660782, | |
| "rewards/format_reward": 0.5624999962747097, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2390.916748046875, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.25721561908721924, | |
| "kl": 0.002170562744140625, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.1248, | |
| "num_tokens": 18027062.0, | |
| "reward": 0.03476526029407978, | |
| "reward_std": 0.5311327800154686, | |
| "rewards/cosine_scaled_reward": -0.22220071218907833, | |
| "rewards/format_reward": 0.4791666641831398, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2429.3959350585938, | |
| "epoch": 0.152, | |
| "grad_norm": 0.24131183326244354, | |
| "kl": 0.002513885498046875, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.021, | |
| "num_tokens": 18151617.0, | |
| "reward": 0.32898006960749626, | |
| "reward_std": 0.6633763536810875, | |
| "rewards/cosine_scaled_reward": -0.15842663776129484, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1656.7500457763672, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.769141674041748, | |
| "kl": 0.020517349243164062, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": -0.092, | |
| "num_tokens": 18238839.0, | |
| "reward": 0.7126629631966352, | |
| "reward_std": 0.7828906625509262, | |
| "rewards/cosine_scaled_reward": -0.06033520896744449, | |
| "rewards/format_reward": 0.8333333283662796, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2022.3334045410156, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.2641633152961731, | |
| "kl": 0.002735137939453125, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": -0.0217, | |
| "num_tokens": 18343621.0, | |
| "reward": 0.44979627430438995, | |
| "reward_std": 0.4202271206304431, | |
| "rewards/cosine_scaled_reward": -0.0876018637791276, | |
| "rewards/format_reward": 0.625, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1820.8958740234375, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.40350502729415894, | |
| "kl": 0.0032672882080078125, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.1507, | |
| "num_tokens": 18439412.0, | |
| "reward": 0.5865317583084106, | |
| "reward_std": 0.6874261423945427, | |
| "rewards/cosine_scaled_reward": -0.07131745107471943, | |
| "rewards/format_reward": 0.7291666641831398, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2143.875, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.31186163425445557, | |
| "kl": 0.003925323486328125, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0978, | |
| "num_tokens": 18550394.0, | |
| "reward": 0.24395663291215897, | |
| "reward_std": 0.3794162981212139, | |
| "rewards/cosine_scaled_reward": -0.1592716935556382, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1822.8750457763672, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.2998411953449249, | |
| "kl": 0.0027942657470703125, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0397, | |
| "num_tokens": 18645914.0, | |
| "reward": 0.8181012012064457, | |
| "reward_std": 0.8359499275684357, | |
| "rewards/cosine_scaled_reward": 0.04446728294715285, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2293.8125610351562, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.23238134384155273, | |
| "kl": 0.0021343231201171875, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": -0.0407, | |
| "num_tokens": 18764201.0, | |
| "reward": 0.44185843877494335, | |
| "reward_std": 0.595511220395565, | |
| "rewards/cosine_scaled_reward": -0.10198746342211962, | |
| "rewards/format_reward": 0.645833333954215, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1963.7500305175781, | |
| "epoch": 0.16, | |
| "grad_norm": 0.32312485575675964, | |
| "kl": 0.002685546875, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": -0.005, | |
| "num_tokens": 18866069.0, | |
| "reward": 0.7035277560353279, | |
| "reward_std": 0.563681848347187, | |
| "rewards/cosine_scaled_reward": -0.033652789890766144, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1937.9792098999023, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.336083322763443, | |
| "kl": 0.002849578857421875, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0896, | |
| "num_tokens": 18967072.0, | |
| "reward": 0.6717538591474295, | |
| "reward_std": 0.4805175382643938, | |
| "rewards/cosine_scaled_reward": -0.007873063907027245, | |
| "rewards/format_reward": 0.6874999962747097, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1959.3125610351562, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.3192788362503052, | |
| "kl": 0.0032787322998046875, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0537, | |
| "num_tokens": 19069219.0, | |
| "reward": 0.5299720168113708, | |
| "reward_std": 0.5246853530406952, | |
| "rewards/cosine_scaled_reward": -0.026680664159357548, | |
| "rewards/format_reward": 0.5833333283662796, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2556.0000610351562, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.23155175149440765, | |
| "kl": 0.0023345947265625, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0581, | |
| "num_tokens": 19200019.0, | |
| "reward": 0.5917239114642143, | |
| "reward_std": 0.7362638562917709, | |
| "rewards/cosine_scaled_reward": 0.014611944556236267, | |
| "rewards/format_reward": 0.5625, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2280.416748046875, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.27343228459358215, | |
| "kl": 0.00260162353515625, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.1897, | |
| "num_tokens": 19317651.0, | |
| "reward": 0.27971339225769043, | |
| "reward_std": 0.8346386849880219, | |
| "rewards/cosine_scaled_reward": -0.17264331132173538, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1757.3125610351562, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.31246596574783325, | |
| "kl": 0.0029392242431640625, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.1442, | |
| "num_tokens": 19409574.0, | |
| "reward": 0.28544900193810463, | |
| "reward_std": 0.6451750323176384, | |
| "rewards/cosine_scaled_reward": -0.23227551455056528, | |
| "rewards/format_reward": 0.75, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1878.5000305175781, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.27821478247642517, | |
| "kl": 0.0026645660400390625, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": -0.0058, | |
| "num_tokens": 19507560.0, | |
| "reward": 0.08767887763679028, | |
| "reward_std": 0.2916584052145481, | |
| "rewards/cosine_scaled_reward": -0.3311605527997017, | |
| "rewards/format_reward": 0.75, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1733.6250305175781, | |
| "epoch": 0.168, | |
| "grad_norm": 0.3348139822483063, | |
| "kl": 0.002803802490234375, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0576, | |
| "num_tokens": 19598754.0, | |
| "reward": 0.6129366103559732, | |
| "reward_std": 0.6426418013870716, | |
| "rewards/cosine_scaled_reward": -0.07894838228821754, | |
| "rewards/format_reward": 0.7708333283662796, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1734.5625457763672, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.348442405462265, | |
| "kl": 0.0042572021484375, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0653, | |
| "num_tokens": 19690329.0, | |
| "reward": 0.09828764945268631, | |
| "reward_std": 0.49646729975938797, | |
| "rewards/cosine_scaled_reward": -0.32585618272423744, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1818.1250915527344, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.3159027695655823, | |
| "kl": 0.003021240234375, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.1256, | |
| "num_tokens": 19785447.0, | |
| "reward": 0.9565939288586378, | |
| "reward_std": 0.8120209276676178, | |
| "rewards/cosine_scaled_reward": 0.10329693369567394, | |
| "rewards/format_reward": 0.7499999925494194, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1871.4584045410156, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.30927422642707825, | |
| "kl": 0.003932952880859375, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.1203, | |
| "num_tokens": 19883539.0, | |
| "reward": 0.08374191913753748, | |
| "reward_std": 0.42122264206409454, | |
| "rewards/cosine_scaled_reward": -0.30187904462218285, | |
| "rewards/format_reward": 0.6874999962747097, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1639.1459045410156, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.31134167313575745, | |
| "kl": 0.0028209686279296875, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0552, | |
| "num_tokens": 19970774.0, | |
| "reward": 0.5363836996257305, | |
| "reward_std": 0.3387358784675598, | |
| "rewards/cosine_scaled_reward": -0.11722482740879059, | |
| "rewards/format_reward": 0.7708333283662796, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2605.3333740234375, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.24991856515407562, | |
| "kl": 0.0028591156005859375, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.1381, | |
| "num_tokens": 20104146.0, | |
| "reward": 0.15796156786382198, | |
| "reward_std": 0.6485603600740433, | |
| "rewards/cosine_scaled_reward": -0.17101922258734703, | |
| "rewards/format_reward": 0.5, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2192.8333740234375, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.2899620831012726, | |
| "kl": 0.0022563934326171875, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": -0.0387, | |
| "num_tokens": 20217076.0, | |
| "reward": 0.8120561987161636, | |
| "reward_std": 0.8174077644944191, | |
| "rewards/cosine_scaled_reward": 0.08311141841113567, | |
| "rewards/format_reward": 0.6458333358168602, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2512.0625610351562, | |
| "epoch": 0.176, | |
| "grad_norm": 0.20951369404792786, | |
| "kl": 0.001766204833984375, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.1924, | |
| "num_tokens": 20345389.0, | |
| "reward": 0.715275889262557, | |
| "reward_std": 0.9692183881998062, | |
| "rewards/cosine_scaled_reward": 0.06597128417342901, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2936.9376220703125, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.22864274680614471, | |
| "kl": 0.00200653076171875, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.2167, | |
| "num_tokens": 20494816.0, | |
| "reward": -0.09320222213864326, | |
| "reward_std": 0.5999226495623589, | |
| "rewards/cosine_scaled_reward": -0.2132677833433263, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2167.229217529297, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.27711907029151917, | |
| "kl": 0.003047943115234375, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.126, | |
| "num_tokens": 20606895.0, | |
| "reward": 0.036196669563651085, | |
| "reward_std": 0.3412875160574913, | |
| "rewards/cosine_scaled_reward": -0.2944016717374325, | |
| "rewards/format_reward": 0.625, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2172.3333435058594, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.29314175248146057, | |
| "kl": 0.004207611083984375, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.021, | |
| "num_tokens": 20719723.0, | |
| "reward": 0.9620806649327278, | |
| "reward_std": 0.7838257402181625, | |
| "rewards/cosine_scaled_reward": 0.1893736298661679, | |
| "rewards/format_reward": 0.5833333283662796, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2577.8959350585938, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.24172109365463257, | |
| "kl": 0.003192901611328125, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0886, | |
| "num_tokens": 20851364.0, | |
| "reward": 0.09658388825482689, | |
| "reward_std": 0.5904048159718513, | |
| "rewards/cosine_scaled_reward": -0.23295804858207703, | |
| "rewards/format_reward": 0.5625, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1744.1459350585938, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.28953787684440613, | |
| "kl": 0.0030364990234375, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.1287, | |
| "num_tokens": 20942583.0, | |
| "reward": 0.7739498913288116, | |
| "reward_std": 0.6133845373988152, | |
| "rewards/cosine_scaled_reward": -0.008858396206051111, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1813.291748046875, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.28211480379104614, | |
| "kl": 0.00348663330078125, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.1865, | |
| "num_tokens": 21038333.0, | |
| "reward": 0.3583908216096461, | |
| "reward_std": 0.7735366523265839, | |
| "rewards/cosine_scaled_reward": -0.1645545873325318, | |
| "rewards/format_reward": 0.6875, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2659.5209350585938, | |
| "epoch": 0.184, | |
| "grad_norm": 0.2976882755756378, | |
| "kl": 0.004169464111328125, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.1072, | |
| "num_tokens": 21174012.0, | |
| "reward": 0.19437413476407528, | |
| "reward_std": 0.6651452034711838, | |
| "rewards/cosine_scaled_reward": -0.16322960006073117, | |
| "rewards/format_reward": 0.520833320915699, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1947.6458740234375, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.2935653626918793, | |
| "kl": 0.003204345703125, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0734, | |
| "num_tokens": 21275227.0, | |
| "reward": 0.25041304528713226, | |
| "reward_std": 0.5214099213480949, | |
| "rewards/cosine_scaled_reward": -0.22896013781428337, | |
| "rewards/format_reward": 0.7083333283662796, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1663.5833740234375, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.4594109058380127, | |
| "kl": 0.0039997100830078125, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0258, | |
| "num_tokens": 21363017.0, | |
| "reward": 0.5311151891946793, | |
| "reward_std": 0.5461377911269665, | |
| "rewards/cosine_scaled_reward": -0.07819239422678947, | |
| "rewards/format_reward": 0.6875, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2301.812530517578, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.33991318941116333, | |
| "kl": 0.003398895263671875, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.1005, | |
| "num_tokens": 21480998.0, | |
| "reward": 0.026660297065973282, | |
| "reward_std": 0.3485817238688469, | |
| "rewards/cosine_scaled_reward": -0.20541985146701336, | |
| "rewards/format_reward": 0.4375, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2174.479248046875, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.3102648854255676, | |
| "kl": 0.0030345916748046875, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.1386, | |
| "num_tokens": 21593905.0, | |
| "reward": 0.4817277453839779, | |
| "reward_std": 0.9409472942352295, | |
| "rewards/cosine_scaled_reward": -0.07163612451404333, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1578.7708740234375, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.31517553329467773, | |
| "kl": 0.0026702880859375, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.1302, | |
| "num_tokens": 21677198.0, | |
| "reward": 0.5695466273464262, | |
| "reward_std": 0.2626556381583214, | |
| "rewards/cosine_scaled_reward": -0.0902266874909401, | |
| "rewards/format_reward": 0.75, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2458.0000610351562, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.2574119567871094, | |
| "kl": 0.0025386810302734375, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.1922, | |
| "num_tokens": 21803210.0, | |
| "reward": 0.16224310919642448, | |
| "reward_std": 0.544936329126358, | |
| "rewards/cosine_scaled_reward": -0.1584617868065834, | |
| "rewards/format_reward": 0.47916666232049465, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1711.7291870117188, | |
| "epoch": 0.192, | |
| "grad_norm": 0.31856468319892883, | |
| "kl": 0.00446319580078125, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": -0.0714, | |
| "num_tokens": 21892741.0, | |
| "reward": 0.4072803081944585, | |
| "reward_std": 0.6576224267482758, | |
| "rewards/cosine_scaled_reward": -0.21302651334553957, | |
| "rewards/format_reward": 0.8333333283662796, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2047.8750915527344, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.3831358850002289, | |
| "kl": 0.005199432373046875, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": -0.0838, | |
| "num_tokens": 21999667.0, | |
| "reward": 1.2296884339302778, | |
| "reward_std": 0.8351282328367233, | |
| "rewards/cosine_scaled_reward": 0.22942754812538624, | |
| "rewards/format_reward": 0.7708333358168602, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1698.7917175292969, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.25520145893096924, | |
| "kl": 0.00498199462890625, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0237, | |
| "num_tokens": 22089615.0, | |
| "reward": 0.33604998141527176, | |
| "reward_std": 0.4241115599870682, | |
| "rewards/cosine_scaled_reward": -0.20697502605617046, | |
| "rewards/format_reward": 0.75, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1190.8541870117188, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.35177505016326904, | |
| "kl": 0.0034351348876953125, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.076, | |
| "num_tokens": 22154102.0, | |
| "reward": 0.4167235270142555, | |
| "reward_std": 0.44075047969818115, | |
| "rewards/cosine_scaled_reward": -0.23955491092056036, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1618.5625305175781, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.4714674949645996, | |
| "kl": 0.01062774658203125, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.1873, | |
| "num_tokens": 22239383.0, | |
| "reward": 0.2359453495591879, | |
| "reward_std": 0.39740175753831863, | |
| "rewards/cosine_scaled_reward": -0.24661065079271793, | |
| "rewards/format_reward": 0.7291666641831398, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1911.3750457763672, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.3989448845386505, | |
| "kl": 0.003726959228515625, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.1805, | |
| "num_tokens": 22339241.0, | |
| "reward": 0.6558301709592342, | |
| "reward_std": 0.5854435563087463, | |
| "rewards/cosine_scaled_reward": -0.026251595467329025, | |
| "rewards/format_reward": 0.7083333283662796, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1451.1250610351562, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.2805970311164856, | |
| "kl": 0.005706787109375, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.2465, | |
| "num_tokens": 22417001.0, | |
| "reward": 0.3804206885397434, | |
| "reward_std": 0.45497578382492065, | |
| "rewards/cosine_scaled_reward": -0.2681229915469885, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1849.6875457763672, | |
| "epoch": 0.2, | |
| "grad_norm": 0.2999878227710724, | |
| "kl": 0.004802703857421875, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.109, | |
| "num_tokens": 22513610.0, | |
| "reward": 0.6369728110730648, | |
| "reward_std": 0.5664872080087662, | |
| "rewards/cosine_scaled_reward": -0.05651361867785454, | |
| "rewards/format_reward": 0.75, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1288.7916717529297, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.40278714895248413, | |
| "kl": 0.0055389404296875, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.2272, | |
| "num_tokens": 22583158.0, | |
| "reward": 0.6769410446286201, | |
| "reward_std": 0.5952249988913536, | |
| "rewards/cosine_scaled_reward": -0.09902950003743172, | |
| "rewards/format_reward": 0.875, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1662.1250305175781, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.28330227732658386, | |
| "kl": 0.00322723388671875, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.1889, | |
| "num_tokens": 22670926.0, | |
| "reward": 0.352513425052166, | |
| "reward_std": 0.5872365534305573, | |
| "rewards/cosine_scaled_reward": -0.2299932837486267, | |
| "rewards/format_reward": 0.8125, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1484.5208892822266, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.39071711897850037, | |
| "kl": 0.005340576171875, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.2975, | |
| "num_tokens": 22749749.0, | |
| "reward": 1.0806448608636856, | |
| "reward_std": 0.36274878680706024, | |
| "rewards/cosine_scaled_reward": 0.12365575134754181, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1618.3125610351562, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.29320603609085083, | |
| "kl": 0.003803253173828125, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.2516, | |
| "num_tokens": 22835786.0, | |
| "reward": 0.607761038467288, | |
| "reward_std": 0.7367214486002922, | |
| "rewards/cosine_scaled_reward": -0.11278617009520531, | |
| "rewards/format_reward": 0.8333333283662796, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1773.6250610351562, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.2515128254890442, | |
| "kl": 0.003505706787109375, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0654, | |
| "num_tokens": 22928852.0, | |
| "reward": 0.419520135037601, | |
| "reward_std": 0.6586542651057243, | |
| "rewards/cosine_scaled_reward": -0.1964899403974414, | |
| "rewards/format_reward": 0.8125, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1947.354248046875, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.25565311312675476, | |
| "kl": 0.0033130645751953125, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0849, | |
| "num_tokens": 23030425.0, | |
| "reward": 0.3168973168358207, | |
| "reward_std": 0.6736587360501289, | |
| "rewards/cosine_scaled_reward": -0.20613467320799828, | |
| "rewards/format_reward": 0.7291666567325592, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1263.9375457763672, | |
| "epoch": 0.208, | |
| "grad_norm": 0.33973386883735657, | |
| "kl": 0.00634002685546875, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0838, | |
| "num_tokens": 23098498.0, | |
| "reward": 0.6266386806964874, | |
| "reward_std": 0.1899520792067051, | |
| "rewards/cosine_scaled_reward": -0.12418065406382084, | |
| "rewards/format_reward": 0.875, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2050.9584350585938, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.382978230714798, | |
| "kl": 0.004489898681640625, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.2785, | |
| "num_tokens": 23204696.0, | |
| "reward": 0.052440449595451355, | |
| "reward_std": 0.3785889223217964, | |
| "rewards/cosine_scaled_reward": -0.3175297752022743, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1581.916748046875, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.29861006140708923, | |
| "kl": 0.0050201416015625, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0973, | |
| "num_tokens": 23288308.0, | |
| "reward": 1.0160801857709885, | |
| "reward_std": 0.9725418835878372, | |
| "rewards/cosine_scaled_reward": 0.07054009102284908, | |
| "rewards/format_reward": 0.875, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1973.0625610351562, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.2891089916229248, | |
| "kl": 0.005382537841796875, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.1131, | |
| "num_tokens": 23390791.0, | |
| "reward": 0.47157616540789604, | |
| "reward_std": 0.8375265002250671, | |
| "rewards/cosine_scaled_reward": -0.12879525747848675, | |
| "rewards/format_reward": 0.7291666641831398, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1952.8959197998047, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.2916695475578308, | |
| "kl": 0.004917144775390625, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.122, | |
| "num_tokens": 23492504.0, | |
| "reward": 0.6901115328073502, | |
| "reward_std": 0.6558038257062435, | |
| "rewards/cosine_scaled_reward": -0.019527582451701164, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1829.1041870117188, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.26562783122062683, | |
| "kl": 0.005084991455078125, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.1011, | |
| "num_tokens": 23589427.0, | |
| "reward": 1.0519462451338768, | |
| "reward_std": 0.8125685751438141, | |
| "rewards/cosine_scaled_reward": 0.09888977278023958, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1543.3541870117188, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.28853264451026917, | |
| "kl": 0.0056972503662109375, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.1731, | |
| "num_tokens": 23671032.0, | |
| "reward": 0.8557066395878792, | |
| "reward_std": 0.9914733618497849, | |
| "rewards/cosine_scaled_reward": 0.032019972801208496, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1163.1042022705078, | |
| "epoch": 0.216, | |
| "grad_norm": 0.3592197000980377, | |
| "kl": 0.006622314453125, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.1053, | |
| "num_tokens": 23734703.0, | |
| "reward": 0.6029985174536705, | |
| "reward_std": 0.4228066951036453, | |
| "rewards/cosine_scaled_reward": -0.16725075244903564, | |
| "rewards/format_reward": 0.9375, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1680.9584197998047, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.2913980782032013, | |
| "kl": 0.00589752197265625, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.1223, | |
| "num_tokens": 23823057.0, | |
| "reward": 0.45007142052054405, | |
| "reward_std": 0.5233389809727669, | |
| "rewards/cosine_scaled_reward": -0.191630975343287, | |
| "rewards/format_reward": 0.8333333283662796, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1796.8333740234375, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.29226669669151306, | |
| "kl": 0.00363922119140625, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.1848, | |
| "num_tokens": 23917189.0, | |
| "reward": 0.3450898602604866, | |
| "reward_std": 0.7155111283063889, | |
| "rewards/cosine_scaled_reward": -0.202455073595047, | |
| "rewards/format_reward": 0.7499999962747097, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2041.4583740234375, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.38081520795822144, | |
| "kl": 0.006256103515625, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0669, | |
| "num_tokens": 24022817.0, | |
| "reward": 0.24144138023257256, | |
| "reward_std": 0.6051021218299866, | |
| "rewards/cosine_scaled_reward": -0.17094597034156322, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1747.8125305175781, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.41225576400756836, | |
| "kl": 0.00653076171875, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0997, | |
| "num_tokens": 24114524.0, | |
| "reward": 0.5901122093200684, | |
| "reward_std": 0.49861879646778107, | |
| "rewards/cosine_scaled_reward": -0.07994389347732067, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1069.3958587646484, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.36986562609672546, | |
| "kl": 0.0089263916015625, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.1094, | |
| "num_tokens": 24173427.0, | |
| "reward": 1.4875798523426056, | |
| "reward_std": 0.6242154352366924, | |
| "rewards/cosine_scaled_reward": 0.28545656986534595, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1081.3750305175781, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.3711891770362854, | |
| "kl": 0.00726318359375, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.2086, | |
| "num_tokens": 24233181.0, | |
| "reward": 1.04611661657691, | |
| "reward_std": 0.5401172246783972, | |
| "rewards/cosine_scaled_reward": 0.05430831015110016, | |
| "rewards/format_reward": 0.9375, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1984.6459045410156, | |
| "epoch": 0.224, | |
| "grad_norm": 0.3469684422016144, | |
| "kl": 0.0052490234375, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.1803, | |
| "num_tokens": 24336388.0, | |
| "reward": 0.38754457980394363, | |
| "reward_std": 0.5967799983918667, | |
| "rewards/cosine_scaled_reward": -0.13956104964017868, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1723.8334045410156, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.3208644986152649, | |
| "kl": 0.005466461181640625, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.1018, | |
| "num_tokens": 24427034.0, | |
| "reward": 0.13043999671936035, | |
| "reward_std": 0.36407894641160965, | |
| "rewards/cosine_scaled_reward": -0.2785299941897392, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2133.6875610351562, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.26503321528434753, | |
| "kl": 0.00514984130859375, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0816, | |
| "num_tokens": 24537677.0, | |
| "reward": 0.20322639122605324, | |
| "reward_std": 0.41373175010085106, | |
| "rewards/cosine_scaled_reward": -0.22130347788333893, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1842.7500915527344, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.3147067427635193, | |
| "kl": 0.00490570068359375, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.116, | |
| "num_tokens": 24634271.0, | |
| "reward": 0.8186136335134506, | |
| "reward_std": 0.7792095839977264, | |
| "rewards/cosine_scaled_reward": 0.013473461382091045, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1465.3750305175781, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.4409931004047394, | |
| "kl": 0.00621795654296875, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.2345, | |
| "num_tokens": 24712667.0, | |
| "reward": 0.9040451645851135, | |
| "reward_std": 0.6532056555151939, | |
| "rewards/cosine_scaled_reward": 0.06660587899386883, | |
| "rewards/format_reward": 0.7708333358168602, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1399.9791870117188, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 4.2830705642700195, | |
| "kl": 0.08713531494140625, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.1038, | |
| "num_tokens": 24788044.0, | |
| "reward": 0.5020628832280636, | |
| "reward_std": 0.5367235317826271, | |
| "rewards/cosine_scaled_reward": -0.1552185484324582, | |
| "rewards/format_reward": 0.8125, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1097.625015258789, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.4227047562599182, | |
| "kl": 0.0122528076171875, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.1053, | |
| "num_tokens": 24848728.0, | |
| "reward": 0.8473174124956131, | |
| "reward_std": 0.7071373090147972, | |
| "rewards/cosine_scaled_reward": -0.02425796026363969, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1515.2292175292969, | |
| "epoch": 0.232, | |
| "grad_norm": 0.41127878427505493, | |
| "kl": 0.01018524169921875, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.2108, | |
| "num_tokens": 24929409.0, | |
| "reward": 0.7710594609379768, | |
| "reward_std": 0.7223998121917248, | |
| "rewards/cosine_scaled_reward": 0.010529719293117523, | |
| "rewards/format_reward": 0.7499999925494194, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.0417175292969, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.44631218910217285, | |
| "kl": 0.009815216064453125, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.225, | |
| "num_tokens": 24986189.0, | |
| "reward": 1.2192229256033897, | |
| "reward_std": 0.7845951393246651, | |
| "rewards/cosine_scaled_reward": 0.16169476276263595, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1646.1875305175781, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.2805160880088806, | |
| "kl": 0.00618743896484375, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.1993, | |
| "num_tokens": 25073390.0, | |
| "reward": 0.5499276574701071, | |
| "reward_std": 0.8229625821113586, | |
| "rewards/cosine_scaled_reward": -0.14170285500586033, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1825.0209045410156, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.2909686267375946, | |
| "kl": 0.00579071044921875, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.04, | |
| "num_tokens": 25169739.0, | |
| "reward": 0.30743252485990524, | |
| "reward_std": 0.7692845687270164, | |
| "rewards/cosine_scaled_reward": -0.2212837437982671, | |
| "rewards/format_reward": 0.75, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1912.0000915527344, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.27535000443458557, | |
| "kl": 0.004474639892578125, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0423, | |
| "num_tokens": 25269513.0, | |
| "reward": 0.8390000090003014, | |
| "reward_std": 0.8653023391962051, | |
| "rewards/cosine_scaled_reward": 0.02366666356101632, | |
| "rewards/format_reward": 0.7916666567325592, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1695.3333892822266, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.31623345613479614, | |
| "kl": 0.00894927978515625, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0569, | |
| "num_tokens": 25358791.0, | |
| "reward": 0.2730532819405198, | |
| "reward_std": 0.2860656566917896, | |
| "rewards/cosine_scaled_reward": -0.23847334645688534, | |
| "rewards/format_reward": 0.7499999925494194, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 900.3750305175781, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.42924708127975464, | |
| "kl": 0.00959014892578125, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.132, | |
| "num_tokens": 25409989.0, | |
| "reward": 0.6640197485685349, | |
| "reward_std": 0.662564605474472, | |
| "rewards/cosine_scaled_reward": -0.11590681597590446, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1257.7708740234375, | |
| "epoch": 0.24, | |
| "grad_norm": 0.6309426426887512, | |
| "kl": 0.0152740478515625, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.2302, | |
| "num_tokens": 25478378.0, | |
| "reward": 0.9711966030299664, | |
| "reward_std": 0.5971670504659414, | |
| "rewards/cosine_scaled_reward": 0.0585149209946394, | |
| "rewards/format_reward": 0.8541666567325592, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2222.7084350585938, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.26984068751335144, | |
| "kl": 0.0057525634765625, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.1591, | |
| "num_tokens": 25593546.0, | |
| "reward": 0.15845571644604206, | |
| "reward_std": 0.6340156942605972, | |
| "rewards/cosine_scaled_reward": -0.2124387975782156, | |
| "rewards/format_reward": 0.5833333283662796, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1195.187515258789, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.37575727701187134, | |
| "kl": 0.0077056884765625, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.1568, | |
| "num_tokens": 25658679.0, | |
| "reward": 0.7064365767873824, | |
| "reward_std": 0.46855687722563744, | |
| "rewards/cosine_scaled_reward": -0.07386504206806421, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2643.291748046875, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.26660388708114624, | |
| "kl": 0.0048313140869140625, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.064, | |
| "num_tokens": 25793729.0, | |
| "reward": -0.02674592100083828, | |
| "reward_std": 0.3547811508178711, | |
| "rewards/cosine_scaled_reward": -0.2112896330654621, | |
| "rewards/format_reward": 0.3958333320915699, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1974.1459350585938, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.275295615196228, | |
| "kl": 0.004917144775390625, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.077, | |
| "num_tokens": 25896684.0, | |
| "reward": 0.5545921013690531, | |
| "reward_std": 0.7209105789661407, | |
| "rewards/cosine_scaled_reward": -0.08728731423616409, | |
| "rewards/format_reward": 0.7291666567325592, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1460.9792175292969, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.3583175539970398, | |
| "kl": 0.00732421875, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0764, | |
| "num_tokens": 25974851.0, | |
| "reward": 0.6321092396974564, | |
| "reward_std": 0.7664843499660492, | |
| "rewards/cosine_scaled_reward": -0.12144538667052984, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1925.8958435058594, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.27199554443359375, | |
| "kl": 0.00620269775390625, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.1422, | |
| "num_tokens": 26075400.0, | |
| "reward": 0.2661210894584656, | |
| "reward_std": 0.5832830742001534, | |
| "rewards/cosine_scaled_reward": -0.22110612504184246, | |
| "rewards/format_reward": 0.708333320915699, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1680.729248046875, | |
| "epoch": 0.248, | |
| "grad_norm": 0.36763909459114075, | |
| "kl": 0.00833892822265625, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0663, | |
| "num_tokens": 26163863.0, | |
| "reward": 0.5651724338531494, | |
| "reward_std": 0.47575872763991356, | |
| "rewards/cosine_scaled_reward": -0.10283046402037144, | |
| "rewards/format_reward": 0.7708333283662796, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1642.354248046875, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.3406772017478943, | |
| "kl": 0.006591796875, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0695, | |
| "num_tokens": 26251186.0, | |
| "reward": 0.955289987847209, | |
| "reward_std": 0.7435199301689863, | |
| "rewards/cosine_scaled_reward": 0.08181164413690567, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 980.8542175292969, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.378128319978714, | |
| "kl": 0.00731658935546875, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.2817, | |
| "num_tokens": 26305803.0, | |
| "reward": 0.768341101706028, | |
| "reward_std": 0.45224541425704956, | |
| "rewards/cosine_scaled_reward": -0.0949961468577385, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.9166717529297, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.3980640172958374, | |
| "kl": 0.0093231201171875, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.2144, | |
| "num_tokens": 26362757.0, | |
| "reward": 0.5230312906205654, | |
| "reward_std": 0.6670053526759148, | |
| "rewards/cosine_scaled_reward": -0.1968176942318678, | |
| "rewards/format_reward": 0.9166666567325592, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1961.1459350585938, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.3665303885936737, | |
| "kl": 0.00670623779296875, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.2889, | |
| "num_tokens": 26465760.0, | |
| "reward": 0.3678629584610462, | |
| "reward_std": 0.5251259803771973, | |
| "rewards/cosine_scaled_reward": -0.11815185844898224, | |
| "rewards/format_reward": 0.6041666641831398, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2460.3334350585938, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.24931471049785614, | |
| "kl": 0.00376129150390625, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.257, | |
| "num_tokens": 26592070.0, | |
| "reward": -0.09669354045763612, | |
| "reward_std": 0.49049024283885956, | |
| "rewards/cosine_scaled_reward": -0.30876342952251434, | |
| "rewards/format_reward": 0.5208333283662796, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1533.3333892822266, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.36577993631362915, | |
| "kl": 0.0063323974609375, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.1167, | |
| "num_tokens": 26673644.0, | |
| "reward": 0.7940644323825836, | |
| "reward_std": 0.6795159354805946, | |
| "rewards/cosine_scaled_reward": -0.05088444147258997, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2049.979248046875, | |
| "epoch": 0.256, | |
| "grad_norm": 0.28553715348243713, | |
| "kl": 0.00672149658203125, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0225, | |
| "num_tokens": 26779861.0, | |
| "reward": 0.3970666974782944, | |
| "reward_std": 0.25915637239813805, | |
| "rewards/cosine_scaled_reward": -0.10354999452829361, | |
| "rewards/format_reward": 0.6041666641831398, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1318.2708740234375, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.35073599219322205, | |
| "kl": 0.00748443603515625, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.1601, | |
| "num_tokens": 26851604.0, | |
| "reward": 1.1711989641189575, | |
| "reward_std": 0.7675672024488449, | |
| "rewards/cosine_scaled_reward": 0.16893283650279045, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1755.7083435058594, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.448542058467865, | |
| "kl": 0.009063720703125, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.1154, | |
| "num_tokens": 26943744.0, | |
| "reward": 0.5059840455651283, | |
| "reward_std": 0.397355318069458, | |
| "rewards/cosine_scaled_reward": -0.10117465630173683, | |
| "rewards/format_reward": 0.708333333954215, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1148.0416870117188, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.40192481875419617, | |
| "kl": 0.01007080078125, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.1993, | |
| "num_tokens": 27006674.0, | |
| "reward": 0.7949583828449249, | |
| "reward_std": 0.5842397212982178, | |
| "rewards/cosine_scaled_reward": -0.07127081975340843, | |
| "rewards/format_reward": 0.9375, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1674.8125305175781, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.3176382780075073, | |
| "kl": 0.00914764404296875, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.171, | |
| "num_tokens": 27095561.0, | |
| "reward": 0.23410499235615134, | |
| "reward_std": 0.3607608489692211, | |
| "rewards/cosine_scaled_reward": -0.24753085523843765, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1639.3333740234375, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.35383015871047974, | |
| "kl": 0.01419830322265625, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.1877, | |
| "num_tokens": 27181737.0, | |
| "reward": 0.4472636952996254, | |
| "reward_std": 0.5543514788150787, | |
| "rewards/cosine_scaled_reward": -0.13053483422845602, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1613.6042022705078, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.3959270119667053, | |
| "kl": 0.007358551025390625, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.1748, | |
| "num_tokens": 27267662.0, | |
| "reward": 0.7049806490540504, | |
| "reward_std": 0.637917771935463, | |
| "rewards/cosine_scaled_reward": -0.04334301874041557, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2350.979217529297, | |
| "epoch": 0.264, | |
| "grad_norm": 0.29473647475242615, | |
| "kl": 0.006885528564453125, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.1554, | |
| "num_tokens": 27388519.0, | |
| "reward": 0.6861059963703156, | |
| "reward_std": 0.5922371260821819, | |
| "rewards/cosine_scaled_reward": 0.009719666093587875, | |
| "rewards/format_reward": 0.6666666641831398, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1402.4167022705078, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.354436457157135, | |
| "kl": 0.01007080078125, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.1694, | |
| "num_tokens": 27464409.0, | |
| "reward": 0.5453539118170738, | |
| "reward_std": 0.7636468224227428, | |
| "rewards/cosine_scaled_reward": -0.13357303908560425, | |
| "rewards/format_reward": 0.8125, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1941.7291870117188, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.36628639698028564, | |
| "kl": 0.0104217529296875, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.132, | |
| "num_tokens": 27565634.0, | |
| "reward": 0.8026084899902344, | |
| "reward_std": 0.9171203821897507, | |
| "rewards/cosine_scaled_reward": 0.015887574292719364, | |
| "rewards/format_reward": 0.7708333283662796, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1735.4583740234375, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.2988748848438263, | |
| "kl": 0.00730133056640625, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.2224, | |
| "num_tokens": 27657318.0, | |
| "reward": 0.6515710987150669, | |
| "reward_std": 0.6439172849059105, | |
| "rewards/cosine_scaled_reward": -0.07004780881106853, | |
| "rewards/format_reward": 0.7916666567325592, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1488.208366394043, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.3993307948112488, | |
| "kl": 0.01326751708984375, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.1033, | |
| "num_tokens": 27735910.0, | |
| "reward": 0.8183648958802223, | |
| "reward_std": 0.5113412290811539, | |
| "rewards/cosine_scaled_reward": 0.03418244048953056, | |
| "rewards/format_reward": 0.75, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1141.8333587646484, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.3268232047557831, | |
| "kl": 0.00972747802734375, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.209, | |
| "num_tokens": 27798722.0, | |
| "reward": 0.45677755028009415, | |
| "reward_std": 0.5210263505578041, | |
| "rewards/cosine_scaled_reward": -0.22994457185268402, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1664.7500305175781, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.33243250846862793, | |
| "kl": 0.0086517333984375, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.078, | |
| "num_tokens": 27887030.0, | |
| "reward": 0.7787259165197611, | |
| "reward_std": 0.5983692929148674, | |
| "rewards/cosine_scaled_reward": -0.01688704453408718, | |
| "rewards/format_reward": 0.8125, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.4167175292969, | |
| "epoch": 0.272, | |
| "grad_norm": 0.4665866792201996, | |
| "kl": 0.0143890380859375, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.1004, | |
| "num_tokens": 27944128.0, | |
| "reward": 0.5497091813012958, | |
| "reward_std": 0.5992827862501144, | |
| "rewards/cosine_scaled_reward": -0.18347875276231207, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1973.1458740234375, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.25983497500419617, | |
| "kl": 0.007293701171875, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.1291, | |
| "num_tokens": 28047701.0, | |
| "reward": 0.27186793461441994, | |
| "reward_std": 0.6742007434368134, | |
| "rewards/cosine_scaled_reward": -0.21823271550238132, | |
| "rewards/format_reward": 0.708333320915699, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1144.4166870117188, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.3183096945285797, | |
| "kl": 0.0117340087890625, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.1353, | |
| "num_tokens": 28110835.0, | |
| "reward": 1.299223653972149, | |
| "reward_std": 0.5905436016619205, | |
| "rewards/cosine_scaled_reward": 0.17044511064887047, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1381.416732788086, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.4551909565925598, | |
| "kl": 0.014190673828125, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0289, | |
| "num_tokens": 28185639.0, | |
| "reward": 0.633112620562315, | |
| "reward_std": 0.5124102905392647, | |
| "rewards/cosine_scaled_reward": -0.08969368692487478, | |
| "rewards/format_reward": 0.8125, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1745.875015258789, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.46845531463623047, | |
| "kl": 0.00977325439453125, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.354, | |
| "num_tokens": 28277157.0, | |
| "reward": 0.6752857603132725, | |
| "reward_std": 0.7641724795103073, | |
| "rewards/cosine_scaled_reward": -0.047773787286132574, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1423.1875610351562, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.3185717463493347, | |
| "kl": 0.0092926025390625, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.2263, | |
| "num_tokens": 28352916.0, | |
| "reward": 0.8118701353669167, | |
| "reward_std": 0.5183029696345329, | |
| "rewards/cosine_scaled_reward": -0.01073160395026207, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1071.0417175292969, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 4.240310192108154, | |
| "kl": 0.1209869384765625, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.2218, | |
| "num_tokens": 28412534.0, | |
| "reward": 1.1517661362886429, | |
| "reward_std": 0.9977184236049652, | |
| "rewards/cosine_scaled_reward": 0.12796638230793178, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1106.6042022705078, | |
| "epoch": 0.28, | |
| "grad_norm": 0.5300337076187134, | |
| "kl": 0.0133056640625, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.1105, | |
| "num_tokens": 28472587.0, | |
| "reward": 0.7246537022292614, | |
| "reward_std": 0.5131946355104446, | |
| "rewards/cosine_scaled_reward": -0.08558984659612179, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1154.0833740234375, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.45345908403396606, | |
| "kl": 0.0103302001953125, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.183, | |
| "num_tokens": 28536143.0, | |
| "reward": 0.912123791873455, | |
| "reward_std": 0.47968992590904236, | |
| "rewards/cosine_scaled_reward": -0.023104790598154068, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 768.0625228881836, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.527489423751831, | |
| "kl": 0.018798828125, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0279, | |
| "num_tokens": 28580654.0, | |
| "reward": 1.2265305668115616, | |
| "reward_std": 0.6787078976631165, | |
| "rewards/cosine_scaled_reward": 0.1340985968708992, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1808.3333358764648, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.4756445288658142, | |
| "kl": 0.011627197265625, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.1571, | |
| "num_tokens": 28675230.0, | |
| "reward": 0.4981997571885586, | |
| "reward_std": 0.6461528465151787, | |
| "rewards/cosine_scaled_reward": -0.08423344511538744, | |
| "rewards/format_reward": 0.6666666828095913, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1773.5000762939453, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.31437787413597107, | |
| "kl": 0.007415771484375, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.1652, | |
| "num_tokens": 28769256.0, | |
| "reward": 0.6388058252632618, | |
| "reward_std": 0.7867574747651815, | |
| "rewards/cosine_scaled_reward": -0.03476376552134752, | |
| "rewards/format_reward": 0.708333333954215, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 931.2083587646484, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.41349363327026367, | |
| "kl": 0.0152130126953125, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0753, | |
| "num_tokens": 28821742.0, | |
| "reward": 0.7699981704354286, | |
| "reward_std": 0.718483492732048, | |
| "rewards/cosine_scaled_reward": -0.09416760504245758, | |
| "rewards/format_reward": 0.9583333283662796, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2199.6876220703125, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.2921055555343628, | |
| "kl": 0.0113372802734375, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0523, | |
| "num_tokens": 28935739.0, | |
| "reward": 0.542440053075552, | |
| "reward_std": 0.9616198837757111, | |
| "rewards/cosine_scaled_reward": -0.041279987432062626, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1409.8542022705078, | |
| "epoch": 0.288, | |
| "grad_norm": 0.34936532378196716, | |
| "kl": 0.011505126953125, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.123, | |
| "num_tokens": 29011548.0, | |
| "reward": 0.8992012739181519, | |
| "reward_std": 0.5171293690800667, | |
| "rewards/cosine_scaled_reward": -0.00873270258307457, | |
| "rewards/format_reward": 0.9166666567325592, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1621.0417022705078, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.3845634460449219, | |
| "kl": 0.0136260986328125, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.2043, | |
| "num_tokens": 29097182.0, | |
| "reward": 0.47796724177896976, | |
| "reward_std": 0.4307953789830208, | |
| "rewards/cosine_scaled_reward": -0.11518305912613869, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1132.7917175292969, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.3748434782028198, | |
| "kl": 0.0124969482421875, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.1782, | |
| "num_tokens": 29158930.0, | |
| "reward": 1.4391566216945648, | |
| "reward_std": 0.6073151230812073, | |
| "rewards/cosine_scaled_reward": 0.2924949713051319, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1166.7500610351562, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.3888883888721466, | |
| "kl": 0.013397216796875, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.1089, | |
| "num_tokens": 29223376.0, | |
| "reward": 0.8761316128075123, | |
| "reward_std": 0.4091811142861843, | |
| "rewards/cosine_scaled_reward": 0.010982461273670197, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 863.0833587646484, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 0.37044182419776917, | |
| "kl": 0.0126190185546875, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0904, | |
| "num_tokens": 29273426.0, | |
| "reward": 0.9074563533067703, | |
| "reward_std": 0.49417800083756447, | |
| "rewards/cosine_scaled_reward": -0.03585517778992653, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1247.6875305175781, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.2904569208621979, | |
| "kl": 0.011871337890625, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.2652, | |
| "num_tokens": 29341499.0, | |
| "reward": 0.43778856843709946, | |
| "reward_std": 0.5745215713977814, | |
| "rewards/cosine_scaled_reward": -0.22902238368988037, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1685.5833435058594, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 0.366291880607605, | |
| "kl": 0.013092041015625, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.3003, | |
| "num_tokens": 29430639.0, | |
| "reward": 0.5727702639997005, | |
| "reward_std": 0.6911112070083618, | |
| "rewards/cosine_scaled_reward": -0.06778154894709587, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1004.8750457763672, | |
| "epoch": 0.296, | |
| "grad_norm": 0.4196613132953644, | |
| "kl": 0.012115478515625, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.1882, | |
| "num_tokens": 29486475.0, | |
| "reward": 0.6432907655835152, | |
| "reward_std": 0.41963067930191755, | |
| "rewards/cosine_scaled_reward": -0.12627129815518856, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1489.8333740234375, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.34775781631469727, | |
| "kl": 0.0118255615234375, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.1441, | |
| "num_tokens": 29566183.0, | |
| "reward": 0.4704531617462635, | |
| "reward_std": 0.6600965559482574, | |
| "rewards/cosine_scaled_reward": -0.191856749355793, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1136.9792022705078, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.34792360663414, | |
| "kl": 0.01123046875, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.2938, | |
| "num_tokens": 29628600.0, | |
| "reward": 0.5166485756635666, | |
| "reward_std": 0.40996433794498444, | |
| "rewards/cosine_scaled_reward": -0.20000904146581888, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1239.2917175292969, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.3884727358818054, | |
| "kl": 0.017578125, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0502, | |
| "num_tokens": 29696036.0, | |
| "reward": 0.5863418951630592, | |
| "reward_std": 0.3798971250653267, | |
| "rewards/cosine_scaled_reward": -0.14432908222079277, | |
| "rewards/format_reward": 0.875, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 709.4375152587891, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 0.4851728677749634, | |
| "kl": 0.0172271728515625, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.173, | |
| "num_tokens": 29738195.0, | |
| "reward": 0.806241512298584, | |
| "reward_std": 0.5233523100614548, | |
| "rewards/cosine_scaled_reward": -0.08646258153021336, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1195.7916870117188, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.3379260301589966, | |
| "kl": 0.00963592529296875, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.2216, | |
| "num_tokens": 29803975.0, | |
| "reward": 0.7059077769517899, | |
| "reward_std": 0.5995725318789482, | |
| "rewards/cosine_scaled_reward": -0.1262128073722124, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1640.9583740234375, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 0.3161636292934418, | |
| "kl": 0.01184844970703125, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.2132, | |
| "num_tokens": 29890727.0, | |
| "reward": 0.43143167346715927, | |
| "reward_std": 0.5859523415565491, | |
| "rewards/cosine_scaled_reward": -0.12803417071700096, | |
| "rewards/format_reward": 0.6875, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1334.6666870117188, | |
| "epoch": 0.304, | |
| "grad_norm": 3.1876046657562256, | |
| "kl": 0.0460357666015625, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.2019, | |
| "num_tokens": 29962909.0, | |
| "reward": 0.8172680884599686, | |
| "reward_std": 0.3814220707863569, | |
| "rewards/cosine_scaled_reward": -0.008032636949792504, | |
| "rewards/format_reward": 0.8333333283662796, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1512.7500305175781, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.33352527022361755, | |
| "kl": 0.013641357421875, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.1679, | |
| "num_tokens": 30043741.0, | |
| "reward": 0.7953452169895172, | |
| "reward_std": 0.5917237177491188, | |
| "rewards/cosine_scaled_reward": 0.0018392521888017654, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1813.7709045410156, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.3294363021850586, | |
| "kl": 0.0094451904296875, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.2892, | |
| "num_tokens": 30138998.0, | |
| "reward": 0.4046551361680031, | |
| "reward_std": 0.4766751229763031, | |
| "rewards/cosine_scaled_reward": -0.16225576400756836, | |
| "rewards/format_reward": 0.7291666641831398, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 905.3750152587891, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.43302249908447266, | |
| "kl": 0.01812744140625, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.3599, | |
| "num_tokens": 30190574.0, | |
| "reward": 0.7200521975755692, | |
| "reward_std": 0.552587129175663, | |
| "rewards/cosine_scaled_reward": -0.10872390307486057, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 885.4583435058594, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.470708429813385, | |
| "kl": 0.0196990966796875, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0815, | |
| "num_tokens": 30240552.0, | |
| "reward": 1.292167842388153, | |
| "reward_std": 0.6577414199709892, | |
| "rewards/cosine_scaled_reward": 0.15650059608742595, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1603.4792175292969, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 0.3854743242263794, | |
| "kl": 0.014801025390625, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.087, | |
| "num_tokens": 30325325.0, | |
| "reward": 0.9050909653306007, | |
| "reward_std": 0.64243184030056, | |
| "rewards/cosine_scaled_reward": 0.04629545658826828, | |
| "rewards/format_reward": 0.8125, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1576.1458740234375, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.391513854265213, | |
| "kl": 0.01590728759765625, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.1901, | |
| "num_tokens": 30409218.0, | |
| "reward": 0.4028028752654791, | |
| "reward_std": 0.659964844584465, | |
| "rewards/cosine_scaled_reward": -0.20484856329858303, | |
| "rewards/format_reward": 0.8125, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1670.6875610351562, | |
| "epoch": 0.312, | |
| "grad_norm": 0.6100649833679199, | |
| "kl": 0.013885498046875, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.2061, | |
| "num_tokens": 30497661.0, | |
| "reward": 0.19328145054169, | |
| "reward_std": 0.6281792521476746, | |
| "rewards/cosine_scaled_reward": -0.2679426074028015, | |
| "rewards/format_reward": 0.7291666567325592, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1347.3333435058594, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 0.38863444328308105, | |
| "kl": 0.0164031982421875, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0935, | |
| "num_tokens": 30570181.0, | |
| "reward": 0.7795020919293165, | |
| "reward_std": 0.7149695008993149, | |
| "rewards/cosine_scaled_reward": -0.026915639638900757, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2051.1875610351562, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.4029337763786316, | |
| "kl": 0.015407562255859375, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.2663, | |
| "num_tokens": 30677122.0, | |
| "reward": 0.3835557587444782, | |
| "reward_std": 0.8576947599649429, | |
| "rewards/cosine_scaled_reward": -0.14155545644462109, | |
| "rewards/format_reward": 0.6666666641831398, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 747.5625305175781, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.8817548751831055, | |
| "kl": 0.039520263671875, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.203, | |
| "num_tokens": 30720433.0, | |
| "reward": 1.04302117228508, | |
| "reward_std": 0.6926102489233017, | |
| "rewards/cosine_scaled_reward": 0.031927239149808884, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1000.7083740234375, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 0.44100356101989746, | |
| "kl": 0.020416259765625, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.4464, | |
| "num_tokens": 30776447.0, | |
| "reward": 0.8042115196585655, | |
| "reward_std": 0.6335554867982864, | |
| "rewards/cosine_scaled_reward": -0.03539424831978977, | |
| "rewards/format_reward": 0.875, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1254.1250610351562, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.4421320855617523, | |
| "kl": 0.02032470703125, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0181, | |
| "num_tokens": 30844427.0, | |
| "reward": 0.9023200869560242, | |
| "reward_std": 0.6986501328647137, | |
| "rewards/cosine_scaled_reward": 0.024076687172055244, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1248.6458435058594, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.43398356437683105, | |
| "kl": 0.023956298828125, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.1313, | |
| "num_tokens": 30912714.0, | |
| "reward": 0.7318704128265381, | |
| "reward_std": 0.6384659558534622, | |
| "rewards/cosine_scaled_reward": -0.08198146149516106, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1238.8333740234375, | |
| "epoch": 0.32, | |
| "grad_norm": 0.34043627977371216, | |
| "kl": 0.0171966552734375, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.1864, | |
| "num_tokens": 30979336.0, | |
| "reward": 0.9366099536418915, | |
| "reward_std": 0.5328906774520874, | |
| "rewards/cosine_scaled_reward": -0.0004450604319572449, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1022.8125457763672, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.3274538218975067, | |
| "kl": 0.021148681640625, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.2272, | |
| "num_tokens": 31037209.0, | |
| "reward": 1.0622368827462196, | |
| "reward_std": 0.5683299601078033, | |
| "rewards/cosine_scaled_reward": 0.07278510555624962, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 856.2708435058594, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.3970337212085724, | |
| "kl": 0.0233154296875, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.3198, | |
| "num_tokens": 31086464.0, | |
| "reward": 1.0004925429821014, | |
| "reward_std": 0.6593362241983414, | |
| "rewards/cosine_scaled_reward": 0.01066293753683567, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1168.979232788086, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 0.7218369841575623, | |
| "kl": 0.037445068359375, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.2112, | |
| "num_tokens": 31150315.0, | |
| "reward": 0.5269430354237556, | |
| "reward_std": 0.39331691712141037, | |
| "rewards/cosine_scaled_reward": -0.14277848601341248, | |
| "rewards/format_reward": 0.8124999850988388, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1338.3750305175781, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.3623667359352112, | |
| "kl": 0.01910400390625, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.2846, | |
| "num_tokens": 31222393.0, | |
| "reward": 0.6486207991838455, | |
| "reward_std": 0.5024735182523727, | |
| "rewards/cosine_scaled_reward": -0.1131896236911416, | |
| "rewards/format_reward": 0.875, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1468.0833740234375, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.31146690249443054, | |
| "kl": 0.0211334228515625, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.3163, | |
| "num_tokens": 31300721.0, | |
| "reward": 0.5346148237586021, | |
| "reward_std": 0.5083964094519615, | |
| "rewards/cosine_scaled_reward": -0.1597759248688817, | |
| "rewards/format_reward": 0.8541666567325592, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 920.1250305175781, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 0.3465576767921448, | |
| "kl": 0.02032470703125, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.1612, | |
| "num_tokens": 31352723.0, | |
| "reward": 1.0635754466056824, | |
| "reward_std": 0.8085414916276932, | |
| "rewards/cosine_scaled_reward": 0.04220437444746494, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 915.2500152587891, | |
| "epoch": 0.328, | |
| "grad_norm": 0.4792618751525879, | |
| "kl": 0.031646728515625, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.3252, | |
| "num_tokens": 31404635.0, | |
| "reward": 0.9221690893173218, | |
| "reward_std": 0.3884202316403389, | |
| "rewards/cosine_scaled_reward": 0.002751174382865429, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 839.7291946411133, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 1.0952292680740356, | |
| "kl": 0.05023193359375, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.1517, | |
| "num_tokens": 31452514.0, | |
| "reward": 0.9844385534524918, | |
| "reward_std": 0.5249290615320206, | |
| "rewards/cosine_scaled_reward": 0.002635940443724394, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.3125305175781, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 0.4906879961490631, | |
| "kl": 0.023193359375, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.2844, | |
| "num_tokens": 31509289.0, | |
| "reward": 0.7608919851481915, | |
| "reward_std": 0.28182800114154816, | |
| "rewards/cosine_scaled_reward": -0.08830402046442032, | |
| "rewards/format_reward": 0.9375, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1111.2500305175781, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 0.490951806306839, | |
| "kl": 0.023193359375, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0739, | |
| "num_tokens": 31570423.0, | |
| "reward": 1.42685866355896, | |
| "reward_std": 0.8060109168291092, | |
| "rewards/cosine_scaled_reward": 0.2655126517638564, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 605.0625228881836, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.6173872351646423, | |
| "kl": 0.031097412109375, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.2537, | |
| "num_tokens": 31607386.0, | |
| "reward": 0.8903323113918304, | |
| "reward_std": 0.38973837345838547, | |
| "rewards/cosine_scaled_reward": -0.05483385222032666, | |
| "rewards/format_reward": 1.0, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1337.9167175292969, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 0.41129082441329956, | |
| "kl": 0.025390625, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.177, | |
| "num_tokens": 31679484.0, | |
| "reward": 0.4600318595767021, | |
| "reward_std": 0.3473619818687439, | |
| "rewards/cosine_scaled_reward": -0.19706740230321884, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1204.5833587646484, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 0.5244786739349365, | |
| "kl": 0.02227783203125, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.2838, | |
| "num_tokens": 31744744.0, | |
| "reward": 0.4529779404401779, | |
| "reward_std": 0.5236243791878223, | |
| "rewards/cosine_scaled_reward": -0.20059436932206154, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1033.0416870117188, | |
| "epoch": 0.336, | |
| "grad_norm": 0.7969145774841309, | |
| "kl": 0.0338134765625, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.1185, | |
| "num_tokens": 31802316.0, | |
| "reward": 1.1573065668344498, | |
| "reward_std": 0.7508738189935684, | |
| "rewards/cosine_scaled_reward": 0.10990326898172498, | |
| "rewards/format_reward": 0.9375, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1818.1875762939453, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 0.3968656361103058, | |
| "kl": 0.019561767578125, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0126, | |
| "num_tokens": 31897539.0, | |
| "reward": 0.5907609835267067, | |
| "reward_std": 0.6056300327181816, | |
| "rewards/cosine_scaled_reward": -0.0275361780077219, | |
| "rewards/format_reward": 0.6458333283662796, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 863.5416793823242, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 0.47012004256248474, | |
| "kl": 0.0283203125, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0947, | |
| "num_tokens": 31946135.0, | |
| "reward": 1.1826601028442383, | |
| "reward_std": 0.742069885134697, | |
| "rewards/cosine_scaled_reward": 0.11216334369964898, | |
| "rewards/format_reward": 0.9583333283662796, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1349.1250305175781, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 0.41933736205101013, | |
| "kl": 0.022247314453125, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.4068, | |
| "num_tokens": 32018543.0, | |
| "reward": 0.6420099958777428, | |
| "reward_std": 0.4163365215063095, | |
| "rewards/cosine_scaled_reward": -0.1060783602297306, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1557.1458740234375, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 0.3282775282859802, | |
| "kl": 0.01898193359375, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.1917, | |
| "num_tokens": 32101620.0, | |
| "reward": 0.8522088825702667, | |
| "reward_std": 0.7961080744862556, | |
| "rewards/cosine_scaled_reward": -0.011395589681342244, | |
| "rewards/format_reward": 0.8749999850988388, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1234.8333587646484, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 0.4244450330734253, | |
| "kl": 0.02215576171875, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.4339, | |
| "num_tokens": 32168944.0, | |
| "reward": 0.5344287008047104, | |
| "reward_std": 0.5752788633108139, | |
| "rewards/cosine_scaled_reward": -0.18070233054459095, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1201.8125305175781, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.412088543176651, | |
| "kl": 0.021697998046875, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.1095, | |
| "num_tokens": 32235523.0, | |
| "reward": 1.3541258573532104, | |
| "reward_std": 0.2665238669142127, | |
| "rewards/cosine_scaled_reward": 0.24997958540916443, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1400.1667175292969, | |
| "epoch": 0.344, | |
| "grad_norm": 0.35744866728782654, | |
| "kl": 0.0169830322265625, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.1998, | |
| "num_tokens": 32310705.0, | |
| "reward": 0.8010849542915821, | |
| "reward_std": 0.6691881567239761, | |
| "rewards/cosine_scaled_reward": -0.005707542411983013, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1336.312515258789, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 0.8253682255744934, | |
| "kl": 0.028411865234375, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.2364, | |
| "num_tokens": 32382504.0, | |
| "reward": 0.816130556166172, | |
| "reward_std": 0.3864128515124321, | |
| "rewards/cosine_scaled_reward": -0.01901806378737092, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1049.1458740234375, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.49328041076660156, | |
| "kl": 0.031890869140625, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0389, | |
| "num_tokens": 32440591.0, | |
| "reward": 1.1552183032035828, | |
| "reward_std": 0.6544827744364738, | |
| "rewards/cosine_scaled_reward": 0.10885915206745267, | |
| "rewards/format_reward": 0.9375, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 922.5417022705078, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 0.36159005761146545, | |
| "kl": 0.0200042724609375, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.2868, | |
| "num_tokens": 32492841.0, | |
| "reward": 1.4484843313694, | |
| "reward_std": 0.9271421581506729, | |
| "rewards/cosine_scaled_reward": 0.23465881869196892, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1809.0000305175781, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 0.3415130078792572, | |
| "kl": 0.01605224609375, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.2458, | |
| "num_tokens": 32587287.0, | |
| "reward": 0.5845005512237549, | |
| "reward_std": 0.6893949508666992, | |
| "rewards/cosine_scaled_reward": -0.08274972066283226, | |
| "rewards/format_reward": 0.75, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1085.083396911621, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.44499897956848145, | |
| "kl": 0.023040771484375, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.1673, | |
| "num_tokens": 32646685.0, | |
| "reward": 1.1248981356620789, | |
| "reward_std": 0.6042323708534241, | |
| "rewards/cosine_scaled_reward": 0.10411571276199538, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1154.5000305175781, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 0.44520094990730286, | |
| "kl": 0.020751953125, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.2247, | |
| "num_tokens": 32710717.0, | |
| "reward": 1.0021317303180695, | |
| "reward_std": 0.6453195139765739, | |
| "rewards/cosine_scaled_reward": 0.05314922146499157, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1123.9375305175781, | |
| "epoch": 0.352, | |
| "grad_norm": 0.6290622353553772, | |
| "kl": 0.0402679443359375, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.2587, | |
| "num_tokens": 32772196.0, | |
| "reward": 0.933015450835228, | |
| "reward_std": 0.6842742636799812, | |
| "rewards/cosine_scaled_reward": 0.029007713310420513, | |
| "rewards/format_reward": 0.875, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1053.7708435058594, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 0.42671680450439453, | |
| "kl": 0.019866943359375, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.1434, | |
| "num_tokens": 32830427.0, | |
| "reward": 0.898878924548626, | |
| "reward_std": 0.8900427669286728, | |
| "rewards/cosine_scaled_reward": 0.011939452961087227, | |
| "rewards/format_reward": 0.875, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1383.520866394043, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 0.40926042199134827, | |
| "kl": 0.020233154296875, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0119, | |
| "num_tokens": 32904654.0, | |
| "reward": 0.7741856873035431, | |
| "reward_std": 0.43527317326515913, | |
| "rewards/cosine_scaled_reward": 0.01209283946081996, | |
| "rewards/format_reward": 0.75, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.2917327880859, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 0.6428707838058472, | |
| "kl": 0.036163330078125, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.2168, | |
| "num_tokens": 32961134.0, | |
| "reward": 1.0443171262741089, | |
| "reward_std": 0.5101570673286915, | |
| "rewards/cosine_scaled_reward": 0.0742418859153986, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1167.8333892822266, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 0.3679564297199249, | |
| "kl": 0.0187530517578125, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.177, | |
| "num_tokens": 33024690.0, | |
| "reward": 0.8535246178507805, | |
| "reward_std": 0.7677148133516312, | |
| "rewards/cosine_scaled_reward": -0.021154375048354268, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1486.0833587646484, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 0.5498233437538147, | |
| "kl": 0.0181732177734375, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.2557, | |
| "num_tokens": 33104086.0, | |
| "reward": 0.9443192332983017, | |
| "reward_std": 0.4855539873242378, | |
| "rewards/cosine_scaled_reward": 0.06590958125889301, | |
| "rewards/format_reward": 0.8125, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 951.8958740234375, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 0.4268878698348999, | |
| "kl": 0.020538330078125, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.1463, | |
| "num_tokens": 33157841.0, | |
| "reward": 0.9409515410661697, | |
| "reward_std": 0.5662648305296898, | |
| "rewards/cosine_scaled_reward": 0.012142402119934559, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 922.3125610351562, | |
| "epoch": 0.36, | |
| "grad_norm": 0.5434188842773438, | |
| "kl": 0.02447509765625, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.17, | |
| "num_tokens": 33210182.0, | |
| "reward": 0.8172921747900546, | |
| "reward_std": 0.4631008133292198, | |
| "rewards/cosine_scaled_reward": -0.04968724772334099, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1805.6458435058594, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 0.3704501688480377, | |
| "kl": 0.01605224609375, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.2939, | |
| "num_tokens": 33305451.0, | |
| "reward": 0.3478560894727707, | |
| "reward_std": 0.5816800445318222, | |
| "rewards/cosine_scaled_reward": -0.19065529108047485, | |
| "rewards/format_reward": 0.7291666567325592, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1177.6666870117188, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 0.35206273198127747, | |
| "kl": 0.018798828125, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.1731, | |
| "num_tokens": 33370553.0, | |
| "reward": 1.243408765643835, | |
| "reward_std": 0.6171375811100006, | |
| "rewards/cosine_scaled_reward": 0.1633710116147995, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1134.6875381469727, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 0.5754601359367371, | |
| "kl": 0.0250244140625, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.198, | |
| "num_tokens": 33432614.0, | |
| "reward": 0.9843797795474529, | |
| "reward_std": 0.5465483516454697, | |
| "rewards/cosine_scaled_reward": 0.06510654278099537, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1850.2083740234375, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 0.3118242621421814, | |
| "kl": 0.01446533203125, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.1195, | |
| "num_tokens": 33529392.0, | |
| "reward": 0.3020896166563034, | |
| "reward_std": 0.48611459136009216, | |
| "rewards/cosine_scaled_reward": -0.21353853773325682, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1210.4791870117188, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 0.35057225823402405, | |
| "kl": 0.0148773193359375, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.1352, | |
| "num_tokens": 33595565.0, | |
| "reward": 0.7223537564277649, | |
| "reward_std": 0.7105579674243927, | |
| "rewards/cosine_scaled_reward": -0.07632312597706914, | |
| "rewards/format_reward": 0.875, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1327.3334045410156, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 0.4181998074054718, | |
| "kl": 0.022186279296875, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.1917, | |
| "num_tokens": 33667215.0, | |
| "reward": 1.0562772899866104, | |
| "reward_std": 0.5822824612259865, | |
| "rewards/cosine_scaled_reward": 0.038555288687348366, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1351.7083587646484, | |
| "epoch": 0.368, | |
| "grad_norm": 0.3627634644508362, | |
| "kl": 0.0172576904296875, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.2456, | |
| "num_tokens": 33740323.0, | |
| "reward": 0.9927564784884453, | |
| "reward_std": 0.689672015607357, | |
| "rewards/cosine_scaled_reward": 0.10054487735033035, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1161.5833435058594, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 0.4092625677585602, | |
| "kl": 0.021759033203125, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.2509, | |
| "num_tokens": 33804041.0, | |
| "reward": 1.174657016992569, | |
| "reward_std": 0.748102068901062, | |
| "rewards/cosine_scaled_reward": 0.12899513231241144, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1409.6042175292969, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 0.3942246735095978, | |
| "kl": 0.016510009765625, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.2506, | |
| "num_tokens": 33879412.0, | |
| "reward": 0.6940434277057648, | |
| "reward_std": 0.6117554008960724, | |
| "rewards/cosine_scaled_reward": -0.12172829359769821, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1158.0208740234375, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.41832220554351807, | |
| "kl": 0.021728515625, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.2276, | |
| "num_tokens": 33942629.0, | |
| "reward": 0.961421325802803, | |
| "reward_std": 0.5547241270542145, | |
| "rewards/cosine_scaled_reward": 0.011960638221353292, | |
| "rewards/format_reward": 0.9375, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2292.2708740234375, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 0.28062522411346436, | |
| "kl": 0.0122222900390625, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.2198, | |
| "num_tokens": 34061160.0, | |
| "reward": 0.520195122808218, | |
| "reward_std": 0.6474234536290169, | |
| "rewards/cosine_scaled_reward": -0.0732357781380415, | |
| "rewards/format_reward": 0.6666666641831398, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1594.6042022705078, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.3291863203048706, | |
| "kl": 0.018585205078125, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.2308, | |
| "num_tokens": 34145765.0, | |
| "reward": 1.1332450732588768, | |
| "reward_std": 0.7591084539890289, | |
| "rewards/cosine_scaled_reward": 0.1603725180029869, | |
| "rewards/format_reward": 0.8125, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1420.0833892822266, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 0.3587169647216797, | |
| "kl": 0.0157318115234375, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.1698, | |
| "num_tokens": 34221987.0, | |
| "reward": 0.8290468156337738, | |
| "reward_std": 0.7251327261328697, | |
| "rewards/cosine_scaled_reward": -0.012559936614707112, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1996.2916870117188, | |
| "epoch": 0.376, | |
| "grad_norm": 0.2455548644065857, | |
| "kl": 0.012237548828125, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.1534, | |
| "num_tokens": 34326347.0, | |
| "reward": 0.6604045107960701, | |
| "reward_std": 0.5404551178216934, | |
| "rewards/cosine_scaled_reward": -0.05521441251039505, | |
| "rewards/format_reward": 0.7708333358168602, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1714.5417785644531, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 0.3259330689907074, | |
| "kl": 0.016571044921875, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.259, | |
| "num_tokens": 34416559.0, | |
| "reward": 0.49699069559574127, | |
| "reward_std": 0.7413402050733566, | |
| "rewards/cosine_scaled_reward": -0.13692133501172066, | |
| "rewards/format_reward": 0.7708333283662796, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1363.2500610351562, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 0.3899250626564026, | |
| "kl": 0.018096923828125, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.2887, | |
| "num_tokens": 34489507.0, | |
| "reward": 0.6278744414448738, | |
| "reward_std": 0.6622165739536285, | |
| "rewards/cosine_scaled_reward": -0.12356280605308712, | |
| "rewards/format_reward": 0.875, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1064.4792175292969, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 0.4999028742313385, | |
| "kl": 0.02362060546875, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.3095, | |
| "num_tokens": 34548552.0, | |
| "reward": 1.3007973432540894, | |
| "reward_std": 0.6751734167337418, | |
| "rewards/cosine_scaled_reward": 0.1816486194729805, | |
| "rewards/format_reward": 0.9375, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1111.0208740234375, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 0.4572555422782898, | |
| "kl": 0.0247955322265625, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.4488, | |
| "num_tokens": 34610029.0, | |
| "reward": 0.5547422207891941, | |
| "reward_std": 0.4824206456542015, | |
| "rewards/cosine_scaled_reward": -0.16012889053672552, | |
| "rewards/format_reward": 0.875, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1466.7709045410156, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 0.30377352237701416, | |
| "kl": 0.014862060546875, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.418, | |
| "num_tokens": 34689020.0, | |
| "reward": 0.5577372368425131, | |
| "reward_std": 0.8148107454180717, | |
| "rewards/cosine_scaled_reward": -0.12738139368593693, | |
| "rewards/format_reward": 0.8125, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1797.125015258789, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 0.350172758102417, | |
| "kl": 0.0150299072265625, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.1338, | |
| "num_tokens": 34783118.0, | |
| "reward": 0.7335403561592102, | |
| "reward_std": 0.6356885507702827, | |
| "rewards/cosine_scaled_reward": 0.012603512033820152, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1083.3541870117188, | |
| "epoch": 0.384, | |
| "grad_norm": 0.5177885293960571, | |
| "kl": 0.0233154296875, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.4695, | |
| "num_tokens": 34842745.0, | |
| "reward": 0.5901815667748451, | |
| "reward_std": 0.6160614341497421, | |
| "rewards/cosine_scaled_reward": -0.14240923523902893, | |
| "rewards/format_reward": 0.875, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 698.9583511352539, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 0.905795693397522, | |
| "kl": 0.05902099609375, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.4006, | |
| "num_tokens": 34884581.0, | |
| "reward": 1.1506546884775162, | |
| "reward_std": 0.6925991177558899, | |
| "rewards/cosine_scaled_reward": 0.0961606577038765, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1615.7916717529297, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 0.4690731167793274, | |
| "kl": 0.0206756591796875, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.1015, | |
| "num_tokens": 34970017.0, | |
| "reward": 0.3961530327796936, | |
| "reward_std": 0.42486437410116196, | |
| "rewards/cosine_scaled_reward": -0.1456734873354435, | |
| "rewards/format_reward": 0.6874999962747097, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1820.6250610351562, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 0.26318851113319397, | |
| "kl": 0.01601409912109375, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.2107, | |
| "num_tokens": 35066017.0, | |
| "reward": 0.5171072706580162, | |
| "reward_std": 0.734325036406517, | |
| "rewards/cosine_scaled_reward": -0.14769636327400804, | |
| "rewards/format_reward": 0.8124999925494194, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1169.3125457763672, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 0.8620088696479797, | |
| "kl": 0.051483154296875, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0998, | |
| "num_tokens": 35130214.0, | |
| "reward": 0.9262717366218567, | |
| "reward_std": 0.5215450003743172, | |
| "rewards/cosine_scaled_reward": 0.015219194581732154, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1397.3542175292969, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 0.47853705286979675, | |
| "kl": 0.0279083251953125, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.2305, | |
| "num_tokens": 35205861.0, | |
| "reward": 1.2229997366666794, | |
| "reward_std": 0.9069466888904572, | |
| "rewards/cosine_scaled_reward": 0.17399985902011395, | |
| "rewards/format_reward": 0.875, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1234.1666870117188, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 0.4025494456291199, | |
| "kl": 0.02392578125, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.2341, | |
| "num_tokens": 35272907.0, | |
| "reward": 0.8216968104243279, | |
| "reward_std": 0.5812697075307369, | |
| "rewards/cosine_scaled_reward": 0.004598394094500691, | |
| "rewards/format_reward": 0.8125, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1443.8333740234375, | |
| "epoch": 0.392, | |
| "grad_norm": 0.41579753160476685, | |
| "kl": 0.0198211669921875, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.2448, | |
| "num_tokens": 35350047.0, | |
| "reward": 1.148866169154644, | |
| "reward_std": 0.8502667844295502, | |
| "rewards/cosine_scaled_reward": 0.1369330883026123, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1166.4792022705078, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 0.44617852568626404, | |
| "kl": 0.027435302734375, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.1595, | |
| "num_tokens": 35414708.0, | |
| "reward": 0.8810491487383842, | |
| "reward_std": 0.3746897503733635, | |
| "rewards/cosine_scaled_reward": -0.03864210657775402, | |
| "rewards/format_reward": 0.9583333283662796, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 949.5625152587891, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 0.3746589124202728, | |
| "kl": 0.019561767578125, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.4207, | |
| "num_tokens": 35467829.0, | |
| "reward": 0.8917692303657532, | |
| "reward_std": 0.3601553849875927, | |
| "rewards/cosine_scaled_reward": -0.022865407168865204, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1269.5208587646484, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 0.4141824543476105, | |
| "kl": 0.02264404296875, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0, | |
| "num_tokens": 35536926.0, | |
| "reward": 0.8968387022614479, | |
| "reward_std": 0.4571293629705906, | |
| "rewards/cosine_scaled_reward": 0.010919326916337013, | |
| "rewards/format_reward": 0.875, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1213.3333435058594, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 0.7501462697982788, | |
| "kl": 0.0380859375, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.23, | |
| "num_tokens": 35603176.0, | |
| "reward": 1.100012943148613, | |
| "reward_std": 0.9596873223781586, | |
| "rewards/cosine_scaled_reward": 0.13333981484174728, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1136.8750305175781, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 0.40254196524620056, | |
| "kl": 0.0218658447265625, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": -0.0113, | |
| "num_tokens": 35665438.0, | |
| "reward": 0.8266020230948925, | |
| "reward_std": 0.5243057832121849, | |
| "rewards/cosine_scaled_reward": -0.03461567126214504, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 974.5833892822266, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 0.4754847586154938, | |
| "kl": 0.0227203369140625, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.175, | |
| "num_tokens": 35720006.0, | |
| "reward": 1.4643159806728363, | |
| "reward_std": 0.525859147310257, | |
| "rewards/cosine_scaled_reward": 0.25299129262566566, | |
| "rewards/format_reward": 0.9583333283662796, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 811.2291793823242, | |
| "epoch": 0.4, | |
| "grad_norm": 0.4111607074737549, | |
| "kl": 0.02227783203125, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.3028, | |
| "num_tokens": 35766733.0, | |
| "reward": 1.3436209559440613, | |
| "reward_std": 0.5788788720965385, | |
| "rewards/cosine_scaled_reward": 0.20306045236065984, | |
| "rewards/format_reward": 0.9375, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1149.437515258789, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 0.5062916874885559, | |
| "kl": 0.0225830078125, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.199, | |
| "num_tokens": 35830264.0, | |
| "reward": 0.8019091859459877, | |
| "reward_std": 0.5678411647677422, | |
| "rewards/cosine_scaled_reward": -0.03654539864510298, | |
| "rewards/format_reward": 0.875, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1280.0625305175781, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 0.3271620571613312, | |
| "kl": 0.0175933837890625, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.2915, | |
| "num_tokens": 35899897.0, | |
| "reward": 0.9320071637630463, | |
| "reward_std": 0.5362000018358231, | |
| "rewards/cosine_scaled_reward": -0.00274643674492836, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1254.0208435058594, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 0.3472294211387634, | |
| "kl": 0.02008056640625, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.2191, | |
| "num_tokens": 35968424.0, | |
| "reward": 1.047704242169857, | |
| "reward_std": 0.4904594272375107, | |
| "rewards/cosine_scaled_reward": 0.06551877036690712, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1009.8542175292969, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 0.49516573548316956, | |
| "kl": 0.0303955078125, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.2002, | |
| "num_tokens": 36024817.0, | |
| "reward": 0.5630597248673439, | |
| "reward_std": 0.22670871764421463, | |
| "rewards/cosine_scaled_reward": -0.20805347710847855, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1270.7291946411133, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 0.7044930458068848, | |
| "kl": 0.0262908935546875, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.206, | |
| "num_tokens": 36092706.0, | |
| "reward": 0.9475711584091187, | |
| "reward_std": 0.36783919855952263, | |
| "rewards/cosine_scaled_reward": 0.06753556802868843, | |
| "rewards/format_reward": 0.8124999925494194, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1286.166732788086, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 0.3811585009098053, | |
| "kl": 0.019927978515625, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.3389, | |
| "num_tokens": 36163136.0, | |
| "reward": 0.8006436377763748, | |
| "reward_std": 0.4206415191292763, | |
| "rewards/cosine_scaled_reward": -0.026761506218463182, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1452.2709045410156, | |
| "epoch": 0.408, | |
| "grad_norm": 0.48744305968284607, | |
| "kl": 0.0196380615234375, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.5284, | |
| "num_tokens": 36241107.0, | |
| "reward": 0.5863317297771573, | |
| "reward_std": 0.4162827581167221, | |
| "rewards/cosine_scaled_reward": -0.11308415234088898, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1344.3541870117188, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 0.4239792823791504, | |
| "kl": 0.02117919921875, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.3413, | |
| "num_tokens": 36314024.0, | |
| "reward": 0.6943970248103142, | |
| "reward_std": 0.7140942215919495, | |
| "rewards/cosine_scaled_reward": -0.07988481689244509, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1117.1042404174805, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 0.5485315918922424, | |
| "kl": 0.0322418212890625, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.1325, | |
| "num_tokens": 36375343.0, | |
| "reward": 1.2047154903411865, | |
| "reward_std": 0.547421857714653, | |
| "rewards/cosine_scaled_reward": 0.13360773120075464, | |
| "rewards/format_reward": 0.9375, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 562.5208511352539, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 0.5694870352745056, | |
| "kl": 0.03411865234375, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.1663, | |
| "num_tokens": 36409910.0, | |
| "reward": 1.1702253967523575, | |
| "reward_std": 0.46344269812107086, | |
| "rewards/cosine_scaled_reward": 0.08511268568690866, | |
| "rewards/format_reward": 1.0, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1849.8958740234375, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 0.35474714636802673, | |
| "kl": 0.022979736328125, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.118, | |
| "num_tokens": 36506409.0, | |
| "reward": 0.4462018497288227, | |
| "reward_std": 0.2832772321999073, | |
| "rewards/cosine_scaled_reward": -0.08939909003674984, | |
| "rewards/format_reward": 0.625, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1358.5833435058594, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 0.4604112207889557, | |
| "kl": 0.023681640625, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.2728, | |
| "num_tokens": 36580003.0, | |
| "reward": 0.8181298896670341, | |
| "reward_std": 0.7494820207357407, | |
| "rewards/cosine_scaled_reward": 0.0028149131685495377, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1144.3125305175781, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 0.44948238134384155, | |
| "kl": 0.0284423828125, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.1588, | |
| "num_tokens": 36642832.0, | |
| "reward": 0.7569349557161331, | |
| "reward_std": 0.5557837300002575, | |
| "rewards/cosine_scaled_reward": -0.06944921240210533, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1319.2708435058594, | |
| "epoch": 0.416, | |
| "grad_norm": 0.40363508462905884, | |
| "kl": 0.0239715576171875, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.3013, | |
| "num_tokens": 36714131.0, | |
| "reward": 0.6689659608528018, | |
| "reward_std": 0.6713744327425957, | |
| "rewards/cosine_scaled_reward": -0.050933680147863925, | |
| "rewards/format_reward": 0.7708333358168602, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1737.854248046875, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 0.3201209604740143, | |
| "kl": 0.018829345703125, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": -0.001, | |
| "num_tokens": 36805390.0, | |
| "reward": 0.7450773566961288, | |
| "reward_std": 0.8447261303663254, | |
| "rewards/cosine_scaled_reward": -0.0024613337591290474, | |
| "rewards/format_reward": 0.75, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1310.9583435058594, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 0.336849570274353, | |
| "kl": 0.0190582275390625, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.1492, | |
| "num_tokens": 36876134.0, | |
| "reward": 0.7563202679157257, | |
| "reward_std": 0.5958736017346382, | |
| "rewards/cosine_scaled_reward": -0.06975654885172844, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 946.7500152587891, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 0.5472022294998169, | |
| "kl": 0.028717041015625, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.1855, | |
| "num_tokens": 36929078.0, | |
| "reward": 1.0344794690608978, | |
| "reward_std": 0.44533807784318924, | |
| "rewards/cosine_scaled_reward": 0.06932303309440613, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1162.7083740234375, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 0.4045441448688507, | |
| "kl": 0.021575927734375, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": -0.0114, | |
| "num_tokens": 36993378.0, | |
| "reward": 1.0075117945671082, | |
| "reward_std": 0.6353353708982468, | |
| "rewards/cosine_scaled_reward": 0.08708921447396278, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1022.9791870117188, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 0.4526968002319336, | |
| "kl": 0.02374267578125, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.1353, | |
| "num_tokens": 37050395.0, | |
| "reward": 1.3955522179603577, | |
| "reward_std": 0.35689088329672813, | |
| "rewards/cosine_scaled_reward": 0.2081927489489317, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1004.8958587646484, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 0.41456565260887146, | |
| "kl": 0.02435302734375, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.4306, | |
| "num_tokens": 37107066.0, | |
| "reward": 0.7459383457899094, | |
| "reward_std": 0.5385182648897171, | |
| "rewards/cosine_scaled_reward": -0.09578085504472256, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 927.7708892822266, | |
| "epoch": 0.424, | |
| "grad_norm": 0.39877113699913025, | |
| "kl": 0.02532958984375, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.2257, | |
| "num_tokens": 37159825.0, | |
| "reward": 0.9463022500276566, | |
| "reward_std": 0.7569273114204407, | |
| "rewards/cosine_scaled_reward": -0.006015541031956673, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 860.7291793823242, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 1.0322585105895996, | |
| "kl": 0.0416259765625, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.2483, | |
| "num_tokens": 37209402.0, | |
| "reward": 1.0459811389446259, | |
| "reward_std": 0.37404677644371986, | |
| "rewards/cosine_scaled_reward": 0.033407218754291534, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1284.541732788086, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 0.3862076997756958, | |
| "kl": 0.02374267578125, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.2574, | |
| "num_tokens": 37279418.0, | |
| "reward": 0.6485820934176445, | |
| "reward_std": 0.3174329958856106, | |
| "rewards/cosine_scaled_reward": -0.11320894956588745, | |
| "rewards/format_reward": 0.875, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1156.7708587646484, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 0.41904541850090027, | |
| "kl": 0.0294189453125, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.288, | |
| "num_tokens": 37342689.0, | |
| "reward": 0.7976613417267799, | |
| "reward_std": 0.6100458800792694, | |
| "rewards/cosine_scaled_reward": -0.049086001701653004, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 827.9791870117188, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.483198344707489, | |
| "kl": 0.030426025390625, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.1944, | |
| "num_tokens": 37390370.0, | |
| "reward": 0.974768877029419, | |
| "reward_std": 0.5094475597143173, | |
| "rewards/cosine_scaled_reward": 0.008217750757467002, | |
| "rewards/format_reward": 0.9583333283662796, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1670.7916870117188, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 0.3357181251049042, | |
| "kl": 0.0157012939453125, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.2978, | |
| "num_tokens": 37478224.0, | |
| "reward": 0.8598220273852348, | |
| "reward_std": 0.8302092999219894, | |
| "rewards/cosine_scaled_reward": 0.04449433600530028, | |
| "rewards/format_reward": 0.7708333283662796, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 884.0000152587891, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 0.43593403697013855, | |
| "kl": 0.031005859375, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.1621, | |
| "num_tokens": 37528678.0, | |
| "reward": 0.9919871091842651, | |
| "reward_std": 0.5908376425504684, | |
| "rewards/cosine_scaled_reward": 0.006410190369933844, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1222.2708435058594, | |
| "epoch": 0.432, | |
| "grad_norm": 0.41493770480155945, | |
| "kl": 0.025177001953125, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.3092, | |
| "num_tokens": 37595327.0, | |
| "reward": 0.6667770892381668, | |
| "reward_std": 0.628834679722786, | |
| "rewards/cosine_scaled_reward": -0.11452813632786274, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1521.6459045410156, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 0.48460623621940613, | |
| "kl": 0.027587890625, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.2818, | |
| "num_tokens": 37676838.0, | |
| "reward": 0.8283536843955517, | |
| "reward_std": 0.7019159197807312, | |
| "rewards/cosine_scaled_reward": 0.039176818914711475, | |
| "rewards/format_reward": 0.75, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.9375305175781, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 0.5315077900886536, | |
| "kl": 0.0264892578125, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.3609, | |
| "num_tokens": 37734285.0, | |
| "reward": 0.8689068183302879, | |
| "reward_std": 0.6152400150895119, | |
| "rewards/cosine_scaled_reward": -0.0342966066673398, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1150.5000305175781, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 0.4323570728302002, | |
| "kl": 0.01971435546875, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.177, | |
| "num_tokens": 37797207.0, | |
| "reward": 1.0002194195985794, | |
| "reward_std": 0.7640446051955223, | |
| "rewards/cosine_scaled_reward": 0.020943036302924156, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1450.6041870117188, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 0.39461439847946167, | |
| "kl": 0.023468017578125, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.2971, | |
| "num_tokens": 37874744.0, | |
| "reward": 0.7021645717322826, | |
| "reward_std": 0.7685395777225494, | |
| "rewards/cosine_scaled_reward": -0.04475104878656566, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2027.2083740234375, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 0.34751343727111816, | |
| "kl": 0.0179443359375, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.1176, | |
| "num_tokens": 37979940.0, | |
| "reward": 0.5363237643614411, | |
| "reward_std": 0.3470821715891361, | |
| "rewards/cosine_scaled_reward": -0.03392144478857517, | |
| "rewards/format_reward": 0.6041666567325592, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1126.5208587646484, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 0.4206862151622772, | |
| "kl": 0.0215301513671875, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.123, | |
| "num_tokens": 38042449.0, | |
| "reward": 0.7954209297895432, | |
| "reward_std": 0.4029863178730011, | |
| "rewards/cosine_scaled_reward": -0.10228953487239778, | |
| "rewards/format_reward": 1.0, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1238.9584045410156, | |
| "epoch": 0.44, | |
| "grad_norm": 0.3564131259918213, | |
| "kl": 0.022125244140625, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0834, | |
| "num_tokens": 38109791.0, | |
| "reward": 1.001874104142189, | |
| "reward_std": 0.6318590641021729, | |
| "rewards/cosine_scaled_reward": 0.042603690177202225, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1390.8125610351562, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 0.3943594992160797, | |
| "kl": 0.0227203369140625, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.2978, | |
| "num_tokens": 38184644.0, | |
| "reward": 0.8800090253353119, | |
| "reward_std": 0.5464823693037033, | |
| "rewards/cosine_scaled_reward": 0.01292116753757, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1202.3333740234375, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 0.4116460978984833, | |
| "kl": 0.02825927734375, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.217, | |
| "num_tokens": 38250456.0, | |
| "reward": 0.7325910478830338, | |
| "reward_std": 0.3488932326436043, | |
| "rewards/cosine_scaled_reward": -0.10245448537170887, | |
| "rewards/format_reward": 0.9375, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1391.6667175292969, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 0.4180901050567627, | |
| "kl": 0.0212860107421875, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.1904, | |
| "num_tokens": 38325338.0, | |
| "reward": 1.0158484429121017, | |
| "reward_std": 0.7112128995358944, | |
| "rewards/cosine_scaled_reward": 0.04959086421877146, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1247.3959045410156, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 0.3733654320240021, | |
| "kl": 0.02099609375, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.1651, | |
| "num_tokens": 38393073.0, | |
| "reward": 1.2150611281394958, | |
| "reward_std": 0.7659250199794769, | |
| "rewards/cosine_scaled_reward": 0.13878049701452255, | |
| "rewards/format_reward": 0.9375, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1146.5833435058594, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 0.6047143936157227, | |
| "kl": 0.03582763671875, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.3609, | |
| "num_tokens": 38456011.0, | |
| "reward": 0.6786292586475611, | |
| "reward_std": 0.6744736880064011, | |
| "rewards/cosine_scaled_reward": -0.06693539395928383, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 942.6667022705078, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 0.46833914518356323, | |
| "kl": 0.031005859375, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.2084, | |
| "num_tokens": 38508975.0, | |
| "reward": 0.8765498697757721, | |
| "reward_std": 0.6001102030277252, | |
| "rewards/cosine_scaled_reward": -0.040891751646995544, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1619.6041870117188, | |
| "epoch": 0.448, | |
| "grad_norm": 0.3091895878314972, | |
| "kl": 0.018768310546875, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0724, | |
| "num_tokens": 38594150.0, | |
| "reward": 1.0864002853631973, | |
| "reward_std": 0.7392374388873577, | |
| "rewards/cosine_scaled_reward": 0.14736680313944817, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1556.4583740234375, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 0.35970357060432434, | |
| "kl": 0.021942138671875, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.2975, | |
| "num_tokens": 38677536.0, | |
| "reward": 0.7836254239082336, | |
| "reward_std": 0.6550324112176895, | |
| "rewards/cosine_scaled_reward": -0.004020635038614273, | |
| "rewards/format_reward": 0.7916666567325592, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.1042022705078, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 0.41625267267227173, | |
| "kl": 0.03192138671875, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.2402, | |
| "num_tokens": 38735135.0, | |
| "reward": 0.8095816224813461, | |
| "reward_std": 0.5363309979438782, | |
| "rewards/cosine_scaled_reward": -0.06395919248461723, | |
| "rewards/format_reward": 0.9375, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1274.0625610351562, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 0.3793922960758209, | |
| "kl": 0.026214599609375, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.2779, | |
| "num_tokens": 38805098.0, | |
| "reward": 0.853252187371254, | |
| "reward_std": 0.5334652774035931, | |
| "rewards/cosine_scaled_reward": 0.00995943695306778, | |
| "rewards/format_reward": 0.8333333283662796, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 740.1875152587891, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 0.45222580432891846, | |
| "kl": 0.034210205078125, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.2259, | |
| "num_tokens": 38848211.0, | |
| "reward": 1.0253641307353973, | |
| "reward_std": 0.33863143250346184, | |
| "rewards/cosine_scaled_reward": 0.03351536951959133, | |
| "rewards/format_reward": 0.9583333283662796, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 880.2500305175781, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 0.4955395758152008, | |
| "kl": 0.03704833984375, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.3078, | |
| "num_tokens": 38898983.0, | |
| "reward": 1.428503692150116, | |
| "reward_std": 0.5125311873853207, | |
| "rewards/cosine_scaled_reward": 0.2350851409137249, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1339.7500305175781, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 0.39652198553085327, | |
| "kl": 0.020843505859375, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.257, | |
| "num_tokens": 38971691.0, | |
| "reward": 1.0975730419158936, | |
| "reward_std": 0.5926219597458839, | |
| "rewards/cosine_scaled_reward": 0.13211984746158123, | |
| "rewards/format_reward": 0.8333333283662796, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1455.916748046875, | |
| "epoch": 0.456, | |
| "grad_norm": 0.32825639843940735, | |
| "kl": 0.023895263671875, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.43, | |
| "num_tokens": 39049621.0, | |
| "reward": 0.7255288064479828, | |
| "reward_std": 0.5127102136611938, | |
| "rewards/cosine_scaled_reward": -0.0434856116771698, | |
| "rewards/format_reward": 0.8125, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 904.7291717529297, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.47382229566574097, | |
| "kl": 0.0305938720703125, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.3168, | |
| "num_tokens": 39101004.0, | |
| "reward": 0.9123225212097168, | |
| "reward_std": 0.324755534529686, | |
| "rewards/cosine_scaled_reward": -0.023005416616797447, | |
| "rewards/format_reward": 0.9583333283662796, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 670.7500152587891, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 0.5006996989250183, | |
| "kl": 0.028167724609375, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.1668, | |
| "num_tokens": 39141300.0, | |
| "reward": 1.1984441727399826, | |
| "reward_std": 0.43713772110641, | |
| "rewards/cosine_scaled_reward": 0.09922206029295921, | |
| "rewards/format_reward": 1.0, | |
| "step": 401 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1485.2500610351562, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 2.8007476329803467, | |
| "kl": 0.042938232421875, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.254, | |
| "num_tokens": 39221130.0, | |
| "reward": 0.5768831968307495, | |
| "reward_std": 0.5035651251673698, | |
| "rewards/cosine_scaled_reward": -0.1594750825315714, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1249.3958587646484, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 0.3587521016597748, | |
| "kl": 0.02703857421875, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.1084, | |
| "num_tokens": 39288733.0, | |
| "reward": 1.0168748944997787, | |
| "reward_std": 0.7613801509141922, | |
| "rewards/cosine_scaled_reward": 0.05010411172406748, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 403 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 893.0000305175781, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 0.40811389684677124, | |
| "kl": 0.0299072265625, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.2457, | |
| "num_tokens": 39339589.0, | |
| "reward": 0.9994086623191833, | |
| "reward_std": 0.523450993001461, | |
| "rewards/cosine_scaled_reward": 0.030954306945204735, | |
| "rewards/format_reward": 0.9375, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1085.8750457763672, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 0.3914470076560974, | |
| "kl": 0.029327392578125, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.2453, | |
| "num_tokens": 39400087.0, | |
| "reward": 1.1680223643779755, | |
| "reward_std": 0.6629828922450542, | |
| "rewards/cosine_scaled_reward": 0.10484451148658991, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1413.1458740234375, | |
| "epoch": 0.464, | |
| "grad_norm": 0.4474855363368988, | |
| "kl": 0.0238037109375, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.3965, | |
| "num_tokens": 39476744.0, | |
| "reward": 0.8232813104987144, | |
| "reward_std": 0.5474796146154404, | |
| "rewards/cosine_scaled_reward": 0.015807300806045532, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 973.9791870117188, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 0.5875135660171509, | |
| "kl": 0.03564453125, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.464, | |
| "num_tokens": 39531097.0, | |
| "reward": 1.0968922302126884, | |
| "reward_std": 0.5210925191640854, | |
| "rewards/cosine_scaled_reward": 0.0901127781253308, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 407 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1042.020851135254, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 0.5569504499435425, | |
| "kl": 0.036956787109375, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.1739, | |
| "num_tokens": 39588962.0, | |
| "reward": 1.0446859672665596, | |
| "reward_std": 0.477496899664402, | |
| "rewards/cosine_scaled_reward": 0.06400964129716158, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 853.7083740234375, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 0.43134912848472595, | |
| "kl": 0.0301513671875, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0768, | |
| "num_tokens": 39638154.0, | |
| "reward": 1.1461460888385773, | |
| "reward_std": 0.7252542972564697, | |
| "rewards/cosine_scaled_reward": 0.07307301368564367, | |
| "rewards/format_reward": 1.0, | |
| "step": 409 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1219.4583740234375, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 0.42697203159332275, | |
| "kl": 0.0308074951171875, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.2314, | |
| "num_tokens": 39704914.0, | |
| "reward": 0.8525972217321396, | |
| "reward_std": 0.7250062227249146, | |
| "rewards/cosine_scaled_reward": -0.011201405432075262, | |
| "rewards/format_reward": 0.875, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1561.7083587646484, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 0.35902073979377747, | |
| "kl": 0.027740478515625, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.3147, | |
| "num_tokens": 39788684.0, | |
| "reward": 0.42419329285621643, | |
| "reward_std": 0.4189872369170189, | |
| "rewards/cosine_scaled_reward": -0.2045700242742896, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 411 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 869.9583435058594, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 0.5099910497665405, | |
| "kl": 0.03778076171875, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.2973, | |
| "num_tokens": 39838452.0, | |
| "reward": 0.9078437611460686, | |
| "reward_std": 0.4090404435992241, | |
| "rewards/cosine_scaled_reward": -0.025244787335395813, | |
| "rewards/format_reward": 0.9583333283662796, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1232.2708740234375, | |
| "epoch": 0.472, | |
| "grad_norm": 0.4471232295036316, | |
| "kl": 0.02978515625, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.3114, | |
| "num_tokens": 39905653.0, | |
| "reward": 0.5217236494645476, | |
| "reward_std": 0.31653984263539314, | |
| "rewards/cosine_scaled_reward": -0.1558048389852047, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 413 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1533.6666717529297, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 0.4558218717575073, | |
| "kl": 0.0302886962890625, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.2128, | |
| "num_tokens": 39988185.0, | |
| "reward": 1.2229383140802383, | |
| "reward_std": 0.6020685248076916, | |
| "rewards/cosine_scaled_reward": 0.19480247469618917, | |
| "rewards/format_reward": 0.8333333283662796, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1351.0000305175781, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 0.3420197665691376, | |
| "kl": 0.025360107421875, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.3434, | |
| "num_tokens": 40061211.0, | |
| "reward": 1.1249284744262695, | |
| "reward_std": 0.8133822381496429, | |
| "rewards/cosine_scaled_reward": 0.12496422789990902, | |
| "rewards/format_reward": 0.875, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1263.6458740234375, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 0.4289003014564514, | |
| "kl": 0.031707763671875, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.09, | |
| "num_tokens": 40130410.0, | |
| "reward": 0.9559039436280727, | |
| "reward_std": 0.6498378068208694, | |
| "rewards/cosine_scaled_reward": 0.05086860992014408, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1705.5833740234375, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 0.4851248562335968, | |
| "kl": 0.0277099609375, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.2165, | |
| "num_tokens": 40220522.0, | |
| "reward": 0.5403048545122147, | |
| "reward_std": 0.5704892203211784, | |
| "rewards/cosine_scaled_reward": -0.08401426486670971, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 417 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1089.7292175292969, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 0.5024583339691162, | |
| "kl": 0.02630615234375, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.35, | |
| "num_tokens": 40281043.0, | |
| "reward": 0.5926680490374565, | |
| "reward_std": 0.48843371123075485, | |
| "rewards/cosine_scaled_reward": -0.1724159736186266, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1123.7083587646484, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 0.4442269504070282, | |
| "kl": 0.032562255859375, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0806, | |
| "num_tokens": 40343321.0, | |
| "reward": 0.8577143996953964, | |
| "reward_std": 0.5276221707463264, | |
| "rewards/cosine_scaled_reward": -0.05030947690829635, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 419 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1060.1042022705078, | |
| "epoch": 0.48, | |
| "grad_norm": 0.410487562417984, | |
| "kl": 0.026458740234375, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.161, | |
| "num_tokens": 40401910.0, | |
| "reward": 1.127629041671753, | |
| "reward_std": 0.7603463605046272, | |
| "rewards/cosine_scaled_reward": 0.09506450593471527, | |
| "rewards/format_reward": 0.9375, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 927.5625457763672, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 0.4083124101161957, | |
| "kl": 0.028717041015625, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.4235, | |
| "num_tokens": 40454173.0, | |
| "reward": 1.2100372835993767, | |
| "reward_std": 0.7276662588119507, | |
| "rewards/cosine_scaled_reward": 0.13626863807439804, | |
| "rewards/format_reward": 0.9375, | |
| "step": 421 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1145.6667175292969, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 0.36734601855278015, | |
| "kl": 0.03082275390625, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.3146, | |
| "num_tokens": 40517349.0, | |
| "reward": 0.745939314365387, | |
| "reward_std": 0.4512132965028286, | |
| "rewards/cosine_scaled_reward": -0.10619700700044632, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1289.041732788086, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 0.39704540371894836, | |
| "kl": 0.02734375, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.2491, | |
| "num_tokens": 40587065.0, | |
| "reward": 0.7884100899100304, | |
| "reward_std": 0.5758587270975113, | |
| "rewards/cosine_scaled_reward": -0.03287830762565136, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 423 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1040.0625610351562, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 0.39009180665016174, | |
| "kl": 0.0362548828125, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.3062, | |
| "num_tokens": 40645184.0, | |
| "reward": 0.9872114062309265, | |
| "reward_std": 0.43921393156051636, | |
| "rewards/cosine_scaled_reward": 0.014439025893807411, | |
| "rewards/format_reward": 0.9583333283662796, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1162.2083740234375, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 0.41968798637390137, | |
| "kl": 0.031097412109375, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.1913, | |
| "num_tokens": 40709070.0, | |
| "reward": 1.1210691630840302, | |
| "reward_std": 0.6839531622827053, | |
| "rewards/cosine_scaled_reward": 0.09178455546498299, | |
| "rewards/format_reward": 0.9375, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 842.9583740234375, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 0.4921518862247467, | |
| "kl": 0.032623291015625, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.1351, | |
| "num_tokens": 40757704.0, | |
| "reward": 1.1493963152170181, | |
| "reward_std": 0.5684140212833881, | |
| "rewards/cosine_scaled_reward": 0.08511480502784252, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1863.0417175292969, | |
| "epoch": 0.488, | |
| "grad_norm": 0.34825828671455383, | |
| "kl": 0.016571044921875, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.262, | |
| "num_tokens": 40854708.0, | |
| "reward": 0.6934209913015366, | |
| "reward_std": 0.6876000687479973, | |
| "rewards/cosine_scaled_reward": -0.007456184830516577, | |
| "rewards/format_reward": 0.708333320915699, | |
| "step": 427 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1125.562515258789, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 0.431072473526001, | |
| "kl": 0.02935791015625, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.1989, | |
| "num_tokens": 40917639.0, | |
| "reward": 0.79083002358675, | |
| "reward_std": 0.49118974804878235, | |
| "rewards/cosine_scaled_reward": -0.06291831657290459, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1233.250015258789, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 0.5056308507919312, | |
| "kl": 0.03472900390625, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0979, | |
| "num_tokens": 40985193.0, | |
| "reward": 1.0755756497383118, | |
| "reward_std": 0.6792989065870643, | |
| "rewards/cosine_scaled_reward": 0.12112115137279034, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 429 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1888.4792175292969, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 0.3414265513420105, | |
| "kl": 0.018707275390625, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.2482, | |
| "num_tokens": 41084198.0, | |
| "reward": 0.5194306820631027, | |
| "reward_std": 0.7147988900542259, | |
| "rewards/cosine_scaled_reward": -0.11528466921299696, | |
| "rewards/format_reward": 0.75, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1227.1042175292969, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 0.45565667748451233, | |
| "kl": 0.027862548828125, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.2267, | |
| "num_tokens": 41150899.0, | |
| "reward": 1.093374602496624, | |
| "reward_std": 0.5845904052257538, | |
| "rewards/cosine_scaled_reward": 0.11960397660732269, | |
| "rewards/format_reward": 0.8541666567325592, | |
| "step": 431 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1604.812515258789, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 0.35553795099258423, | |
| "kl": 0.026458740234375, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.1478, | |
| "num_tokens": 41235508.0, | |
| "reward": 0.7511163279414177, | |
| "reward_std": 0.7302351742982864, | |
| "rewards/cosine_scaled_reward": -0.030691856518387794, | |
| "rewards/format_reward": 0.8125, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1305.6041717529297, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 0.36688801646232605, | |
| "kl": 0.029998779296875, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.3838, | |
| "num_tokens": 41306295.0, | |
| "reward": 0.8434520326554775, | |
| "reward_std": 0.5114891212433577, | |
| "rewards/cosine_scaled_reward": 0.01547599770128727, | |
| "rewards/format_reward": 0.8125, | |
| "step": 433 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1180.0833740234375, | |
| "epoch": 0.496, | |
| "grad_norm": 0.585704505443573, | |
| "kl": 0.0245361328125, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.2038, | |
| "num_tokens": 41370631.0, | |
| "reward": 1.3207928240299225, | |
| "reward_std": 0.9274136871099472, | |
| "rewards/cosine_scaled_reward": 0.2124797385185957, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1271.5417175292969, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 0.4473916292190552, | |
| "kl": 0.030487060546875, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.3724, | |
| "num_tokens": 41439423.0, | |
| "reward": 0.6336492225527763, | |
| "reward_std": 0.540140762925148, | |
| "rewards/cosine_scaled_reward": -0.11025873199105263, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1551.9792175292969, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 0.3577360510826111, | |
| "kl": 0.0240631103515625, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.3034, | |
| "num_tokens": 41521970.0, | |
| "reward": 0.7670895345509052, | |
| "reward_std": 0.7583608031272888, | |
| "rewards/cosine_scaled_reward": -0.012288582162000239, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1145.5833587646484, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 0.41388896107673645, | |
| "kl": 0.02911376953125, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.2609, | |
| "num_tokens": 41584824.0, | |
| "reward": 1.317168042063713, | |
| "reward_std": 0.6604329124093056, | |
| "rewards/cosine_scaled_reward": 0.21066732332110405, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 437 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 888.7917175292969, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 1.7249231338500977, | |
| "kl": 0.05987548828125, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.3252, | |
| "num_tokens": 41635646.0, | |
| "reward": 1.056243896484375, | |
| "reward_std": 0.5137106534093618, | |
| "rewards/cosine_scaled_reward": 0.04895526496693492, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.1458740234375, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 0.5317440629005432, | |
| "kl": 0.0291748046875, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.2548, | |
| "num_tokens": 41691807.0, | |
| "reward": 0.9232819229364395, | |
| "reward_std": 0.62980717420578, | |
| "rewards/cosine_scaled_reward": 0.013724284246563911, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 439 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1725.8542175292969, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 0.3497353196144104, | |
| "kl": 0.0239105224609375, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.1759, | |
| "num_tokens": 41783018.0, | |
| "reward": 0.6551067333202809, | |
| "reward_std": 0.9818321466445923, | |
| "rewards/cosine_scaled_reward": -0.06827997602522373, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1217.1875457763672, | |
| "epoch": 0.504, | |
| "grad_norm": 0.35091596841812134, | |
| "kl": 0.02471923828125, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.2462, | |
| "num_tokens": 41849957.0, | |
| "reward": 0.8108302969485521, | |
| "reward_std": 0.4449946694076061, | |
| "rewards/cosine_scaled_reward": -0.021668191999197006, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 441 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1427.3333740234375, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 0.424146831035614, | |
| "kl": 0.02996826171875, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.1381, | |
| "num_tokens": 41927001.0, | |
| "reward": 0.6767252758145332, | |
| "reward_std": 0.43887148424983025, | |
| "rewards/cosine_scaled_reward": -0.06788737326860428, | |
| "rewards/format_reward": 0.8125, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1055.1042022705078, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 0.3556446135044098, | |
| "kl": 0.02960205078125, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.3331, | |
| "num_tokens": 41985806.0, | |
| "reward": 0.8476575687527657, | |
| "reward_std": 0.3729449659585953, | |
| "rewards/cosine_scaled_reward": -0.03450455144047737, | |
| "rewards/format_reward": 0.9166666567325592, | |
| "step": 443 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1387.104232788086, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 0.35173964500427246, | |
| "kl": 0.0280914306640625, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.2315, | |
| "num_tokens": 42061015.0, | |
| "reward": 1.0118909031152725, | |
| "reward_std": 0.5946892201900482, | |
| "rewards/cosine_scaled_reward": 0.05802877992391586, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1481.416732788086, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 0.4023115038871765, | |
| "kl": 0.02587890625, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.1811, | |
| "num_tokens": 42140733.0, | |
| "reward": 0.6685292148031294, | |
| "reward_std": 0.4582846313714981, | |
| "rewards/cosine_scaled_reward": -0.040735377464443445, | |
| "rewards/format_reward": 0.75, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1054.937515258789, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 0.4524555504322052, | |
| "kl": 0.0357666015625, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.175, | |
| "num_tokens": 42199530.0, | |
| "reward": 1.0812687873840332, | |
| "reward_std": 0.7833655476570129, | |
| "rewards/cosine_scaled_reward": 0.08230104623362422, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1313.1875305175781, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 0.44180259108543396, | |
| "kl": 0.030792236328125, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.1406, | |
| "num_tokens": 42270315.0, | |
| "reward": 0.6780117899179459, | |
| "reward_std": 0.3482256345450878, | |
| "rewards/cosine_scaled_reward": -0.07766077481210232, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 447 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 976.1250152587891, | |
| "epoch": 0.512, | |
| "grad_norm": 0.3419904410839081, | |
| "kl": 0.02667236328125, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.2378, | |
| "num_tokens": 42324861.0, | |
| "reward": 1.2964210510253906, | |
| "reward_std": 0.7425139099359512, | |
| "rewards/cosine_scaled_reward": 0.15862719155848026, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1269.7916870117188, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 0.4110361635684967, | |
| "kl": 0.02386474609375, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.3271, | |
| "num_tokens": 42393947.0, | |
| "reward": 0.8870633244514465, | |
| "reward_std": 0.6967166736721992, | |
| "rewards/cosine_scaled_reward": -0.004385008476674557, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 449 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 702.6250305175781, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 0.5267137289047241, | |
| "kl": 0.03900146484375, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.1521, | |
| "num_tokens": 42435545.0, | |
| "reward": 1.109239935874939, | |
| "reward_std": 0.573906421661377, | |
| "rewards/cosine_scaled_reward": 0.05461995178484358, | |
| "rewards/format_reward": 1.0, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1432.2083740234375, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 0.40656542778015137, | |
| "kl": 0.030517578125, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.2169, | |
| "num_tokens": 42512091.0, | |
| "reward": 0.9379907790571451, | |
| "reward_std": 0.6935139521956444, | |
| "rewards/cosine_scaled_reward": 0.03149538184516132, | |
| "rewards/format_reward": 0.875, | |
| "step": 451 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1116.3542022705078, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 0.5056872367858887, | |
| "kl": 0.027923583984375, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.3907, | |
| "num_tokens": 42573776.0, | |
| "reward": 0.6782356053590775, | |
| "reward_std": 0.5416813492774963, | |
| "rewards/cosine_scaled_reward": -0.09838221129029989, | |
| "rewards/format_reward": 0.875, | |
| "step": 452 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1324.8334045410156, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 0.4419814646244049, | |
| "kl": 0.031219482421875, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.2348, | |
| "num_tokens": 42645294.0, | |
| "reward": 1.0193464905023575, | |
| "reward_std": 0.6786343604326248, | |
| "rewards/cosine_scaled_reward": 0.06175656849518418, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 453 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 933.8541717529297, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 0.44726788997650146, | |
| "kl": 0.0369873046875, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.36, | |
| "num_tokens": 42698207.0, | |
| "reward": 1.0875123143196106, | |
| "reward_std": 0.31689387187361717, | |
| "rewards/cosine_scaled_reward": 0.05417281948029995, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 454 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1275.4791870117188, | |
| "epoch": 0.52, | |
| "grad_norm": 0.5583041310310364, | |
| "kl": 0.02591705322265625, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.306, | |
| "num_tokens": 42767536.0, | |
| "reward": 1.0069229509681463, | |
| "reward_std": 0.4878829885274172, | |
| "rewards/cosine_scaled_reward": 0.08679477497935295, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1401.6458740234375, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 0.40996938943862915, | |
| "kl": 0.02996826171875, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.3287, | |
| "num_tokens": 42842915.0, | |
| "reward": 0.8078549057245255, | |
| "reward_std": 0.7320556342601776, | |
| "rewards/cosine_scaled_reward": 0.018510787514969707, | |
| "rewards/format_reward": 0.7708333283662796, | |
| "step": 456 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 806.083366394043, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 0.48604169487953186, | |
| "kl": 0.03582763671875, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.2165, | |
| "num_tokens": 42889905.0, | |
| "reward": 1.1789227575063705, | |
| "reward_std": 0.38530726544559, | |
| "rewards/cosine_scaled_reward": 0.09987803548574448, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 457 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1071.4375, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 0.4074256122112274, | |
| "kl": 0.02691650390625, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.2512, | |
| "num_tokens": 42949794.0, | |
| "reward": 1.0278047621250153, | |
| "reward_std": 0.7519606053829193, | |
| "rewards/cosine_scaled_reward": 0.04515235684812069, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 458 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1211.7500610351562, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 0.4030369222164154, | |
| "kl": 0.029937744140625, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.3212, | |
| "num_tokens": 43017036.0, | |
| "reward": 0.8210784047842026, | |
| "reward_std": 0.6628520265221596, | |
| "rewards/cosine_scaled_reward": -0.037377479020506144, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 459 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1466.2916870117188, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 0.4611579477787018, | |
| "kl": 0.024932861328125, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.2586, | |
| "num_tokens": 43095800.0, | |
| "reward": 1.166172817349434, | |
| "reward_std": 1.0088868141174316, | |
| "rewards/cosine_scaled_reward": 0.13516972260549664, | |
| "rewards/format_reward": 0.8958333283662796, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1250.2500305175781, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 0.35799574851989746, | |
| "kl": 0.02569580078125, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.2041, | |
| "num_tokens": 43163744.0, | |
| "reward": 0.8057873249053955, | |
| "reward_std": 0.647061862051487, | |
| "rewards/cosine_scaled_reward": -0.024189693154767156, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 461 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1080.1458740234375, | |
| "epoch": 0.528, | |
| "grad_norm": 0.4635910093784332, | |
| "kl": 0.03936767578125, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.3997, | |
| "num_tokens": 43223661.0, | |
| "reward": 0.6495188176631927, | |
| "reward_std": 0.4584726169705391, | |
| "rewards/cosine_scaled_reward": -0.14399060979485512, | |
| "rewards/format_reward": 0.9375, | |
| "step": 462 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1144.8750534057617, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 0.4683811068534851, | |
| "kl": 0.03570556640625, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.3718, | |
| "num_tokens": 43286439.0, | |
| "reward": 1.1641941219568253, | |
| "reward_std": 0.6467169672250748, | |
| "rewards/cosine_scaled_reward": 0.15501370280981064, | |
| "rewards/format_reward": 0.8541666567325592, | |
| "step": 463 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 997.0416870117188, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 4.38219690322876, | |
| "kl": 0.05279541015625, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.2404, | |
| "num_tokens": 43342619.0, | |
| "reward": 0.8943421989679337, | |
| "reward_std": 0.589733824133873, | |
| "rewards/cosine_scaled_reward": -0.04241223679855466, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1403.979248046875, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 0.488127201795578, | |
| "kl": 0.029754638671875, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.3311, | |
| "num_tokens": 43417882.0, | |
| "reward": 0.8888783566653728, | |
| "reward_std": 0.5945712774991989, | |
| "rewards/cosine_scaled_reward": 0.03818914666771889, | |
| "rewards/format_reward": 0.8125, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1697.6250610351562, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 0.26276934146881104, | |
| "kl": 0.01947784423828125, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.2593, | |
| "num_tokens": 43507456.0, | |
| "reward": 0.6314441710710526, | |
| "reward_std": 0.7540897130966187, | |
| "rewards/cosine_scaled_reward": -0.09052791446447372, | |
| "rewards/format_reward": 0.8124999850988388, | |
| "step": 466 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1293.041732788086, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 8.432247161865234, | |
| "kl": 0.218963623046875, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.143, | |
| "num_tokens": 43577394.0, | |
| "reward": 0.8170301653444767, | |
| "reward_std": 0.5284570157527924, | |
| "rewards/cosine_scaled_reward": -0.018568256869912148, | |
| "rewards/format_reward": 0.8541666641831398, | |
| "step": 467 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1465.8750457763672, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 0.49368521571159363, | |
| "kl": 0.0333099365234375, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.1917, | |
| "num_tokens": 43655832.0, | |
| "reward": 1.2481490820646286, | |
| "reward_std": 0.7606203258037567, | |
| "rewards/cosine_scaled_reward": 0.18657452706247568, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 468 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1239.0000610351562, | |
| "epoch": 0.536, | |
| "grad_norm": 0.4890661835670471, | |
| "kl": 0.0341796875, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.2292, | |
| "num_tokens": 43723566.0, | |
| "reward": 0.7020466178655624, | |
| "reward_std": 0.5384815186262131, | |
| "rewards/cosine_scaled_reward": -0.08647668547928333, | |
| "rewards/format_reward": 0.875, | |
| "step": 469 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1251.4791870117188, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 0.4875927269458771, | |
| "kl": 0.029296875, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.1285, | |
| "num_tokens": 43791641.0, | |
| "reward": 1.112037941813469, | |
| "reward_std": 0.33521461114287376, | |
| "rewards/cosine_scaled_reward": 0.1185189438983798, | |
| "rewards/format_reward": 0.875, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1172.4792022705078, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 0.42823565006256104, | |
| "kl": 0.030059814453125, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.2834, | |
| "num_tokens": 43855648.0, | |
| "reward": 0.8745324984192848, | |
| "reward_std": 0.6210919320583344, | |
| "rewards/cosine_scaled_reward": -0.00023377127945423126, | |
| "rewards/format_reward": 0.875, | |
| "step": 471 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 705.2500152587891, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 0.46066632866859436, | |
| "kl": 0.03387451171875, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.1864, | |
| "num_tokens": 43897018.0, | |
| "reward": 1.1916231364011765, | |
| "reward_std": 0.5989440307021141, | |
| "rewards/cosine_scaled_reward": 0.10622821375727654, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 472 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 974.3125152587891, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 0.5403279662132263, | |
| "kl": 0.035491943359375, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.2905, | |
| "num_tokens": 43952113.0, | |
| "reward": 0.7929527014493942, | |
| "reward_std": 0.28072798252105713, | |
| "rewards/cosine_scaled_reward": -0.09310700930655003, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 473 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 840.8542022705078, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 0.4049588143825531, | |
| "kl": 0.030242919921875, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.2409, | |
| "num_tokens": 44000700.0, | |
| "reward": 0.9743078052997589, | |
| "reward_std": 0.5338472779840231, | |
| "rewards/cosine_scaled_reward": -0.0024294480681419373, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 474 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1120.6667022705078, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 0.5333501696586609, | |
| "kl": 0.03533935546875, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.2386, | |
| "num_tokens": 44062388.0, | |
| "reward": 0.8754686489701271, | |
| "reward_std": 0.7480637580156326, | |
| "rewards/cosine_scaled_reward": 0.00023431982845067978, | |
| "rewards/format_reward": 0.875, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1014.0417175292969, | |
| "epoch": 0.544, | |
| "grad_norm": 0.42668408155441284, | |
| "kl": 0.032989501953125, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.2809, | |
| "num_tokens": 44120026.0, | |
| "reward": 1.238448478281498, | |
| "reward_std": 0.5114860832691193, | |
| "rewards/cosine_scaled_reward": 0.14005756378173828, | |
| "rewards/format_reward": 0.9583333283662796, | |
| "step": 476 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 760.7708587646484, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 0.4274328052997589, | |
| "kl": 0.03369140625, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.1905, | |
| "num_tokens": 44163635.0, | |
| "reward": 1.3580638319253922, | |
| "reward_std": 0.6236323565244675, | |
| "rewards/cosine_scaled_reward": 0.18944857362657785, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 477 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1135.5417022705078, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 0.3638131022453308, | |
| "kl": 0.028350830078125, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.2326, | |
| "num_tokens": 44226181.0, | |
| "reward": 0.9669682309031487, | |
| "reward_std": 0.5843757539987564, | |
| "rewards/cosine_scaled_reward": 0.014734117314219475, | |
| "rewards/format_reward": 0.9375, | |
| "step": 478 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1052.4792175292969, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 0.4436745047569275, | |
| "kl": 0.03021240234375, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.2299, | |
| "num_tokens": 44284902.0, | |
| "reward": 1.0697922855615616, | |
| "reward_std": 0.6758029907941818, | |
| "rewards/cosine_scaled_reward": 0.08697945438325405, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 479 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 777.0833587646484, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 0.3994079530239105, | |
| "kl": 0.03125, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.128, | |
| "num_tokens": 44330476.0, | |
| "reward": 1.0248275697231293, | |
| "reward_std": 0.6758236438035965, | |
| "rewards/cosine_scaled_reward": 0.012413740856572986, | |
| "rewards/format_reward": 1.0, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1162.0000610351562, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 0.4814600646495819, | |
| "kl": 0.03363037109375, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.2608, | |
| "num_tokens": 44393998.0, | |
| "reward": 0.7157177105545998, | |
| "reward_std": 0.44291423074901104, | |
| "rewards/cosine_scaled_reward": -0.07964113913476467, | |
| "rewards/format_reward": 0.875, | |
| "step": 481 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 600.6458587646484, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 0.5359194874763489, | |
| "kl": 0.041259765625, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.1478, | |
| "num_tokens": 44430977.0, | |
| "reward": 1.1498132944107056, | |
| "reward_std": 0.3962139468640089, | |
| "rewards/cosine_scaled_reward": 0.07490663533098996, | |
| "rewards/format_reward": 1.0, | |
| "step": 482 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1325.2291717529297, | |
| "epoch": 0.552, | |
| "grad_norm": 0.39830389618873596, | |
| "kl": 0.02691650390625, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.1685, | |
| "num_tokens": 44502010.0, | |
| "reward": 0.9287998229265213, | |
| "reward_std": 0.8306932374835014, | |
| "rewards/cosine_scaled_reward": 0.0477332123555243, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 483 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 902.9792175292969, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 0.47691088914871216, | |
| "kl": 0.03839111328125, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.3136, | |
| "num_tokens": 44553171.0, | |
| "reward": 1.4516296237707138, | |
| "reward_std": 0.355313777923584, | |
| "rewards/cosine_scaled_reward": 0.26748147048056126, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 484 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1307.2500610351562, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 0.4514337182044983, | |
| "kl": 0.0279541015625, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.3272, | |
| "num_tokens": 44623797.0, | |
| "reward": 0.9350142329931259, | |
| "reward_std": 0.3977475240826607, | |
| "rewards/cosine_scaled_reward": 0.030007120221853256, | |
| "rewards/format_reward": 0.875, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1277.8333892822266, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 0.3418792486190796, | |
| "kl": 0.027435302734375, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.347, | |
| "num_tokens": 44693005.0, | |
| "reward": 1.2393681406974792, | |
| "reward_std": 0.827592596411705, | |
| "rewards/cosine_scaled_reward": 0.19260072708129883, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 486 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1803.3542175292969, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 0.390234112739563, | |
| "kl": 0.0197601318359375, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.2833, | |
| "num_tokens": 44787534.0, | |
| "reward": 0.41170351952314377, | |
| "reward_std": 0.6201038360595703, | |
| "rewards/cosine_scaled_reward": -0.1587315769866109, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 487 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1535.7709197998047, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 0.3768185079097748, | |
| "kl": 0.025360107421875, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.2207, | |
| "num_tokens": 44868931.0, | |
| "reward": 0.5452676527202129, | |
| "reward_std": 0.5340722799301147, | |
| "rewards/cosine_scaled_reward": -0.15444950759410858, | |
| "rewards/format_reward": 0.8541666567325592, | |
| "step": 488 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 628.0208358764648, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 0.6455451250076294, | |
| "kl": 0.05511474609375, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.1924, | |
| "num_tokens": 44906402.0, | |
| "reward": 1.6253060102462769, | |
| "reward_std": 0.5070550180971622, | |
| "rewards/cosine_scaled_reward": 0.3230696848477237, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 489 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1089.416732788086, | |
| "epoch": 0.56, | |
| "grad_norm": 0.5013791918754578, | |
| "kl": 0.03314208984375, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.1106, | |
| "num_tokens": 44966572.0, | |
| "reward": 1.0448856204748154, | |
| "reward_std": 0.5141267701983452, | |
| "rewards/cosine_scaled_reward": 0.04327613674104214, | |
| "rewards/format_reward": 0.9583333283662796, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1648.9583740234375, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 0.4632658064365387, | |
| "kl": 0.0217742919921875, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.4465, | |
| "num_tokens": 45053672.0, | |
| "reward": 0.9366854764521122, | |
| "reward_std": 0.3563379105180502, | |
| "rewards/cosine_scaled_reward": 0.09334271214902401, | |
| "rewards/format_reward": 0.75, | |
| "step": 491 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 934.3958587646484, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 0.5427266955375671, | |
| "kl": 0.0307159423828125, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.4325, | |
| "num_tokens": 45106761.0, | |
| "reward": 0.9387294054031372, | |
| "reward_std": 0.27895698696374893, | |
| "rewards/cosine_scaled_reward": -0.009801974520087242, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 492 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 869.5208740234375, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 0.4428861141204834, | |
| "kl": 0.035308837890625, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0663, | |
| "num_tokens": 45156058.0, | |
| "reward": 0.8442247211933136, | |
| "reward_std": 0.5361873507499695, | |
| "rewards/cosine_scaled_reward": -0.06747098336927593, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 493 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1189.5416870117188, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 0.5454748868942261, | |
| "kl": 0.03359222412109375, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.1418, | |
| "num_tokens": 45220542.0, | |
| "reward": 0.7198501382954419, | |
| "reward_std": 0.4353892542421818, | |
| "rewards/cosine_scaled_reward": -0.05674161948263645, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 494 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 911.8541870117188, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 0.5451707243919373, | |
| "kl": 0.037506103515625, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.4167, | |
| "num_tokens": 45272705.0, | |
| "reward": 1.2369663715362549, | |
| "reward_std": 0.6268434636294842, | |
| "rewards/cosine_scaled_reward": 0.14973314851522446, | |
| "rewards/format_reward": 0.9375, | |
| "step": 495 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 874.5417022705078, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 0.4017329216003418, | |
| "kl": 0.03363037109375, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.3653, | |
| "num_tokens": 45322435.0, | |
| "reward": 1.1579137444496155, | |
| "reward_std": 0.6667543575167656, | |
| "rewards/cosine_scaled_reward": 0.09979016706347466, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 496 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 957.6042022705078, | |
| "epoch": 0.568, | |
| "grad_norm": 0.481536865234375, | |
| "kl": 0.0282135009765625, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.2427, | |
| "num_tokens": 45376566.0, | |
| "reward": 1.3649472296237946, | |
| "reward_std": 0.6812234669923782, | |
| "rewards/cosine_scaled_reward": 0.1928902603685856, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 497 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1804.0000610351562, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 0.40501669049263, | |
| "kl": 0.0171966552734375, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.4378, | |
| "num_tokens": 45471132.0, | |
| "reward": 0.38411422073841095, | |
| "reward_std": 0.7113517224788666, | |
| "rewards/cosine_scaled_reward": -0.15169288171455264, | |
| "rewards/format_reward": 0.6875, | |
| "step": 498 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1580.2500305175781, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 0.6002418398857117, | |
| "kl": 0.034637451171875, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.2261, | |
| "num_tokens": 45554832.0, | |
| "reward": 0.4394306093454361, | |
| "reward_std": 0.6632324308156967, | |
| "rewards/cosine_scaled_reward": -0.16570135951042175, | |
| "rewards/format_reward": 0.7708333283662796, | |
| "step": 499 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1098.8542175292969, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.4748859703540802, | |
| "kl": 0.027740478515625, | |
| "learning_rate": 1e-07, | |
| "loss": 0.3975, | |
| "num_tokens": 45615545.0, | |
| "reward": 0.7702142149209976, | |
| "reward_std": 0.559198834002018, | |
| "rewards/cosine_scaled_reward": -0.07322624698281288, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.16526810049655613, | |
| "train_runtime": 49656.4521, | |
| "train_samples_per_second": 0.483, | |
| "train_steps_per_second": 0.01 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |