diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,7 +2,7 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.5714285714285714, + "epoch": 0.2857142857142857, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, @@ -11,7511 +11,7011 @@ "log_history": [ { "clip_ratio": 0.0, - "completion_length": 2878.5416870117188, - "epoch": 0.001142857142857143, - "grad_norm": 0.2171134501695633, + "completion_length": 3023.0, + "epoch": 0.0005714285714285715, + "grad_norm": 0.2460898458957672, "kl": 0.0, "learning_rate": 2e-08, - "loss": -0.0109, - "num_tokens": 146240.0, - "reward": 0.09066538512706757, - "reward_std": 0.19429835677146912, - "rewards/cosine_scaled_reward": -0.14216730743646622, - "rewards/format_reward": 0.375, + "loss": -0.0314, + "num_tokens": 151404.0, + "reward": -0.17859874665737152, + "reward_std": 0.18563616648316383, + "rewards/cosine_scaled_reward": -0.08929938450455666, "step": 1 }, { "clip_ratio": 0.0, - "completion_length": 2564.8333435058594, - "epoch": 0.002285714285714286, - "grad_norm": 0.27501821517944336, + "completion_length": 2731.3958740234375, + "epoch": 0.001142857142857143, + "grad_norm": 0.22445940971374512, "kl": 0.0, "learning_rate": 4e-08, - "loss": 0.0684, - "num_tokens": 277074.0, - "reward": 0.5692815706133842, - "reward_std": 0.6707231402397156, - "rewards/cosine_scaled_reward": 0.013807429000735283, - "rewards/format_reward": 0.5416666772216558, + "loss": 0.0519, + "num_tokens": 288319.0, + "reward": -0.535461600869894, + "reward_std": 0.16202664375305176, + "rewards/cosine_scaled_reward": -0.2677307929843664, "step": 2 }, { "clip_ratio": 0.0, - "completion_length": 2944.854217529297, - "epoch": 0.0034285714285714284, - "grad_norm": 0.2801656723022461, - "kl": 0.0006022453308105469, + "completion_length": 2233.5416717529297, + "epoch": 0.0017142857142857142, + "grad_norm": 0.24249283969402313, + "kl": 0.00036716461181640625, "learning_rate": 6e-08, - "loss": 0.0777, - "num_tokens": 426995.0, - "reward": -0.32244681287556887, - "reward_std": 0.3240165375173092, - "rewards/cosine_scaled_reward": -0.2862233966588974, - "rewards/format_reward": 0.24999999813735485, + "loss": 0.0576, + "num_tokens": 401025.0, + "reward": 0.33875009417533875, + "reward_std": 0.6338529586791992, + "rewards/cosine_scaled_reward": 0.16937505267560482, "step": 3 }, { "clip_ratio": 0.0, - "completion_length": 2979.8125, - "epoch": 0.004571428571428572, - "grad_norm": 0.2610093951225281, - "kl": 0.000518798828125, + "completion_length": 2376.2500915527344, + "epoch": 0.002285714285714286, + "grad_norm": 0.3028571605682373, + "kl": 0.0006170272827148438, "learning_rate": 8e-08, - "loss": 0.1125, - "num_tokens": 577748.0, - "reward": -0.16339990682899952, - "reward_std": 0.43585263565182686, - "rewards/cosine_scaled_reward": -0.23794995370553806, - "rewards/format_reward": 0.3125, + "loss": 0.0773, + "num_tokens": 521001.0, + "reward": -0.4882083088159561, + "reward_std": 0.4496277957223356, + "rewards/cosine_scaled_reward": -0.24410414695739746, "step": 4 }, { "clip_ratio": 0.0, - "completion_length": 2759.1458740234375, - "epoch": 0.005714285714285714, - "grad_norm": 0.2045336216688156, - "kl": 0.00048828125, + "completion_length": 3372.3750610351562, + "epoch": 0.002857142857142857, + "grad_norm": 0.21095335483551025, + "kl": 0.0006723403930664062, "learning_rate": 1e-07, - "loss": -0.0202, - "num_tokens": 717849.0, - "reward": 0.541385006159544, - "reward_std": 0.8971522003412247, - "rewards/cosine_scaled_reward": 0.010275840759277344, - "rewards/format_reward": 0.5208333358168602, + "loss": 0.007, + "num_tokens": 689631.0, + "reward": -0.5746253430843353, + "reward_std": 0.23810617998242378, + "rewards/cosine_scaled_reward": -0.28731267154216766, "step": 5 }, { "clip_ratio": 0.0, - "completion_length": 2596.8959350585938, - "epoch": 0.006857142857142857, - "grad_norm": 0.28218525648117065, - "kl": 0.0006213188171386719, + "completion_length": 2820.375030517578, + "epoch": 0.0034285714285714284, + "grad_norm": 0.33144864439964294, + "kl": 0.0005979537963867188, "learning_rate": 1.2e-07, - "loss": 0.0694, - "num_tokens": 850060.0, - "reward": 0.18730801343917847, - "reward_std": 0.7535099536180496, - "rewards/cosine_scaled_reward": -0.1667626677080989, - "rewards/format_reward": 0.5208333358168602, + "loss": -0.0916, + "num_tokens": 831357.0, + "reward": -0.4876829609274864, + "reward_std": 0.20684241317212582, + "rewards/cosine_scaled_reward": -0.24384147115051746, "step": 6 }, { "clip_ratio": 0.0, - "completion_length": 2559.14599609375, - "epoch": 0.008, - "grad_norm": 0.27962586283683777, - "kl": 0.0004992485046386719, + "completion_length": 3243.8750610351562, + "epoch": 0.004, + "grad_norm": 0.20102664828300476, + "kl": 0.0005273818969726562, "learning_rate": 1.4e-07, - "loss": 0.0313, - "num_tokens": 980555.0, - "reward": 0.7750245705246925, - "reward_std": 0.49819300323724747, - "rewards/cosine_scaled_reward": 0.10626226477324963, - "rewards/format_reward": 0.5624999925494194, + "loss": -0.0443, + "num_tokens": 992799.0, + "reward": -0.30651520285755396, + "reward_std": 0.3921822514384985, + "rewards/cosine_scaled_reward": -0.15325760166160762, "step": 7 }, { "clip_ratio": 0.0, - "completion_length": 2791.5625610351562, - "epoch": 0.009142857142857144, - "grad_norm": 0.2271980494260788, - "kl": 0.0005779266357421875, + "completion_length": 3092.625, + "epoch": 0.004571428571428572, + "grad_norm": 0.23749692738056183, + "kl": 0.0005922317504882812, "learning_rate": 1.6e-07, - "loss": 0.1349, - "num_tokens": 1122434.0, - "reward": 0.28142981859855354, - "reward_std": 0.6572991460561752, - "rewards/cosine_scaled_reward": -0.10928510129451752, - "rewards/format_reward": 0.4999999925494194, + "loss": -0.0614, + "num_tokens": 1146921.0, + "reward": -0.8370707631111145, + "reward_std": 0.16819308325648308, + "rewards/cosine_scaled_reward": -0.41853538155555725, "step": 8 }, { "clip_ratio": 0.0, - "completion_length": 3071.0625610351562, - "epoch": 0.010285714285714285, - "grad_norm": 0.21379557251930237, - "kl": 0.0006103515625, + "completion_length": 2227.9166717529297, + "epoch": 0.005142857142857143, + "grad_norm": 0.3016290068626404, + "kl": 0.0005288124084472656, "learning_rate": 1.8e-07, - "loss": 0.0952, - "num_tokens": 1277807.0, - "reward": -0.12088486924767494, - "reward_std": 0.6939665377140045, - "rewards/cosine_scaled_reward": -0.19585910439491272, - "rewards/format_reward": 0.2708333283662796, + "loss": -0.0113, + "num_tokens": 1259777.0, + "reward": -0.15989744663238525, + "reward_std": 0.25975861586630344, + "rewards/cosine_scaled_reward": -0.07994873821735382, "step": 9 }, { "clip_ratio": 0.0, - "completion_length": 2429.5000610351562, - "epoch": 0.011428571428571429, - "grad_norm": 0.27965009212493896, - "kl": 0.0006093978881835938, + "completion_length": 3047.4376220703125, + "epoch": 0.005714285714285714, + "grad_norm": 0.21191374957561493, + "kl": 0.000568389892578125, "learning_rate": 2e-07, - "loss": 0.0617, - "num_tokens": 1402259.0, - "reward": 0.2807231955230236, - "reward_std": 0.5721964091062546, - "rewards/cosine_scaled_reward": -0.1617217343300581, - "rewards/format_reward": 0.604166679084301, + "loss": -0.0152, + "num_tokens": 1411430.0, + "reward": -0.019763831049203873, + "reward_std": 0.8465264737606049, + "rewards/cosine_scaled_reward": -0.00988190807402134, "step": 10 }, { "clip_ratio": 0.0, - "completion_length": 2379.6250610351562, - "epoch": 0.012571428571428572, - "grad_norm": 0.389694482088089, - "kl": 0.0004887580871582031, + "completion_length": 2482.4375, + "epoch": 0.006285714285714286, + "grad_norm": 0.330398291349411, + "kl": 0.0006580352783203125, "learning_rate": 2.1999999999999998e-07, - "loss": 0.1212, - "num_tokens": 1524605.0, - "reward": 0.8620961308479309, - "reward_std": 0.8407739326357841, - "rewards/cosine_scaled_reward": 0.09771470725536346, - "rewards/format_reward": 0.6666666716337204, + "loss": 0.053, + "num_tokens": 1536179.0, + "reward": -0.4834251650609076, + "reward_std": 0.5014891251921654, + "rewards/cosine_scaled_reward": -0.2417125750798732, "step": 11 }, { "clip_ratio": 0.0, - "completion_length": 2639.5625610351562, - "epoch": 0.013714285714285714, - "grad_norm": 0.20471037924289703, - "kl": 0.0005512237548828125, + "completion_length": 2487.2500610351562, + "epoch": 0.006857142857142857, + "grad_norm": 0.30102092027664185, + "kl": 0.000583648681640625, "learning_rate": 2.4e-07, - "loss": 0.0464, - "num_tokens": 1659272.0, - "reward": 0.5478887595236301, - "reward_std": 0.8274511396884918, - "rewards/cosine_scaled_reward": -0.017722302465699613, - "rewards/format_reward": 0.5833333283662796, + "loss": -0.0912, + "num_tokens": 1661063.0, + "reward": -0.14752477407455444, + "reward_std": 0.5868879407644272, + "rewards/cosine_scaled_reward": -0.07376237958669662, "step": 12 }, { "clip_ratio": 0.0, - "completion_length": 2278.9375610351562, - "epoch": 0.014857142857142857, - "grad_norm": 0.30582067370414734, - "kl": 0.0005240440368652344, + "completion_length": 2876.229217529297, + "epoch": 0.0074285714285714285, + "grad_norm": 0.25117188692092896, + "kl": 0.0005583763122558594, "learning_rate": 2.6e-07, - "loss": 0.0729, - "num_tokens": 1776629.0, - "reward": 0.45567256212234497, - "reward_std": 0.7415833473205566, - "rewards/cosine_scaled_reward": -0.07424705754965544, - "rewards/format_reward": 0.6041666716337204, + "loss": -0.0352, + "num_tokens": 1804702.0, + "reward": -0.08496717864181846, + "reward_std": 0.5994590483605862, + "rewards/cosine_scaled_reward": -0.042483578145038337, "step": 13 }, { "clip_ratio": 0.0, - "completion_length": 2861.3958740234375, - "epoch": 0.016, - "grad_norm": 0.25264662504196167, - "kl": 0.0006380081176757812, + "completion_length": 2376.166717529297, + "epoch": 0.008, + "grad_norm": 0.34051504731178284, + "kl": 0.0006012916564941406, "learning_rate": 2.8e-07, - "loss": 0.0145, - "num_tokens": 1922556.0, - "reward": -0.11359720956534147, - "reward_std": 0.47907784581184387, - "rewards/cosine_scaled_reward": -0.22346526756882668, - "rewards/format_reward": 0.3333333283662796, + "loss": 0.1844, + "num_tokens": 1924458.0, + "reward": 0.5019294954836369, + "reward_std": 0.4014289937913418, + "rewards/cosine_scaled_reward": 0.2509647514671087, "step": 14 }, { "clip_ratio": 0.0, - "completion_length": 3137.7083740234375, - "epoch": 0.017142857142857144, - "grad_norm": 0.218108668923378, - "kl": 0.0006160736083984375, + "completion_length": 2104.3541870117188, + "epoch": 0.008571428571428572, + "grad_norm": 0.27839282155036926, + "kl": 0.0004687309265136719, "learning_rate": 3e-07, - "loss": 0.0771, - "num_tokens": 2081608.0, - "reward": 0.05102095380425453, - "reward_std": 0.49165723100304604, - "rewards/cosine_scaled_reward": -0.07865619286894798, - "rewards/format_reward": 0.2083333395421505, + "loss": 0.0602, + "num_tokens": 2031107.0, + "reward": -0.2739022574387491, + "reward_std": 0.5232805069535971, + "rewards/cosine_scaled_reward": -0.13695112499408424, "step": 15 }, { "clip_ratio": 0.0, - "completion_length": 2212.479248046875, - "epoch": 0.018285714285714287, - "grad_norm": 0.3894731402397156, - "kl": 0.0005140304565429688, + "completion_length": 3022.2709350585938, + "epoch": 0.009142857142857144, + "grad_norm": 0.23329903185367584, + "kl": 0.0007066726684570312, "learning_rate": 3.2e-07, - "loss": 0.1743, - "num_tokens": 2195313.0, - "reward": 0.7095662355422974, - "reward_std": 0.6108940467238426, - "rewards/cosine_scaled_reward": 0.08394978567957878, - "rewards/format_reward": 0.5416666641831398, + "loss": -0.0235, + "num_tokens": 2182272.0, + "reward": -0.28543997276574373, + "reward_std": 0.5580427274107933, + "rewards/cosine_scaled_reward": -0.14271997893229127, "step": 16 }, { "clip_ratio": 0.0, - "completion_length": 3214.9583740234375, - "epoch": 0.019428571428571427, - "grad_norm": 0.22356842458248138, - "kl": 0.0006380081176757812, + "completion_length": 2995.8959350585938, + "epoch": 0.009714285714285713, + "grad_norm": 0.20035724341869354, + "kl": 0.00064849853515625, "learning_rate": 3.4000000000000003e-07, - "loss": -0.0182, - "num_tokens": 2357947.0, - "reward": -0.35805825144052505, - "reward_std": 0.27831941843032837, - "rewards/cosine_scaled_reward": -0.29361244291067123, - "rewards/format_reward": 0.22916666232049465, + "loss": 0.0584, + "num_tokens": 2331607.0, + "reward": -0.49751752614974976, + "reward_std": 0.40262408554553986, + "rewards/cosine_scaled_reward": -0.24875876307487488, "step": 17 }, { "clip_ratio": 0.0, - "completion_length": 2965.0208740234375, - "epoch": 0.02057142857142857, - "grad_norm": 0.2580890357494354, - "kl": 0.0005941390991210938, + "completion_length": 3225.3125, + "epoch": 0.010285714285714285, + "grad_norm": 0.20705807209014893, + "kl": 0.0006122589111328125, "learning_rate": 3.6e-07, - "loss": 0.1942, - "num_tokens": 2508662.0, - "reward": -0.20507963374257088, - "reward_std": 0.4416894242167473, - "rewards/cosine_scaled_reward": -0.23795648105442524, - "rewards/format_reward": 0.2708333395421505, + "loss": 0.0237, + "num_tokens": 2492782.0, + "reward": -0.5619450844824314, + "reward_std": 0.47893428802490234, + "rewards/cosine_scaled_reward": -0.2809725347906351, "step": 18 }, { "clip_ratio": 0.0, - "completion_length": 3202.1875610351562, - "epoch": 0.021714285714285714, - "grad_norm": 0.19615532457828522, - "kl": 0.0005750656127929688, + "completion_length": 2767.8541717529297, + "epoch": 0.010857142857142857, + "grad_norm": 0.25546160340309143, + "kl": 0.0007276535034179688, "learning_rate": 3.7999999999999996e-07, - "loss": 0.0641, - "num_tokens": 2670995.0, - "reward": 0.053066179156303406, - "reward_std": 0.7212408185005188, - "rewards/cosine_scaled_reward": -0.11930023087188601, - "rewards/format_reward": 0.2916666679084301, + "loss": -0.0882, + "num_tokens": 2631327.0, + "reward": -0.18378404527902603, + "reward_std": 0.5495752617716789, + "rewards/cosine_scaled_reward": -0.09189202263951302, "step": 19 }, { "clip_ratio": 0.0, - "completion_length": 2715.6666870117188, - "epoch": 0.022857142857142857, - "grad_norm": 0.24433307349681854, - "kl": 0.0006351470947265625, + "completion_length": 1481.000015258789, + "epoch": 0.011428571428571429, + "grad_norm": 0.3158569931983948, + "kl": 0.0004367828369140625, "learning_rate": 4e-07, - "loss": 0.0429, - "num_tokens": 2808961.0, - "reward": 0.023024218156933784, - "reward_std": 0.6361246034502983, - "rewards/cosine_scaled_reward": -0.1968212267383933, - "rewards/format_reward": 0.4166666679084301, + "loss": 0.0439, + "num_tokens": 2708367.0, + "reward": -0.025459617376327515, + "reward_std": 0.57894092425704, + "rewards/cosine_scaled_reward": -0.01272980123758316, "step": 20 }, { "clip_ratio": 0.0, - "completion_length": 2765.8958740234375, - "epoch": 0.024, - "grad_norm": 0.24205811321735382, - "kl": 0.0005364418029785156, + "completion_length": 1864.2500457763672, + "epoch": 0.012, + "grad_norm": 0.4832952618598938, + "kl": 0.00046253204345703125, "learning_rate": 4.1999999999999995e-07, - "loss": 0.066, - "num_tokens": 2950058.0, - "reward": 0.03793077915906906, - "reward_std": 0.43773240596055984, - "rewards/cosine_scaled_reward": -0.14770127274096012, - "rewards/format_reward": 0.33333333395421505, + "loss": 0.0893, + "num_tokens": 2803731.0, + "reward": -0.20306236669421196, + "reward_std": 0.7009828165173531, + "rewards/cosine_scaled_reward": -0.10153118334710598, "step": 21 }, { "clip_ratio": 0.0, - "completion_length": 3380.0833740234375, - "epoch": 0.025142857142857144, - "grad_norm": 0.19774502515792847, - "kl": 0.0005664825439453125, + "completion_length": 3143.604248046875, + "epoch": 0.012571428571428572, + "grad_norm": 0.20443888008594513, + "kl": 0.0005559921264648438, "learning_rate": 4.3999999999999997e-07, - "loss": 0.0911, - "num_tokens": 3120072.0, - "reward": 0.15241558849811554, - "reward_std": 0.8311697989702225, - "rewards/cosine_scaled_reward": -0.02795886993408203, - "rewards/format_reward": 0.20833333767950535, + "loss": 0.0298, + "num_tokens": 2960960.0, + "reward": 0.664834626019001, + "reward_std": 1.0612835884094238, + "rewards/cosine_scaled_reward": 0.3324173092842102, "step": 22 }, { "clip_ratio": 0.0, - "completion_length": 3330.9375610351562, - "epoch": 0.026285714285714287, - "grad_norm": 0.218012735247612, - "kl": 0.0006666183471679688, + "completion_length": 2191.9791870117188, + "epoch": 0.013142857142857144, + "grad_norm": 0.23772495985031128, + "kl": 0.00048279762268066406, "learning_rate": 4.6e-07, - "loss": 0.0349, - "num_tokens": 3287889.0, - "reward": 0.24341265857219696, - "reward_std": 0.7591935321688652, - "rewards/cosine_scaled_reward": -0.09704366815276444, - "rewards/format_reward": 0.4375, + "loss": 0.0419, + "num_tokens": 3071671.0, + "reward": -0.08595703169703484, + "reward_std": 0.7537456881254911, + "rewards/cosine_scaled_reward": -0.042978519573807716, "step": 23 }, { "clip_ratio": 0.0, - "completion_length": 2302.3959045410156, - "epoch": 0.027428571428571427, - "grad_norm": 0.27331000566482544, - "kl": 0.00044727325439453125, + "completion_length": 2681.1458740234375, + "epoch": 0.013714285714285714, + "grad_norm": 0.2397710382938385, + "kl": 0.0005970001220703125, "learning_rate": 4.8e-07, - "loss": 0.0858, - "num_tokens": 3406318.0, - "reward": 0.6318932324647903, - "reward_std": 0.7281809970736504, - "rewards/cosine_scaled_reward": 0.02427995391190052, - "rewards/format_reward": 0.5833333395421505, + "loss": 0.0064, + "num_tokens": 3206774.0, + "reward": 0.038678646087646484, + "reward_std": 0.3931765630841255, + "rewards/cosine_scaled_reward": 0.01933930814266205, "step": 24 }, { "clip_ratio": 0.0, - "completion_length": 2765.9375610351562, - "epoch": 0.02857142857142857, - "grad_norm": 0.28522786498069763, - "kl": 0.000560760498046875, + "completion_length": 1551.250015258789, + "epoch": 0.014285714285714285, + "grad_norm": 0.3846415579319, + "kl": 0.0004100799560546875, "learning_rate": 5e-07, - "loss": 0.1322, - "num_tokens": 3546859.0, - "reward": 0.09269634401425719, - "reward_std": 0.6214342266321182, - "rewards/cosine_scaled_reward": -0.12031849287450314, - "rewards/format_reward": 0.3333333395421505, + "loss": 0.0557, + "num_tokens": 3287126.0, + "reward": -0.13053925335407257, + "reward_std": 0.4432575963437557, + "rewards/cosine_scaled_reward": -0.06526962295174599, "step": 25 }, { "clip_ratio": 0.0, - "completion_length": 3093.6876220703125, - "epoch": 0.029714285714285714, - "grad_norm": 0.21539199352264404, - "kl": 0.00049591064453125, + "completion_length": 2773.6250610351562, + "epoch": 0.014857142857142857, + "grad_norm": 0.23315444588661194, + "kl": 0.0005998611450195312, "learning_rate": 5.2e-07, - "loss": 0.0489, - "num_tokens": 3704476.0, - "reward": 0.1484052948653698, - "reward_std": 0.8102314993739128, - "rewards/cosine_scaled_reward": -0.07163068139925599, - "rewards/format_reward": 0.29166666604578495, + "loss": 0.0573, + "num_tokens": 3426272.0, + "reward": -0.14446274191141129, + "reward_std": 0.7020265012979507, + "rewards/cosine_scaled_reward": -0.07223137095570564, "step": 26 }, { "clip_ratio": 0.0, - "completion_length": 2976.229248046875, - "epoch": 0.030857142857142857, - "grad_norm": 0.25912725925445557, - "kl": 0.0005688667297363281, + "completion_length": 3283.5, + "epoch": 0.015428571428571429, + "grad_norm": 0.21181795001029968, + "kl": 0.0006422996520996094, "learning_rate": 5.4e-07, - "loss": -0.0511, - "num_tokens": 3856233.0, - "reward": -0.04554035887122154, - "reward_std": 0.3555384576320648, - "rewards/cosine_scaled_reward": -0.15818685293197632, - "rewards/format_reward": 0.27083333395421505, + "loss": -0.0769, + "num_tokens": 3590060.0, + "reward": -0.299600200727582, + "reward_std": 0.4262968748807907, + "rewards/cosine_scaled_reward": -0.14980009896680713, "step": 27 }, { "clip_ratio": 0.0, - "completion_length": 3116.479248046875, - "epoch": 0.032, - "grad_norm": 0.2779461145401001, - "kl": 0.0006647109985351562, + "completion_length": 2844.0833740234375, + "epoch": 0.016, + "grad_norm": 0.21920780837535858, + "kl": 0.000614166259765625, "learning_rate": 5.6e-07, - "loss": 0.0937, - "num_tokens": 4014002.0, - "reward": -0.06345795840024948, - "reward_std": 0.6914098784327507, - "rewards/cosine_scaled_reward": -0.17756231129169464, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0034, + "num_tokens": 3733524.0, + "reward": -0.4807719439268112, + "reward_std": 0.42509571835398674, + "rewards/cosine_scaled_reward": -0.24038597010076046, "step": 28 }, { "clip_ratio": 0.0, - "completion_length": 3091.8125, - "epoch": 0.03314285714285714, - "grad_norm": 0.20903803408145905, - "kl": 0.0004973411560058594, + "completion_length": 2722.250030517578, + "epoch": 0.01657142857142857, + "grad_norm": 0.31188592314720154, + "kl": 0.0006227493286132812, "learning_rate": 5.8e-07, - "loss": 0.0822, - "num_tokens": 4170833.0, - "reward": -0.17993240803480148, - "reward_std": 0.6661744937300682, - "rewards/cosine_scaled_reward": -0.2357995305210352, - "rewards/format_reward": 0.29166667349636555, + "loss": 0.0756, + "num_tokens": 3871068.0, + "reward": 0.08422036468982697, + "reward_std": 0.6395114436745644, + "rewards/cosine_scaled_reward": 0.04211018607020378, "step": 29 }, { "clip_ratio": 0.0, - "completion_length": 3212.3125610351562, - "epoch": 0.03428571428571429, - "grad_norm": 0.19359447062015533, - "kl": 0.0006041526794433594, + "completion_length": 3481.2709350585938, + "epoch": 0.017142857142857144, + "grad_norm": 0.20324808359146118, + "kl": 0.000644683837890625, "learning_rate": 6e-07, - "loss": 0.125, - "num_tokens": 4333220.0, - "reward": 0.13911130279302597, - "reward_std": 0.826167568564415, - "rewards/cosine_scaled_reward": -0.09711101395078003, - "rewards/format_reward": 0.3333333320915699, + "loss": 0.0018, + "num_tokens": 4044145.0, + "reward": -0.18417476117610931, + "reward_std": 0.6154340840876102, + "rewards/cosine_scaled_reward": -0.0920873824506998, "step": 30 }, { "clip_ratio": 0.0, - "completion_length": 2465.5208740234375, - "epoch": 0.03542857142857143, - "grad_norm": 0.3267292380332947, - "kl": 0.0005950927734375, + "completion_length": 2657.166793823242, + "epoch": 0.017714285714285714, + "grad_norm": 0.3732287287712097, + "kl": 0.0005855560302734375, "learning_rate": 6.2e-07, - "loss": 0.0618, - "num_tokens": 4459791.0, - "reward": 0.28785821609199047, - "reward_std": 0.5182768851518631, - "rewards/cosine_scaled_reward": -0.08523756638169289, - "rewards/format_reward": 0.45833333395421505, + "loss": 0.1126, + "num_tokens": 4176981.0, + "reward": -0.34596723690629005, + "reward_std": 0.6994314044713974, + "rewards/cosine_scaled_reward": -0.17298361286520958, "step": 31 }, { "clip_ratio": 0.0, - "completion_length": 3584.0, - "epoch": 0.036571428571428574, - "grad_norm": 0.18462678790092468, - "kl": 0.0006098747253417969, + "completion_length": 1848.4583587646484, + "epoch": 0.018285714285714287, + "grad_norm": 0.34063395857810974, + "kl": 0.0004658699035644531, "learning_rate": 6.4e-07, - "loss": 0.0, - "num_tokens": 4640043.0, - "reward": -0.45041289925575256, - "reward_std": 0.18651413917541504, - "rewards/cosine_scaled_reward": -0.22520644962787628, - "rewards/format_reward": 0.0, + "loss": 0.0435, + "num_tokens": 4271395.0, + "reward": 0.1516597867012024, + "reward_std": 0.7864086776971817, + "rewards/cosine_scaled_reward": 0.0758299008011818, "step": 32 }, { "clip_ratio": 0.0, - "completion_length": 3056.166748046875, - "epoch": 0.037714285714285714, - "grad_norm": 0.22060233354568481, - "kl": 0.0005612373352050781, + "completion_length": 3584.0, + "epoch": 0.018857142857142857, + "grad_norm": 0.22958050668239594, + "kl": 0.000720977783203125, "learning_rate": 6.6e-07, - "loss": 0.1453, - "num_tokens": 4794857.0, - "reward": 0.029838480055332184, - "reward_std": 0.7281420417129993, - "rewards/cosine_scaled_reward": -0.14133075810968876, - "rewards/format_reward": 0.31249999441206455, + "loss": 0.0, + "num_tokens": 4450063.0, + "reward": -0.6370590478181839, + "reward_std": 0.20538340508937836, + "rewards/cosine_scaled_reward": -0.31852949783205986, "step": 33 }, { "clip_ratio": 0.0, - "completion_length": 3175.4791870117188, - "epoch": 0.038857142857142854, - "grad_norm": 0.24089233577251434, - "kl": 0.0005521774291992188, + "completion_length": 2935.3959350585938, + "epoch": 0.019428571428571427, + "grad_norm": 0.2298842966556549, + "kl": 0.0005502700805664062, "learning_rate": 6.800000000000001e-07, - "loss": 0.0075, - "num_tokens": 4955170.0, - "reward": -0.21627317368984222, - "reward_std": 0.49830519035458565, - "rewards/cosine_scaled_reward": -0.1914699161425233, - "rewards/format_reward": 0.1666666679084301, + "loss": 0.0374, + "num_tokens": 4596926.0, + "reward": -0.41714829951524734, + "reward_std": 0.39900972694158554, + "rewards/cosine_scaled_reward": -0.20857414416968822, "step": 34 }, { "clip_ratio": 0.0, - "completion_length": 2963.02099609375, - "epoch": 0.04, - "grad_norm": 0.22746875882148743, - "kl": 0.0005502700805664062, + "completion_length": 3036.9166870117188, + "epoch": 0.02, + "grad_norm": 0.23063404858112335, + "kl": 0.0006365776062011719, "learning_rate": 7e-07, - "loss": 0.0903, - "num_tokens": 5105531.0, - "reward": 0.07940403372049332, - "reward_std": 0.8007240146398544, - "rewards/cosine_scaled_reward": -0.16863130778074265, - "rewards/format_reward": 0.4166666679084301, + "loss": 0.0008, + "num_tokens": 4749586.0, + "reward": -0.7142433375120163, + "reward_std": 0.3739009462296963, + "rewards/cosine_scaled_reward": -0.35712166875600815, "step": 35 }, { "clip_ratio": 0.0, - "completion_length": 2587.0000610351562, - "epoch": 0.04114285714285714, - "grad_norm": 0.24526968598365784, - "kl": 0.0005397796630859375, + "completion_length": 2749.8958740234375, + "epoch": 0.02057142857142857, + "grad_norm": 0.27252835035324097, + "kl": 0.000637054443359375, "learning_rate": 7.2e-07, - "loss": 0.0504, - "num_tokens": 5237645.0, - "reward": 0.5731585621833801, - "reward_std": 0.4250886049121618, - "rewards/cosine_scaled_reward": 0.05741261690855026, - "rewards/format_reward": 0.4583333283662796, + "loss": -0.0058, + "num_tokens": 4887449.0, + "reward": -0.10459958261344582, + "reward_std": 0.6155130080878735, + "rewards/cosine_scaled_reward": -0.052299798757303506, "step": 36 }, { "clip_ratio": 0.0, - "completion_length": 2604.7500610351562, - "epoch": 0.04228571428571429, - "grad_norm": 0.3840998709201813, - "kl": 0.00054931640625, + "completion_length": 2935.562530517578, + "epoch": 0.021142857142857144, + "grad_norm": 0.2475793957710266, + "kl": 0.000568389892578125, "learning_rate": 7.4e-07, - "loss": 0.1006, - "num_tokens": 5370077.0, - "reward": 0.44232267513871193, - "reward_std": 0.6607204154133797, - "rewards/cosine_scaled_reward": -0.049671996384859085, - "rewards/format_reward": 0.541666654869914, + "loss": -0.088, + "num_tokens": 5034704.0, + "reward": -0.29862387478351593, + "reward_std": 0.39744907803833485, + "rewards/cosine_scaled_reward": -0.14931193552911282, "step": 37 }, { "clip_ratio": 0.0, - "completion_length": 2919.1250610351562, - "epoch": 0.04342857142857143, - "grad_norm": 0.2132943570613861, - "kl": 0.000682830810546875, + "completion_length": 3025.7083740234375, + "epoch": 0.021714285714285714, + "grad_norm": 0.22514301538467407, + "kl": 0.0006685256958007812, "learning_rate": 7.599999999999999e-07, - "loss": 0.1065, - "num_tokens": 5517587.0, - "reward": -0.07279382459819317, - "reward_std": 0.5536654070019722, - "rewards/cosine_scaled_reward": -0.2551469076424837, - "rewards/format_reward": 0.4375000074505806, + "loss": 0.0347, + "num_tokens": 5186814.0, + "reward": -0.2246699258685112, + "reward_std": 0.6279377490282059, + "rewards/cosine_scaled_reward": -0.11233496479690075, "step": 38 }, { "clip_ratio": 0.0, - "completion_length": 2926.6459350585938, - "epoch": 0.044571428571428574, - "grad_norm": 0.23606236279010773, - "kl": 0.0006608963012695312, + "completion_length": 1896.0208435058594, + "epoch": 0.022285714285714287, + "grad_norm": 0.31844407320022583, + "kl": 0.00075531005859375, "learning_rate": 7.799999999999999e-07, - "loss": 0.113, - "num_tokens": 5665770.0, - "reward": 0.4562120959162712, - "reward_std": 0.876225158572197, - "rewards/cosine_scaled_reward": -0.011477291118353605, - "rewards/format_reward": 0.4791666641831398, + "loss": -0.0971, + "num_tokens": 5282791.0, + "reward": -0.26946142315864563, + "reward_std": 0.6844599097967148, + "rewards/cosine_scaled_reward": -0.13473070412874222, "step": 39 }, { "clip_ratio": 0.0, - "completion_length": 3004.5625915527344, - "epoch": 0.045714285714285714, - "grad_norm": 0.2187039852142334, - "kl": 0.0005340576171875, + "completion_length": 3461.6458740234375, + "epoch": 0.022857142857142857, + "grad_norm": 0.1988898515701294, + "kl": 0.0005855560302734375, "learning_rate": 8e-07, - "loss": 0.068, - "num_tokens": 5818485.0, - "reward": 0.15439531083393376, - "reward_std": 0.6592165231704712, - "rewards/cosine_scaled_reward": -0.11030234955251217, - "rewards/format_reward": 0.3750000037252903, + "loss": 0.0348, + "num_tokens": 5455178.0, + "reward": -0.3771579749882221, + "reward_std": 0.43913378193974495, + "rewards/cosine_scaled_reward": -0.18857897631824017, "step": 40 }, { "clip_ratio": 0.0, - "completion_length": 3318.166748046875, - "epoch": 0.046857142857142854, - "grad_norm": 0.20403259992599487, - "kl": 0.0006113052368164062, + "completion_length": 2864.0625, + "epoch": 0.023428571428571427, + "grad_norm": 0.2541484534740448, + "kl": 0.0006170272827148438, "learning_rate": 8.199999999999999e-07, - "loss": 0.0189, - "num_tokens": 5985731.0, - "reward": 0.24213121831417084, - "reward_std": 0.8019094243645668, - "rewards/cosine_scaled_reward": -0.03518439130857587, - "rewards/format_reward": 0.3124999925494194, + "loss": 0.0332, + "num_tokens": 5599517.0, + "reward": -0.15848201513290405, + "reward_std": 0.28519516810774803, + "rewards/cosine_scaled_reward": -0.07924101501703262, "step": 41 }, { "clip_ratio": 0.0, - "completion_length": 2081.0834350585938, - "epoch": 0.048, - "grad_norm": 0.3065926134586334, - "kl": 0.0006427764892578125, + "completion_length": 2103.375045776367, + "epoch": 0.024, + "grad_norm": 0.4904050827026367, + "kl": 0.0005826950073242188, "learning_rate": 8.399999999999999e-07, - "loss": 0.0694, - "num_tokens": 6092841.0, - "reward": 0.7219225168228149, - "reward_std": 0.7827914208173752, - "rewards/cosine_scaled_reward": 0.006794577464461327, - "rewards/format_reward": 0.7083333358168602, + "loss": 0.1226, + "num_tokens": 5706251.0, + "reward": 0.05192290246486664, + "reward_std": 0.5277432054281235, + "rewards/cosine_scaled_reward": 0.02596145309507847, "step": 42 }, { "clip_ratio": 0.0, - "completion_length": 3077.604248046875, - "epoch": 0.04914285714285714, - "grad_norm": 0.25333571434020996, - "kl": 0.0007266998291015625, + "completion_length": 3375.9583740234375, + "epoch": 0.02457142857142857, + "grad_norm": 0.20466555655002594, + "kl": 0.0007343292236328125, "learning_rate": 8.599999999999999e-07, - "loss": 0.0159, - "num_tokens": 6248312.0, - "reward": 0.6068893522024155, - "reward_std": 1.1225253641605377, - "rewards/cosine_scaled_reward": 0.04302799212746322, - "rewards/format_reward": 0.5208333283662796, + "loss": 0.0472, + "num_tokens": 5873601.0, + "reward": -0.1930120848119259, + "reward_std": 0.7160178981721401, + "rewards/cosine_scaled_reward": -0.0965060293674469, "step": 43 }, { "clip_ratio": 0.0, - "completion_length": 2814.9583740234375, - "epoch": 0.05028571428571429, - "grad_norm": 0.2492004930973053, - "kl": 0.0008058547973632812, + "completion_length": 3035.7708740234375, + "epoch": 0.025142857142857144, + "grad_norm": 0.19474832713603973, + "kl": 0.0006237030029296875, "learning_rate": 8.799999999999999e-07, - "loss": 0.0075, - "num_tokens": 6391476.0, - "reward": 0.25283733755350113, - "reward_std": 0.5932779908180237, - "rewards/cosine_scaled_reward": -0.08191467449069023, - "rewards/format_reward": 0.41666666232049465, + "loss": -0.0035, + "num_tokens": 6025522.0, + "reward": -0.16648699529469013, + "reward_std": 0.6652341857552528, + "rewards/cosine_scaled_reward": -0.08324349066242576, "step": 44 }, { "clip_ratio": 0.0, - "completion_length": 2648.3959350585938, - "epoch": 0.05142857142857143, - "grad_norm": 0.2790423333644867, - "kl": 0.0006084442138671875, + "completion_length": 3335.8750610351562, + "epoch": 0.025714285714285714, + "grad_norm": 0.21470214426517487, + "kl": 0.0007848739624023438, "learning_rate": 9e-07, - "loss": -0.0224, - "num_tokens": 6526459.0, - "reward": 0.6963641820475459, - "reward_std": 0.8004543036222458, - "rewards/cosine_scaled_reward": 0.05651540495455265, - "rewards/format_reward": 0.5833333283662796, + "loss": 0.0771, + "num_tokens": 6192064.0, + "reward": -0.6564953848719597, + "reward_std": 0.22902014665305614, + "rewards/cosine_scaled_reward": -0.32824768498539925, "step": 45 }, { "clip_ratio": 0.0, - "completion_length": 2959.2916870117188, - "epoch": 0.052571428571428575, - "grad_norm": 0.27206096053123474, - "kl": 0.000598907470703125, + "completion_length": 2241.8541870117188, + "epoch": 0.026285714285714287, + "grad_norm": 0.29658329486846924, + "kl": 0.00080108642578125, "learning_rate": 9.2e-07, - "loss": 0.1131, - "num_tokens": 6676827.0, - "reward": 0.2336385459639132, - "reward_std": 0.44995977729558945, - "rewards/cosine_scaled_reward": -0.029014069586992264, - "rewards/format_reward": 0.2916666716337204, + "loss": -0.0078, + "num_tokens": 6305085.0, + "reward": 0.3887103348970413, + "reward_std": 1.0468786805868149, + "rewards/cosine_scaled_reward": 0.19435517117381096, "step": 46 }, { "clip_ratio": 0.0, - "completion_length": 2453.0625915527344, - "epoch": 0.053714285714285714, - "grad_norm": 0.28164342045783997, - "kl": 0.00048160552978515625, + "completion_length": 2734.7083740234375, + "epoch": 0.026857142857142857, + "grad_norm": 0.28740641474723816, + "kl": 0.0004706382751464844, "learning_rate": 9.399999999999999e-07, - "loss": 0.0803, - "num_tokens": 6802206.0, - "reward": 0.4458533003926277, - "reward_std": 0.5642239525914192, - "rewards/cosine_scaled_reward": -0.06874001771211624, - "rewards/format_reward": 0.5833333283662796, + "loss": 0.049, + "num_tokens": 6441355.0, + "reward": 0.15308012068271637, + "reward_std": 0.4208949161693454, + "rewards/cosine_scaled_reward": 0.07654005661606789, "step": 47 }, { "clip_ratio": 0.0, - "completion_length": 2640.7916870117188, - "epoch": 0.054857142857142854, - "grad_norm": 0.3081265687942505, - "kl": 0.0006432533264160156, + "completion_length": 1500.8958740234375, + "epoch": 0.027428571428571427, + "grad_norm": 0.39340028166770935, + "kl": 0.0004849433898925781, "learning_rate": 9.6e-07, - "loss": 0.0203, - "num_tokens": 6936848.0, - "reward": 0.4199897248763591, - "reward_std": 0.5818230472505093, - "rewards/cosine_scaled_reward": -0.05042180512100458, - "rewards/format_reward": 0.5208333414047956, + "loss": 0.0855, + "num_tokens": 6520190.0, + "reward": 0.44709211960434914, + "reward_std": 0.7460008524358273, + "rewards/cosine_scaled_reward": 0.22354605607688427, "step": 48 }, { "clip_ratio": 0.0, - "completion_length": 2285.7916870117188, - "epoch": 0.056, - "grad_norm": 0.3449012339115143, - "kl": 0.0005960464477539062, + "completion_length": 2304.1458435058594, + "epoch": 0.028, + "grad_norm": 0.35876935720443726, + "kl": 0.0008916854858398438, "learning_rate": 9.8e-07, - "loss": -0.0279, - "num_tokens": 7054330.0, - "reward": 0.6692525297403336, - "reward_std": 0.6822149157524109, - "rewards/cosine_scaled_reward": 0.032542891800403595, - "rewards/format_reward": 0.6041666716337204, + "loss": 0.0348, + "num_tokens": 6636501.0, + "reward": -0.05522707849740982, + "reward_std": 0.5208401791751385, + "rewards/cosine_scaled_reward": -0.02761353738605976, "step": 49 }, { "clip_ratio": 0.0, - "completion_length": 2134.8334045410156, - "epoch": 0.05714285714285714, - "grad_norm": 0.387600839138031, - "kl": 0.0008153915405273438, + "completion_length": 3012.8541870117188, + "epoch": 0.02857142857142857, + "grad_norm": 0.2135663479566574, + "kl": 0.0007343292236328125, "learning_rate": 1e-06, - "loss": 0.1332, - "num_tokens": 7163978.0, - "reward": 0.8850179463624954, - "reward_std": 0.8217868953943253, - "rewards/cosine_scaled_reward": 0.1300089694559574, - "rewards/format_reward": 0.6249999925494194, + "loss": 0.0216, + "num_tokens": 6786926.0, + "reward": -0.08169351518154144, + "reward_std": 0.6610444337129593, + "rewards/cosine_scaled_reward": -0.040846746414899826, "step": 50 }, { "clip_ratio": 0.0, - "completion_length": 3059.0834045410156, - "epoch": 0.05828571428571429, - "grad_norm": 0.23220700025558472, - "kl": 0.0007076263427734375, + "completion_length": 2833.375, + "epoch": 0.029142857142857144, + "grad_norm": 0.24645818769931793, + "kl": 0.0005292892456054688, "learning_rate": 9.999890338174275e-07, - "loss": 0.1301, - "num_tokens": 7319304.0, - "reward": -0.1438809223473072, - "reward_std": 0.772916778922081, - "rewards/cosine_scaled_reward": -0.2386071290820837, - "rewards/format_reward": 0.33333333022892475, + "loss": 0.0353, + "num_tokens": 6929624.0, + "reward": 0.0020843185484409332, + "reward_std": 0.12999659916386008, + "rewards/cosine_scaled_reward": 0.001042170450091362, "step": 51 }, { "clip_ratio": 0.0, - "completion_length": 3285.854248046875, - "epoch": 0.05942857142857143, - "grad_norm": 0.19926899671554565, - "kl": 0.0007266998291015625, + "completion_length": 3556.0416870117188, + "epoch": 0.029714285714285714, + "grad_norm": 0.18516220152378082, + "kl": 0.00058746337890625, "learning_rate": 9.999561358041868e-07, - "loss": 0.0235, - "num_tokens": 7485797.0, - "reward": -0.1713619939982891, - "reward_std": 0.6841993480920792, - "rewards/cosine_scaled_reward": -0.20026432862505317, - "rewards/format_reward": 0.22916666232049465, + "loss": 0.0009, + "num_tokens": 7107826.0, + "reward": -0.3568333759903908, + "reward_std": 0.5824087001383305, + "rewards/cosine_scaled_reward": -0.1784166805446148, "step": 52 }, { "clip_ratio": 0.0, - "completion_length": 2906.2500610351562, - "epoch": 0.060571428571428575, - "grad_norm": 0.293755441904068, - "kl": 0.0007658004760742188, + "completion_length": 2816.1666717529297, + "epoch": 0.030285714285714287, + "grad_norm": 0.3401743173599243, + "kl": 0.0006856918334960938, "learning_rate": 9.999013075636804e-07, - "loss": 0.0641, - "num_tokens": 7633685.0, - "reward": -0.10038524121046066, - "reward_std": 0.548088788986206, - "rewards/cosine_scaled_reward": -0.19602595455944538, - "rewards/format_reward": 0.2916666604578495, + "loss": 0.06, + "num_tokens": 7250142.0, + "reward": -0.00018092244863510132, + "reward_std": 0.16538633033633232, + "rewards/cosine_scaled_reward": -9.047612547874451e-05, "step": 53 }, { "clip_ratio": 0.0, - "completion_length": 2585.937530517578, - "epoch": 0.061714285714285715, - "grad_norm": 0.29792076349258423, - "kl": 0.0007724761962890625, + "completion_length": 3037.3958435058594, + "epoch": 0.030857142857142857, + "grad_norm": 0.39248475432395935, + "kl": 0.000926971435546875, "learning_rate": 9.998245517681593e-07, - "loss": 0.065, - "num_tokens": 7765328.0, - "reward": -0.13627923466265202, - "reward_std": 0.2906072996556759, - "rewards/cosine_scaled_reward": -0.25563961640000343, - "rewards/format_reward": 0.3749999925494194, + "loss": -0.0691, + "num_tokens": 7402561.0, + "reward": -0.5911240540444851, + "reward_std": 0.18204397335648537, + "rewards/cosine_scaled_reward": -0.29556202609091997, "step": 54 }, { "clip_ratio": 0.0, - "completion_length": 3107.7916870117188, - "epoch": 0.06285714285714286, - "grad_norm": 0.22162412106990814, - "kl": 0.0007085800170898438, + "completion_length": 3278.8958740234375, + "epoch": 0.03142857142857143, + "grad_norm": 0.22106441855430603, + "kl": 0.000995635986328125, "learning_rate": 9.997258721585931e-07, - "loss": 0.0752, - "num_tokens": 7922746.0, - "reward": -0.2543004397302866, - "reward_std": 0.6761848628520966, - "rewards/cosine_scaled_reward": -0.2625668868422508, - "rewards/format_reward": 0.27083333022892475, + "loss": 0.0338, + "num_tokens": 7566500.0, + "reward": -0.07510977238416672, + "reward_std": 0.608110748231411, + "rewards/cosine_scaled_reward": -0.03755488805472851, "step": 55 }, { "clip_ratio": 0.0, - "completion_length": 2845.8958740234375, - "epoch": 0.064, - "grad_norm": 0.2844100296497345, - "kl": 0.0008029937744140625, + "completion_length": 2919.937530517578, + "epoch": 0.032, + "grad_norm": 0.27260297536849976, + "kl": 0.0011653900146484375, "learning_rate": 9.996052735444862e-07, - "loss": 0.0857, - "num_tokens": 8068109.0, - "reward": 0.17015837877988815, - "reward_std": 0.6355826109647751, - "rewards/cosine_scaled_reward": -0.060754150777938776, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0375, + "num_tokens": 7712429.0, + "reward": -0.1868463009595871, + "reward_std": 0.5689870864152908, + "rewards/cosine_scaled_reward": -0.09342315793037415, "step": 56 }, { "clip_ratio": 0.0, - "completion_length": 2969.6250610351562, - "epoch": 0.06514285714285714, - "grad_norm": 0.3783540725708008, - "kl": 0.0008459091186523438, + "completion_length": 2649.8751220703125, + "epoch": 0.03257142857142857, + "grad_norm": 0.21306385099887848, + "kl": 0.0008172988891601562, "learning_rate": 9.994627618036452e-07, - "loss": 0.0707, - "num_tokens": 8218943.0, - "reward": 0.123802050948143, - "reward_std": 0.569492757320404, - "rewards/cosine_scaled_reward": -0.12559896823950112, - "rewards/format_reward": 0.375, + "loss": -0.0171, + "num_tokens": 7845899.0, + "reward": -0.43829748034477234, + "reward_std": 0.6765277907252312, + "rewards/cosine_scaled_reward": -0.21914873644709587, "step": 57 }, { "clip_ratio": 0.0, - "completion_length": 2966.0209350585938, - "epoch": 0.06628571428571428, - "grad_norm": 0.24569377303123474, - "kl": 0.0008134841918945312, + "completion_length": 3418.3958740234375, + "epoch": 0.03314285714285714, + "grad_norm": 0.19094346463680267, + "kl": 0.00072479248046875, "learning_rate": 9.992983438818915e-07, - "loss": 0.0928, - "num_tokens": 8369862.0, - "reward": -0.38002127036452293, - "reward_std": 0.40066082403063774, - "rewards/cosine_scaled_reward": -0.3358439467847347, - "rewards/format_reward": 0.2916666679084301, + "loss": 0.0116, + "num_tokens": 8016522.0, + "reward": -0.2600528746843338, + "reward_std": 0.41667389310896397, + "rewards/cosine_scaled_reward": -0.1300264373421669, "step": 58 }, { "clip_ratio": 0.0, - "completion_length": 2924.1251220703125, - "epoch": 0.06742857142857143, - "grad_norm": 0.22264137864112854, - "kl": 0.00084686279296875, + "completion_length": 2712.9583740234375, + "epoch": 0.03371428571428572, + "grad_norm": 0.27561813592910767, + "kl": 0.0011749267578125, "learning_rate": 9.991120277927223e-07, - "loss": -0.0067, - "num_tokens": 8518362.0, - "reward": 0.3800372362602502, - "reward_std": 0.7081842869520187, - "rewards/cosine_scaled_reward": 0.0025186067214235663, - "rewards/format_reward": 0.3750000037252903, + "loss": -0.0132, + "num_tokens": 8152660.0, + "reward": -0.2609961926937103, + "reward_std": 0.5313794314861298, + "rewards/cosine_scaled_reward": -0.13049809262156487, "step": 59 }, { "clip_ratio": 0.0, - "completion_length": 2843.4375610351562, - "epoch": 0.06857142857142857, - "grad_norm": 0.26855766773223877, - "kl": 0.0009984970092773438, + "completion_length": 3343.8125610351562, + "epoch": 0.03428571428571429, + "grad_norm": 0.20157206058502197, + "kl": 0.0007047653198242188, "learning_rate": 9.989038226169207e-07, - "loss": 0.0308, - "num_tokens": 8663217.0, - "reward": 0.23737115785479546, - "reward_std": 0.5052844993770123, - "rewards/cosine_scaled_reward": -0.04798109957482666, - "rewards/format_reward": 0.3333333358168602, + "loss": 0.0128, + "num_tokens": 8319607.0, + "reward": 0.07680468261241913, + "reward_std": 0.3094941098242998, + "rewards/cosine_scaled_reward": 0.038402341306209564, "step": 60 }, { "clip_ratio": 0.0, - "completion_length": 2237.854217529297, - "epoch": 0.06971428571428571, - "grad_norm": 0.26261472702026367, - "kl": 0.0006732940673828125, + "completion_length": 2177.208396911621, + "epoch": 0.03485714285714286, + "grad_norm": 0.36359933018684387, + "kl": 0.0006518363952636719, "learning_rate": 9.98673738502114e-07, - "loss": 0.0594, - "num_tokens": 8778356.0, - "reward": 0.7166529446840286, - "reward_std": 0.5268924571573734, - "rewards/cosine_scaled_reward": 0.06665980257093906, - "rewards/format_reward": 0.5833333283662796, + "loss": 0.14, + "num_tokens": 8430461.0, + "reward": 0.17365121096372604, + "reward_std": 0.5538155660033226, + "rewards/cosine_scaled_reward": 0.08682558685541153, "step": 61 }, { "clip_ratio": 0.0, - "completion_length": 3172.4376220703125, - "epoch": 0.07085714285714285, - "grad_norm": 0.19514356553554535, - "kl": 0.0007085800170898438, + "completion_length": 2966.7291870117188, + "epoch": 0.03542857142857143, + "grad_norm": 0.3397689163684845, + "kl": 0.0008535385131835938, "learning_rate": 9.98421786662277e-07, - "loss": 0.0541, - "num_tokens": 8938649.0, - "reward": 0.18832573667168617, - "reward_std": 0.6950011849403381, - "rewards/cosine_scaled_reward": -0.06208712235093117, - "rewards/format_reward": 0.3125, + "loss": -0.0699, + "num_tokens": 8578936.0, + "reward": -0.7255322933197021, + "reward_std": 0.2827052026987076, + "rewards/cosine_scaled_reward": -0.3627661466598511, "step": 62 }, { "clip_ratio": 0.0, - "completion_length": 1910.0833587646484, - "epoch": 0.072, - "grad_norm": 0.3044058680534363, - "kl": 0.001178741455078125, + "completion_length": 3450.8125610351562, + "epoch": 0.036, + "grad_norm": 0.18789339065551758, + "kl": 0.0008687973022460938, "learning_rate": 9.981479793771866e-07, - "loss": 0.0997, - "num_tokens": 9037839.0, - "reward": 0.692467100918293, - "reward_std": 0.7802318185567856, - "rewards/cosine_scaled_reward": 0.012900200905278325, - "rewards/format_reward": 0.6666666567325592, + "loss": -0.0483, + "num_tokens": 8750479.0, + "reward": -0.2192160151898861, + "reward_std": 0.5799107477068901, + "rewards/cosine_scaled_reward": -0.10960800759494305, "step": 63 }, { "clip_ratio": 0.0, - "completion_length": 3039.916748046875, - "epoch": 0.07314285714285715, - "grad_norm": 0.23920981585979462, - "kl": 0.0007276535034179688, + "completion_length": 3443.3541870117188, + "epoch": 0.036571428571428574, + "grad_norm": 0.17766065895557404, + "kl": 0.0008435249328613281, "learning_rate": 9.97852329991824e-07, - "loss": 0.0027, - "num_tokens": 9191927.0, - "reward": 0.4938540682196617, - "reward_std": 0.7745917662978172, - "rewards/cosine_scaled_reward": 0.05942701664753258, - "rewards/format_reward": 0.3750000074505806, + "loss": -0.0151, + "num_tokens": 8922264.0, + "reward": -0.48261551931500435, + "reward_std": 0.19259289279580116, + "rewards/cosine_scaled_reward": -0.24130774475634098, "step": 64 }, { "clip_ratio": 0.0, - "completion_length": 2283.8334350585938, - "epoch": 0.07428571428571429, - "grad_norm": 0.23445375263690948, - "kl": 0.0009222030639648438, + "completion_length": 3403.0625610351562, + "epoch": 0.037142857142857144, + "grad_norm": 0.1934821903705597, + "kl": 0.00086212158203125, "learning_rate": 9.975348529157229e-07, - "loss": 0.1023, - "num_tokens": 9309057.0, - "reward": 0.34917649751878344, - "reward_std": 0.7536975219845772, - "rewards/cosine_scaled_reward": -0.11707842443138361, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0185, + "num_tokens": 9092067.0, + "reward": -0.46268967539072037, + "reward_std": 0.28929166309535503, + "rewards/cosine_scaled_reward": -0.23134482093155384, "step": 65 }, { "clip_ratio": 0.0, - "completion_length": 3020.8333740234375, - "epoch": 0.07542857142857143, - "grad_norm": 0.23693493008613586, - "kl": 0.0007867813110351562, + "completion_length": 2666.395965576172, + "epoch": 0.037714285714285714, + "grad_norm": 0.24617139995098114, + "kl": 0.00070953369140625, "learning_rate": 9.971955636222684e-07, - "loss": 0.0562, - "num_tokens": 9461935.0, - "reward": -0.07892957702279091, - "reward_std": 0.5825114250183105, - "rewards/cosine_scaled_reward": -0.2165481224656105, - "rewards/format_reward": 0.3541666641831398, + "loss": -0.0574, + "num_tokens": 9225802.0, + "reward": 0.30002279952168465, + "reward_std": 0.6946442574262619, + "rewards/cosine_scaled_reward": 0.1500114006921649, "step": 66 }, { "clip_ratio": 0.0, - "completion_length": 2902.0208435058594, - "epoch": 0.07657142857142857, - "grad_norm": 0.23581241071224213, - "kl": 0.00127410888671875, + "completion_length": 2073.750030517578, + "epoch": 0.038285714285714284, + "grad_norm": 0.4215092062950134, + "kl": 0.0011243820190429688, "learning_rate": 9.968344786479415e-07, - "loss": 0.1158, - "num_tokens": 9609506.0, - "reward": 0.5483582876622677, - "reward_std": 0.5557361207902431, - "rewards/cosine_scaled_reward": 0.055429140105843544, - "rewards/format_reward": 0.4375000111758709, + "loss": 0.0529, + "num_tokens": 9330898.0, + "reward": -0.09548089653253555, + "reward_std": 0.8580914586782455, + "rewards/cosine_scaled_reward": -0.04774044919759035, "step": 67 }, { "clip_ratio": 0.0, - "completion_length": 2495.6458740234375, - "epoch": 0.07771428571428571, - "grad_norm": 0.3460405170917511, - "kl": 0.0016241073608398438, + "completion_length": 3528.1666870117188, + "epoch": 0.038857142857142854, + "grad_norm": 0.20039838552474976, + "kl": 0.0005893707275390625, "learning_rate": 9.964516155915151e-07, - "loss": -0.0339, - "num_tokens": 9737079.0, - "reward": -0.16728203371167183, - "reward_std": 0.2556677311658859, - "rewards/cosine_scaled_reward": -0.27114101499319077, - "rewards/format_reward": 0.375, + "loss": -0.0095, + "num_tokens": 9506442.0, + "reward": -0.42071669083088636, + "reward_std": 0.42319002375006676, + "rewards/cosine_scaled_reward": -0.21035834541544318, "step": 68 }, { "clip_ratio": 0.0, - "completion_length": 2326.791748046875, - "epoch": 0.07885714285714286, - "grad_norm": 0.2886251211166382, - "kl": 0.0007181167602539062, + "completion_length": 2716.7291870117188, + "epoch": 0.03942857142857143, + "grad_norm": 0.2517124116420746, + "kl": 0.0007944107055664062, "learning_rate": 9.960469931131936e-07, - "loss": 0.0408, - "num_tokens": 9856457.0, - "reward": 0.2764681279659271, - "reward_std": 0.4927753880620003, - "rewards/cosine_scaled_reward": -0.1221826063701883, - "rewards/format_reward": 0.5208333283662796, + "loss": -0.1279, + "num_tokens": 9643061.0, + "reward": -0.3758072182536125, + "reward_std": 0.45701417699456215, + "rewards/cosine_scaled_reward": -0.18790359422564507, "step": 69 }, { "clip_ratio": 0.0, - "completion_length": 2659.9584045410156, - "epoch": 0.08, - "grad_norm": 0.18427890539169312, - "kl": 0.0009355545043945312, + "completion_length": 2648.0000228881836, + "epoch": 0.04, + "grad_norm": 0.29658740758895874, + "kl": 0.0008521080017089844, "learning_rate": 9.956206309337066e-07, - "loss": 0.0742, - "num_tokens": 9992241.0, - "reward": 0.3243530666222796, - "reward_std": 0.5141221769154072, - "rewards/cosine_scaled_reward": -0.11907346919178963, - "rewards/format_reward": 0.5624999981373549, + "loss": 0.0124, + "num_tokens": 9776189.0, + "reward": -0.16740068793296814, + "reward_std": 0.41449040174484253, + "rewards/cosine_scaled_reward": -0.08370032906532288, "step": 70 }, { "clip_ratio": 0.0, - "completion_length": 2051.4375610351562, - "epoch": 0.08114285714285714, - "grad_norm": 0.37901344895362854, - "kl": 0.001514434814453125, + "completion_length": 1873.812515258789, + "epoch": 0.04057142857142857, + "grad_norm": 0.3751354515552521, + "kl": 0.0007829666137695312, "learning_rate": 9.951725498333448e-07, - "loss": 0.1337, - "num_tokens": 10098330.0, - "reward": 0.2932474911212921, - "reward_std": 0.562146857380867, - "rewards/cosine_scaled_reward": -0.16587623208761215, - "rewards/format_reward": 0.6249999925494194, + "loss": 0.0863, + "num_tokens": 9871712.0, + "reward": 0.3852356970310211, + "reward_std": 0.5030505172908306, + "rewards/cosine_scaled_reward": 0.19261783733963966, "step": 71 }, { "clip_ratio": 0.0, - "completion_length": 2446.187530517578, - "epoch": 0.08228571428571428, - "grad_norm": 0.2612290680408478, - "kl": 0.0010986328125, + "completion_length": 2817.8334197998047, + "epoch": 0.04114285714285714, + "grad_norm": 0.27796244621276855, + "kl": 0.000804901123046875, "learning_rate": 9.947027716509488e-07, - "loss": 0.063, - "num_tokens": 10223667.0, - "reward": 0.843310259282589, - "reward_std": 0.9821145087480545, - "rewards/cosine_scaled_reward": 0.10915513057261705, - "rewards/format_reward": 0.6249999925494194, + "loss": 0.0581, + "num_tokens": 10013232.0, + "reward": -0.10409137606620789, + "reward_std": 0.20197268202900887, + "rewards/cosine_scaled_reward": -0.05204569548368454, "step": 72 }, { "clip_ratio": 0.0, - "completion_length": 1752.6875762939453, - "epoch": 0.08342857142857144, - "grad_norm": 0.32139480113983154, - "kl": 0.001506805419921875, + "completion_length": 2128.0208740234375, + "epoch": 0.04171428571428572, + "grad_norm": 0.33463340997695923, + "kl": 0.0006952285766601562, "learning_rate": 9.942113192828444e-07, - "loss": 0.0949, - "num_tokens": 10314594.0, - "reward": 0.7357300966978073, - "reward_std": 0.713388629257679, - "rewards/cosine_scaled_reward": 0.00328170508146286, - "rewards/format_reward": 0.7291666716337204, + "loss": 0.0541, + "num_tokens": 10120693.0, + "reward": -0.22259100899100304, + "reward_std": 0.4416811428964138, + "rewards/cosine_scaled_reward": -0.11129548959434032, "step": 73 }, { "clip_ratio": 0.0, - "completion_length": 2221.375030517578, - "epoch": 0.08457142857142858, - "grad_norm": 0.26274731755256653, - "kl": 0.00138092041015625, + "completion_length": 2830.6666717529297, + "epoch": 0.04228571428571429, + "grad_norm": 0.28111714124679565, + "kl": 0.0005383491516113281, "learning_rate": 9.93698216681727e-07, - "loss": 0.142, - "num_tokens": 10429110.0, - "reward": 0.44233171858650167, - "reward_std": 0.6651621311903, - "rewards/cosine_scaled_reward": -0.08091748412698507, - "rewards/format_reward": 0.6041666716337204, + "loss": 0.0078, + "num_tokens": 10262025.0, + "reward": -0.14538022875785828, + "reward_std": 0.5454810187220573, + "rewards/cosine_scaled_reward": -0.07269011810421944, "step": 74 }, { "clip_ratio": 0.0, - "completion_length": 2533.6459350585938, - "epoch": 0.08571428571428572, - "grad_norm": 0.3130059540271759, - "kl": 0.0022735595703125, + "completion_length": 1554.2708435058594, + "epoch": 0.04285714285714286, + "grad_norm": 0.4327227771282196, + "kl": 0.0011434555053710938, "learning_rate": 9.931634888554935e-07, - "loss": 0.1731, - "num_tokens": 10558639.0, - "reward": 0.012795105576515198, - "reward_std": 0.6212242320179939, - "rewards/cosine_scaled_reward": -0.21235244907438755, - "rewards/format_reward": 0.4375000037252903, + "loss": 0.0645, + "num_tokens": 10341778.0, + "reward": -0.28616778552532196, + "reward_std": 0.6919333338737488, + "rewards/cosine_scaled_reward": -0.1430838778614998, "step": 75 }, { "clip_ratio": 0.0, - "completion_length": 2811.0001220703125, - "epoch": 0.08685714285714285, - "grad_norm": 0.2659956216812134, - "kl": 0.0014276504516601562, + "completion_length": 3283.2083740234375, + "epoch": 0.04342857142857143, + "grad_norm": 0.2106700837612152, + "kl": 0.0007944107055664062, "learning_rate": 9.926071618660237e-07, - "loss": 0.1683, - "num_tokens": 10701949.0, - "reward": 0.43999071419239044, - "reward_std": 0.5641827136278152, - "rewards/cosine_scaled_reward": 0.011662017554044724, - "rewards/format_reward": 0.4166666641831398, + "loss": 0.0367, + "num_tokens": 10504976.0, + "reward": -0.337122593075037, + "reward_std": 0.5403655767440796, + "rewards/cosine_scaled_reward": -0.1685612890869379, "step": 76 }, { "clip_ratio": 0.0, - "completion_length": 3142.1458740234375, - "epoch": 0.088, - "grad_norm": 0.2155051827430725, - "kl": 0.0018138885498046875, + "completion_length": 2804.854248046875, + "epoch": 0.044, + "grad_norm": 0.290350079536438, + "kl": 0.0010528564453125, "learning_rate": 9.9202926282791e-07, - "loss": 0.0556, - "num_tokens": 10860824.0, - "reward": 0.007742300629615784, - "reward_std": 0.5252507999539375, - "rewards/cosine_scaled_reward": -0.1107122041285038, - "rewards/format_reward": 0.2291666641831398, + "loss": -0.1465, + "num_tokens": 10645033.0, + "reward": -0.3672878537327051, + "reward_std": 0.4394787736237049, + "rewards/cosine_scaled_reward": -0.18364392640069127, "step": 77 }, { "clip_ratio": 0.0, - "completion_length": 2719.3333740234375, - "epoch": 0.08914285714285715, - "grad_norm": 0.27928948402404785, - "kl": 0.0011129379272460938, + "completion_length": 2761.9375610351562, + "epoch": 0.044571428571428574, + "grad_norm": 0.23817922174930573, + "kl": 0.0006437301635742188, "learning_rate": 9.91429819907136e-07, - "loss": 0.0509, - "num_tokens": 10998852.0, - "reward": 0.4107997752726078, - "reward_std": 0.7809228599071503, - "rewards/cosine_scaled_reward": -0.002933473326265812, - "rewards/format_reward": 0.41666666232049465, + "loss": 0.0617, + "num_tokens": 10783558.0, + "reward": 0.039801888167858124, + "reward_std": 0.8721425756812096, + "rewards/cosine_scaled_reward": 0.019900942221283913, "step": 78 }, { "clip_ratio": 0.0, - "completion_length": 2861.6875610351562, - "epoch": 0.09028571428571429, - "grad_norm": 0.24252408742904663, - "kl": 0.0010776519775390625, + "completion_length": 2478.0000610351562, + "epoch": 0.045142857142857144, + "grad_norm": 0.2438346892595291, + "kl": 0.0007276535034179688, "learning_rate": 9.908088623197048e-07, - "loss": 0.167, - "num_tokens": 11144973.0, - "reward": 0.06531679630279541, - "reward_std": 0.5613357946276665, - "rewards/cosine_scaled_reward": -0.17567493673413992, - "rewards/format_reward": 0.4166666641831398, + "loss": -0.0649, + "num_tokens": 10908622.0, + "reward": -0.009920487180352211, + "reward_std": 0.6310172341763973, + "rewards/cosine_scaled_reward": -0.00496023939922452, "step": 79 }, { "clip_ratio": 0.0, - "completion_length": 3362.5416870117188, - "epoch": 0.09142857142857143, - "grad_norm": 0.1729951947927475, - "kl": 0.0009174346923828125, + "completion_length": 3292.5208740234375, + "epoch": 0.045714285714285714, + "grad_norm": 0.19675898551940918, + "kl": 0.0007238388061523438, "learning_rate": 9.901664203302124e-07, - "loss": 0.0921, - "num_tokens": 11314865.0, - "reward": -0.06127368565648794, - "reward_std": 0.6489161625504494, - "rewards/cosine_scaled_reward": -0.1348035205155611, - "rewards/format_reward": 0.2083333358168602, + "loss": 0.028, + "num_tokens": 11073503.0, + "reward": -0.26976824924349785, + "reward_std": 0.44561611115932465, + "rewards/cosine_scaled_reward": -0.13488411717116833, "step": 80 }, { "clip_ratio": 0.0, - "completion_length": 2769.0625610351562, - "epoch": 0.09257142857142857, - "grad_norm": 0.20924705266952515, - "kl": 0.0013446807861328125, + "completion_length": 2928.7500610351562, + "epoch": 0.046285714285714284, + "grad_norm": 0.23351161181926727, + "kl": 0.0010251998901367188, "learning_rate": 9.895025252503755e-07, - "loss": 0.1263, - "num_tokens": 11455040.0, - "reward": 0.23385965824127197, - "reward_std": 0.7129553332924843, - "rewards/cosine_scaled_reward": -0.08098683506250381, - "rewards/format_reward": 0.3958333283662796, + "loss": -0.0533, + "num_tokens": 11219663.0, + "reward": 0.14206518977880478, + "reward_std": 0.47988787665963173, + "rewards/cosine_scaled_reward": 0.07103258091956377, "step": 81 }, { "clip_ratio": 0.0, - "completion_length": 2727.9584350585938, - "epoch": 0.09371428571428571, - "grad_norm": 0.26771607995033264, - "kl": 0.0012149810791015625, + "completion_length": 3216.3958740234375, + "epoch": 0.046857142857142854, + "grad_norm": 0.22799547016620636, + "kl": 0.0006895065307617188, "learning_rate": 9.888172094375033e-07, - "loss": 0.2245, - "num_tokens": 11593944.0, - "reward": 0.02113605911290506, - "reward_std": 0.5952873006463051, - "rewards/cosine_scaled_reward": -0.21859865076839924, - "rewards/format_reward": 0.4583333358168602, + "loss": 0.0032, + "num_tokens": 11380386.0, + "reward": -0.16227489709854126, + "reward_std": 0.33015402406454086, + "rewards/cosine_scaled_reward": -0.08113745599985123, "step": 82 }, { "clip_ratio": 0.0, - "completion_length": 3496.9375610351562, - "epoch": 0.09485714285714286, - "grad_norm": 0.20277726650238037, - "kl": 0.0009927749633789062, + "completion_length": 993.4583435058594, + "epoch": 0.04742857142857143, + "grad_norm": 0.551730751991272, + "kl": 0.0010633468627929688, "learning_rate": 9.881105062929221e-07, - "loss": 0.0322, - "num_tokens": 11770755.0, - "reward": -0.40173853002488613, - "reward_std": 0.4993053339421749, - "rewards/cosine_scaled_reward": -0.28420259058475494, - "rewards/format_reward": 0.16666666604578495, + "loss": 0.1338, + "num_tokens": 11432752.0, + "reward": 0.6598471999168396, + "reward_std": 0.16890107188373804, + "rewards/cosine_scaled_reward": 0.3299236036837101, "step": 83 }, { "clip_ratio": 0.0, - "completion_length": 2602.8959350585938, - "epoch": 0.096, - "grad_norm": 0.24264362454414368, - "kl": 0.0011615753173828125, + "completion_length": 1853.7083740234375, + "epoch": 0.048, + "grad_norm": 0.29992803931236267, + "kl": 0.0008602142333984375, "learning_rate": 9.873824502603459e-07, - "loss": 0.1134, - "num_tokens": 11903482.0, - "reward": 0.8407080993056297, - "reward_std": 0.7610376700758934, - "rewards/cosine_scaled_reward": 0.11827070452272892, - "rewards/format_reward": 0.6041666567325592, + "loss": 0.0295, + "num_tokens": 11527454.0, + "reward": -0.3242928695399314, + "reward_std": 0.7351822834461927, + "rewards/cosine_scaled_reward": -0.16214643290732056, "step": 84 }, { "clip_ratio": 0.0, - "completion_length": 3080.4375915527344, - "epoch": 0.09714285714285714, - "grad_norm": 0.1991293877363205, - "kl": 0.001140594482421875, + "completion_length": 2737.2083740234375, + "epoch": 0.04857142857142857, + "grad_norm": 0.31860730051994324, + "kl": 0.0012292861938476562, "learning_rate": 9.866330768241983e-07, - "loss": 0.008, - "num_tokens": 12058927.0, - "reward": 0.20353421010077, - "reward_std": 0.8020459171384573, - "rewards/cosine_scaled_reward": -0.06489956192672253, - "rewards/format_reward": 0.3333333395421505, + "loss": 0.0168, + "num_tokens": 11664108.0, + "reward": -0.3406180441379547, + "reward_std": 0.4295322969555855, + "rewards/cosine_scaled_reward": -0.17030901461839676, "step": 85 }, { "clip_ratio": 0.0, - "completion_length": 2708.541748046875, - "epoch": 0.09828571428571428, - "grad_norm": 0.24375155568122864, - "kl": 0.001434326171875, + "completion_length": 2586.541732788086, + "epoch": 0.04914285714285714, + "grad_norm": 0.3602101802825928, + "kl": 0.000606536865234375, "learning_rate": 9.85862422507884e-07, - "loss": 0.0966, - "num_tokens": 12197073.0, - "reward": 0.2201586167793721, - "reward_std": 0.3715602122247219, - "rewards/cosine_scaled_reward": -0.08783736452460289, - "rewards/format_reward": 0.3958333283662796, + "loss": -0.1276, + "num_tokens": 11794454.0, + "reward": 0.20340422540903091, + "reward_std": 0.9888063967227936, + "rewards/cosine_scaled_reward": 0.10170210711658001, "step": 86 }, { "clip_ratio": 0.0, - "completion_length": 2833.2500610351562, - "epoch": 0.09942857142857142, - "grad_norm": 0.2276199460029602, - "kl": 0.001941680908203125, + "completion_length": 2964.75, + "epoch": 0.04971428571428571, + "grad_norm": 0.3094576895236969, + "kl": 0.00102996826171875, "learning_rate": 9.850705248720068e-07, - "loss": 0.0738, - "num_tokens": 12340527.0, - "reward": -0.10197540372610092, - "reward_std": 0.6884967442601919, - "rewards/cosine_scaled_reward": -0.2384876972064376, - "rewards/format_reward": 0.3749999925494194, + "loss": -0.0531, + "num_tokens": 11943002.0, + "reward": 0.01601184532046318, + "reward_std": 0.4094504490494728, + "rewards/cosine_scaled_reward": 0.008005908690392971, "step": 87 }, { "clip_ratio": 0.0, - "completion_length": 2349.5833740234375, - "epoch": 0.10057142857142858, - "grad_norm": 0.2580890357494354, - "kl": 0.0018215179443359375, + "completion_length": 2426.1666870117188, + "epoch": 0.05028571428571429, + "grad_norm": 0.2704874277114868, + "kl": 0.001644134521484375, "learning_rate": 9.8425742251254e-07, - "loss": 0.0471, - "num_tokens": 12461275.0, - "reward": 0.8556020110845566, - "reward_std": 0.8413996547460556, - "rewards/cosine_scaled_reward": 0.13613433949649334, - "rewards/format_reward": 0.5833333358168602, + "loss": 0.0654, + "num_tokens": 12065278.0, + "reward": -0.40290449309395626, + "reward_std": 0.6259909272193909, + "rewards/cosine_scaled_reward": -0.20145224656153005, "step": 88 }, { "clip_ratio": 0.0, - "completion_length": 3218.3125610351562, - "epoch": 0.10171428571428572, - "grad_norm": 0.19695505499839783, - "kl": 0.00116729736328125, + "completion_length": 2142.1041870117188, + "epoch": 0.05085714285714286, + "grad_norm": 0.3306039273738861, + "kl": 0.00197601318359375, "learning_rate": 9.83423155058946e-07, - "loss": 0.0728, - "num_tokens": 12623878.0, - "reward": 0.16969604790210724, - "reward_std": 0.7643514573574066, - "rewards/cosine_scaled_reward": -0.050568655133247375, - "rewards/format_reward": 0.27083333395421505, + "loss": -0.1126, + "num_tokens": 12174291.0, + "reward": -0.23806674778461456, + "reward_std": 0.6727369725704193, + "rewards/cosine_scaled_reward": -0.11903337389230728, "step": 89 }, { "clip_ratio": 0.0, - "completion_length": 2965.916748046875, - "epoch": 0.10285714285714286, - "grad_norm": 0.23769812285900116, - "kl": 0.0013370513916015625, + "completion_length": 2580.7709350585938, + "epoch": 0.05142857142857143, + "grad_norm": 0.23774664103984833, + "kl": 0.0007190704345703125, "learning_rate": 9.825677631722435e-07, - "loss": 0.1051, - "num_tokens": 12774504.0, - "reward": -0.07332007400691509, - "reward_std": 0.7234293296933174, - "rewards/cosine_scaled_reward": -0.19291004166007042, - "rewards/format_reward": 0.3124999962747097, + "loss": 0.077, + "num_tokens": 12303664.0, + "reward": 0.3880194779485464, + "reward_std": 1.0538864731788635, + "rewards/cosine_scaled_reward": 0.1940097352489829, "step": 90 }, { "clip_ratio": 0.0, - "completion_length": 3094.291748046875, - "epoch": 0.104, - "grad_norm": 0.23373101651668549, - "kl": 0.0011157989501953125, + "completion_length": 2841.312530517578, + "epoch": 0.052, + "grad_norm": 0.24611423909664154, + "kl": 0.000865936279296875, "learning_rate": 9.816912885430258e-07, - "loss": 0.0653, - "num_tokens": 12931448.0, - "reward": -0.2179771214723587, - "reward_std": 0.5946699306368828, - "rewards/cosine_scaled_reward": -0.2652385588735342, - "rewards/format_reward": 0.31249999813735485, + "loss": 0.0599, + "num_tokens": 12445867.0, + "reward": -0.19608542323112488, + "reward_std": 0.42760632932186127, + "rewards/cosine_scaled_reward": -0.09804270416498184, "step": 91 }, { "clip_ratio": 0.0, - "completion_length": 2507.500030517578, - "epoch": 0.10514285714285715, - "grad_norm": 0.26020577549934387, - "kl": 0.001468658447265625, + "completion_length": 2743.6041717529297, + "epoch": 0.052571428571428575, + "grad_norm": 0.23887218534946442, + "kl": 0.0006628036499023438, "learning_rate": 9.807937738894303e-07, - "loss": 0.1937, - "num_tokens": 13060088.0, - "reward": 0.3611624091863632, - "reward_std": 0.5020711049437523, - "rewards/cosine_scaled_reward": -0.07983547076582909, - "rewards/format_reward": 0.5208333283662796, + "loss": 0.0646, + "num_tokens": 12584352.0, + "reward": 0.15040923655033112, + "reward_std": 0.25195283722132444, + "rewards/cosine_scaled_reward": 0.07520462200045586, "step": 92 }, { "clip_ratio": 0.0, - "completion_length": 2233.666717529297, - "epoch": 0.10628571428571429, - "grad_norm": 0.3289438486099243, - "kl": 0.001186370849609375, + "completion_length": 2130.0833435058594, + "epoch": 0.053142857142857144, + "grad_norm": 0.42298954725265503, + "kl": 0.0020599365234375, "learning_rate": 9.798752629550546e-07, - "loss": 0.1828, - "num_tokens": 13175656.0, - "reward": 0.38644066639244556, - "reward_std": 0.6418244391679764, - "rewards/cosine_scaled_reward": -0.09844633843749762, - "rewards/format_reward": 0.583333320915699, + "loss": 0.0779, + "num_tokens": 12692224.0, + "reward": -0.06379163265228271, + "reward_std": 0.3432878144085407, + "rewards/cosine_scaled_reward": -0.03189583122730255, "step": 93 }, { "clip_ratio": 0.0, - "completion_length": 3342.6666870117188, - "epoch": 0.10742857142857143, - "grad_norm": 0.22657889127731323, - "kl": 0.0014677047729492188, + "completion_length": 2588.2709350585938, + "epoch": 0.053714285714285714, + "grad_norm": 0.21005931496620178, + "kl": 0.0005979537963867188, "learning_rate": 9.78935800506826e-07, - "loss": 0.0397, - "num_tokens": 13344348.0, - "reward": -0.197869211435318, - "reward_std": 0.6262447759509087, - "rewards/cosine_scaled_reward": -0.1926846019923687, - "rewards/format_reward": 0.1875, + "loss": 0.0561, + "num_tokens": 12822065.0, + "reward": -0.2485465258359909, + "reward_std": 0.6237562894821167, + "rewards/cosine_scaled_reward": -0.12427325546741486, "step": 94 }, { "clip_ratio": 0.0, - "completion_length": 3180.7084350585938, - "epoch": 0.10857142857142857, - "grad_norm": 0.18802064657211304, - "kl": 0.0009918212890625, + "completion_length": 1483.5417175292969, + "epoch": 0.054285714285714284, + "grad_norm": 0.34288352727890015, + "kl": 0.0010595321655273438, "learning_rate": 9.779754323328192e-07, - "loss": -0.0051, - "num_tokens": 13506052.0, - "reward": -0.047770393546670675, - "reward_std": 0.6212666258215904, - "rewards/cosine_scaled_reward": -0.16971853002905846, - "rewards/format_reward": 0.29166666232049465, + "loss": 0.0743, + "num_tokens": 12899239.0, + "reward": 0.16759111359715462, + "reward_std": 0.9945086091756821, + "rewards/cosine_scaled_reward": 0.08379554376006126, "step": 95 }, { "clip_ratio": 0.0, - "completion_length": 2359.6459350585938, - "epoch": 0.10971428571428571, - "grad_norm": 0.3318856656551361, - "kl": 0.0027332305908203125, + "completion_length": 2968.5000610351562, + "epoch": 0.054857142857142854, + "grad_norm": 0.20499330759048462, + "kl": 0.0007104873657226562, "learning_rate": 9.769942052400235e-07, - "loss": 0.0385, - "num_tokens": 13626977.0, - "reward": 0.3386218100786209, - "reward_std": 0.5042477250099182, - "rewards/cosine_scaled_reward": -0.10152245499193668, - "rewards/format_reward": 0.5416666641831398, + "loss": 0.0123, + "num_tokens": 13047499.0, + "reward": -0.08611700683832169, + "reward_std": 0.7486424595117569, + "rewards/cosine_scaled_reward": -0.04305850435048342, "step": 96 }, { "clip_ratio": 0.0, - "completion_length": 3064.625030517578, - "epoch": 0.11085714285714286, - "grad_norm": 0.2832983434200287, - "kl": 0.0015583038330078125, + "completion_length": 2057.5625228881836, + "epoch": 0.05542857142857143, + "grad_norm": 0.3967874348163605, + "kl": 0.00153350830078125, "learning_rate": 9.759921670520634e-07, - "loss": 0.046, - "num_tokens": 13782221.0, - "reward": 0.006381474435329437, - "reward_std": 0.5968942120671272, - "rewards/cosine_scaled_reward": -0.12180926650762558, - "rewards/format_reward": 0.2499999962747097, + "loss": 0.0677, + "num_tokens": 13152466.0, + "reward": -0.5269366651773453, + "reward_std": 0.3624592386186123, + "rewards/cosine_scaled_reward": -0.26346831768751144, "step": 97 }, { "clip_ratio": 0.0, - "completion_length": 2421.041748046875, - "epoch": 0.112, - "grad_norm": 0.24057374894618988, - "kl": 0.001895904541015625, + "completion_length": 1952.1041717529297, + "epoch": 0.056, + "grad_norm": 0.3613995909690857, + "kl": 0.0018596649169921875, "learning_rate": 9.749693666068663e-07, - "loss": 0.0459, - "num_tokens": 13906513.0, - "reward": 0.23158405721187592, - "reward_std": 0.46807075664401054, - "rewards/cosine_scaled_reward": -0.12379130208864808, - "rewards/format_reward": 0.47916667349636555, + "loss": 0.0947, + "num_tokens": 13251459.0, + "reward": 0.9773776829242706, + "reward_std": 0.8529446795582771, + "rewards/cosine_scaled_reward": 0.4886888340115547, "step": 98 }, { "clip_ratio": 0.0, - "completion_length": 2506.291748046875, - "epoch": 0.11314285714285714, - "grad_norm": 0.2858019769191742, - "kl": 0.0012254714965820312, + "completion_length": 2277.0416717529297, + "epoch": 0.05657142857142857, + "grad_norm": 0.3505290150642395, + "kl": 0.0008687973022460938, "learning_rate": 9.739258537542835e-07, - "loss": 0.0971, - "num_tokens": 14034993.0, - "reward": 0.28322335705161095, - "reward_std": 0.4190603345632553, - "rewards/cosine_scaled_reward": -0.11880499497056007, - "rewards/format_reward": 0.5208333320915699, + "loss": 0.0603, + "num_tokens": 13366469.0, + "reward": 0.27346290089190006, + "reward_std": 0.47108355164527893, + "rewards/cosine_scaled_reward": 0.13673143601045012, "step": 99 }, { "clip_ratio": 0.0, - "completion_length": 2948.5625610351562, - "epoch": 0.11428571428571428, - "grad_norm": 0.2643907070159912, - "kl": 0.00148773193359375, + "completion_length": 1580.0625305175781, + "epoch": 0.05714285714285714, + "grad_norm": 0.36085861921310425, + "kl": 0.0011138916015625, "learning_rate": 9.728616793536587e-07, - "loss": -0.0572, - "num_tokens": 14184588.0, - "reward": 0.147983580827713, - "reward_std": 0.6056301072239876, - "rewards/cosine_scaled_reward": -0.09267487563192844, - "rewards/format_reward": 0.3333333358168602, + "loss": 0.0027, + "num_tokens": 13446920.0, + "reward": 0.4378054551780224, + "reward_std": 0.6341788824647665, + "rewards/cosine_scaled_reward": 0.2189027275890112, "step": 100 }, { "clip_ratio": 0.0, - "completion_length": 2414.666732788086, - "epoch": 0.11542857142857142, - "grad_norm": 0.2964082658290863, - "kl": 0.001434326171875, + "completion_length": 2714.8541717529297, + "epoch": 0.05771428571428571, + "grad_norm": 0.2826038599014282, + "kl": 0.0008282661437988281, "learning_rate": 9.717768952713511e-07, - "loss": 0.0541, - "num_tokens": 14308580.0, - "reward": 0.07409980893135071, - "reward_std": 0.4281482808291912, - "rewards/cosine_scaled_reward": -0.18170008901506662, - "rewards/format_reward": 0.43750000558793545, + "loss": 0.0606, + "num_tokens": 13583809.0, + "reward": -0.5446555614471436, + "reward_std": 0.35200726985931396, + "rewards/cosine_scaled_reward": -0.2723277732729912, "step": 101 }, { "clip_ratio": 0.0, - "completion_length": 2445.604217529297, - "epoch": 0.11657142857142858, - "grad_norm": 0.25845882296562195, - "kl": 0.0014829635620117188, + "completion_length": 2409.666717529297, + "epoch": 0.05828571428571429, + "grad_norm": 0.30025118589401245, + "kl": 0.0009298324584960938, "learning_rate": 9.706715543782064e-07, - "loss": 0.1582, - "num_tokens": 14433931.0, - "reward": 0.17498280853033066, - "reward_std": 0.7513352856040001, - "rewards/cosine_scaled_reward": -0.18334193527698517, - "rewards/format_reward": 0.5416666567325592, + "loss": -0.028, + "num_tokens": 13705845.0, + "reward": -0.1350867822766304, + "reward_std": 0.3358248174190521, + "rewards/cosine_scaled_reward": -0.0675433836877346, "step": 102 }, { "clip_ratio": 0.0, - "completion_length": 2614.7291870117188, - "epoch": 0.11771428571428572, - "grad_norm": 0.20373135805130005, - "kl": 0.0013742446899414062, + "completion_length": 2799.2708587646484, + "epoch": 0.05885714285714286, + "grad_norm": 0.2541813552379608, + "kl": 0.0008907318115234375, "learning_rate": 9.695457105469804e-07, - "loss": 0.0359, - "num_tokens": 14567022.0, - "reward": 0.029875081032514572, - "reward_std": 0.6425531953573227, - "rewards/cosine_scaled_reward": -0.19339579716324806, - "rewards/format_reward": 0.41666666977107525, + "loss": 0.0025, + "num_tokens": 13846618.0, + "reward": -0.6046699732542038, + "reward_std": 0.3693853598088026, + "rewards/cosine_scaled_reward": -0.3023349717259407, "step": 103 }, { "clip_ratio": 0.0, - "completion_length": 2409.0001220703125, - "epoch": 0.11885714285714286, - "grad_norm": 0.23686547577381134, - "kl": 0.0011281967163085938, + "completion_length": 2963.1458435058594, + "epoch": 0.05942857142857143, + "grad_norm": 0.2182713747024536, + "kl": 0.000762939453125, "learning_rate": 9.683994186497132e-07, - "loss": 0.106, - "num_tokens": 14690484.0, - "reward": 0.8152372092008591, - "reward_std": 0.6015111114829779, - "rewards/cosine_scaled_reward": 0.13678526505827904, - "rewards/format_reward": 0.5416666641831398, + "loss": 0.0303, + "num_tokens": 13995953.0, + "reward": -0.20170933986082673, + "reward_std": 0.5568380877375603, + "rewards/cosine_scaled_reward": -0.10085467004682869, "step": 104 }, { "clip_ratio": 0.0, - "completion_length": 2311.3125610351562, - "epoch": 0.12, - "grad_norm": 0.2990974187850952, - "kl": 0.00154876708984375, + "completion_length": 2755.1458740234375, + "epoch": 0.06, + "grad_norm": 0.26797640323638916, + "kl": 0.0011749267578125, "learning_rate": 9.672327345550543e-07, - "loss": 0.2331, - "num_tokens": 14809947.0, - "reward": 0.3171987719833851, - "reward_std": 0.437034510076046, - "rewards/cosine_scaled_reward": -0.11223394051194191, - "rewards/format_reward": 0.5416666641831398, + "loss": -0.0094, + "num_tokens": 14134524.0, + "reward": -0.6127043217420578, + "reward_std": 0.19923977181315422, + "rewards/cosine_scaled_reward": -0.3063521459698677, "step": 105 }, { "clip_ratio": 0.0, - "completion_length": 2736.7500610351562, - "epoch": 0.12114285714285715, - "grad_norm": 0.26455891132354736, - "kl": 0.002010345458984375, + "completion_length": 2781.0416717529297, + "epoch": 0.060571428571428575, + "grad_norm": 0.4417632222175598, + "kl": 0.0021905899047851562, "learning_rate": 9.66045715125541e-07, - "loss": 0.1562, - "num_tokens": 14948811.0, - "reward": 0.34779771137982607, - "reward_std": 0.6852256283164024, - "rewards/cosine_scaled_reward": -0.024017807096242905, - "rewards/format_reward": 0.3958333358168602, + "loss": 0.0892, + "num_tokens": 14274434.0, + "reward": -0.042052820324897766, + "reward_std": 0.555817510932684, + "rewards/cosine_scaled_reward": -0.021026406437158585, "step": 106 }, { "clip_ratio": 0.0, - "completion_length": 2184.2291870117188, - "epoch": 0.12228571428571429, - "grad_norm": 0.33748024702072144, - "kl": 0.0018157958984375, + "completion_length": 2453.125030517578, + "epoch": 0.061142857142857145, + "grad_norm": 0.2828550338745117, + "kl": 0.000919342041015625, "learning_rate": 9.648384182148252e-07, - "loss": -0.0665, - "num_tokens": 15061676.0, - "reward": 0.24546424997970462, - "reward_std": 0.5336676575243473, - "rewards/cosine_scaled_reward": -0.16893455758690834, - "rewards/format_reward": 0.5833333358168602, + "loss": -0.0085, + "num_tokens": 14397428.0, + "reward": -0.37871552258729935, + "reward_std": 0.4371586740016937, + "rewards/cosine_scaled_reward": -0.18935775943100452, "step": 107 }, { "clip_ratio": 0.0, - "completion_length": 2026.3126068115234, - "epoch": 0.12342857142857143, - "grad_norm": 0.34937456250190735, - "kl": 0.0019092559814453125, + "completion_length": 2142.5625, + "epoch": 0.061714285714285715, + "grad_norm": 0.30579039454460144, + "kl": 0.0007009506225585938, "learning_rate": 9.636109026648554e-07, - "loss": 0.1239, - "num_tokens": 15166709.0, - "reward": 0.8228390365839005, - "reward_std": 0.5794718265533447, - "rewards/cosine_scaled_reward": 0.10933613404631615, - "rewards/format_reward": 0.6041666623204947, + "loss": 0.0216, + "num_tokens": 14506031.0, + "reward": -0.2899063751101494, + "reward_std": 0.30772218108177185, + "rewards/cosine_scaled_reward": -0.1449531875550747, "step": 108 }, { "clip_ratio": 0.0, - "completion_length": 2952.7916870117188, - "epoch": 0.12457142857142857, - "grad_norm": 0.3034088611602783, - "kl": 0.0014963150024414062, + "completion_length": 3271.8750610351562, + "epoch": 0.062285714285714285, + "grad_norm": 0.212229385972023, + "kl": 0.0011157989501953125, "learning_rate": 9.623632283030077e-07, - "loss": -0.0444, - "num_tokens": 15316837.0, - "reward": -0.007000848650932312, - "reward_std": 0.5003913566470146, - "rewards/cosine_scaled_reward": -0.15975042153149843, - "rewards/format_reward": 0.31249999813735485, + "loss": -0.035, + "num_tokens": 14669513.0, + "reward": -0.5292222313582897, + "reward_std": 0.40466723032295704, + "rewards/cosine_scaled_reward": -0.26461111195385456, "step": 109 }, { "clip_ratio": 0.0, - "completion_length": 2271.3125610351562, - "epoch": 0.12571428571428572, - "grad_norm": 0.2639235854148865, - "kl": 0.002208709716796875, + "completion_length": 2759.3125, + "epoch": 0.06285714285714286, + "grad_norm": 0.25697484612464905, + "kl": 0.0011186599731445312, "learning_rate": 9.610954559391704e-07, - "loss": 0.1034, - "num_tokens": 15433678.0, - "reward": 1.0950873866677284, - "reward_std": 0.6892717778682709, - "rewards/cosine_scaled_reward": 0.17254368215799332, - "rewards/format_reward": 0.7499999850988388, + "loss": 0.0172, + "num_tokens": 14807984.0, + "reward": -0.4681231379508972, + "reward_std": 0.29983806796371937, + "rewards/cosine_scaled_reward": -0.23406155407428741, "step": 110 }, { "clip_ratio": 0.0, - "completion_length": 1646.6458740234375, - "epoch": 0.12685714285714286, - "grad_norm": 0.3545893728733063, - "kl": 0.002468109130859375, + "completion_length": 2785.4166717529297, + "epoch": 0.06342857142857143, + "grad_norm": 0.47920992970466614, + "kl": 0.009280204772949219, "learning_rate": 9.598076473627796e-07, - "loss": 0.1439, - "num_tokens": 15519989.0, - "reward": 0.6878980733454227, - "reward_std": 0.6134084016084671, - "rewards/cosine_scaled_reward": -0.02063431334681809, - "rewards/format_reward": 0.7291666641831398, + "loss": 0.0485, + "num_tokens": 14949280.0, + "reward": 0.15108218044042587, + "reward_std": 0.20054534077644348, + "rewards/cosine_scaled_reward": 0.07554109394550323, "step": 111 }, { "clip_ratio": 0.0, - "completion_length": 2729.0209045410156, - "epoch": 0.128, - "grad_norm": 0.22189046442508698, - "kl": 0.001827239990234375, + "completion_length": 2506.416748046875, + "epoch": 0.064, + "grad_norm": 0.26250529289245605, + "kl": 0.0015287399291992188, "learning_rate": 9.58499865339809e-07, - "loss": 0.067, - "num_tokens": 15658938.0, - "reward": 0.4110586680471897, - "reward_std": 0.5899315737187862, - "rewards/cosine_scaled_reward": -0.02363734319806099, - "rewards/format_reward": 0.4583333283662796, + "loss": -0.1711, + "num_tokens": 15075480.0, + "reward": -0.23084469139575958, + "reward_std": 0.5984194576740265, + "rewards/cosine_scaled_reward": -0.11542234569787979, "step": 112 }, { "clip_ratio": 0.0, - "completion_length": 1823.4375915527344, - "epoch": 0.12914285714285714, - "grad_norm": 0.2697267532348633, - "kl": 0.0023956298828125, + "completion_length": 2602.5208740234375, + "epoch": 0.06457142857142857, + "grad_norm": 0.424373596906662, + "kl": 0.0057659149169921875, "learning_rate": 9.571721736097088e-07, - "loss": 0.0572, - "num_tokens": 15754773.0, - "reward": 0.7300843407865614, - "reward_std": 0.41797127947211266, - "rewards/cosine_scaled_reward": 0.02129216119647026, - "rewards/format_reward": 0.6875, + "loss": 0.0718, + "num_tokens": 15206593.0, + "reward": -0.253767779096961, + "reward_std": 0.4538792669773102, + "rewards/cosine_scaled_reward": -0.12688388722017407, "step": 113 }, { "clip_ratio": 0.0, - "completion_length": 1685.7500610351562, - "epoch": 0.13028571428571428, - "grad_norm": 0.3274898827075958, - "kl": 0.0018157958984375, + "completion_length": 3077.0000610351562, + "epoch": 0.06514285714285714, + "grad_norm": 0.2078198343515396, + "kl": 0.0006704330444335938, "learning_rate": 9.55824636882301e-07, - "loss": 0.1355, - "num_tokens": 15843075.0, - "reward": 0.9840904623270035, - "reward_std": 0.6420945823192596, - "rewards/cosine_scaled_reward": 0.13787856203271076, - "rewards/format_reward": 0.708333320915699, + "loss": -0.0122, + "num_tokens": 15360649.0, + "reward": -0.15475063771009445, + "reward_std": 0.5910014770925045, + "rewards/cosine_scaled_reward": -0.07737531885504723, "step": 114 }, { "clip_ratio": 0.0, - "completion_length": 2409.3750610351562, - "epoch": 0.13142857142857142, - "grad_norm": 0.21988999843597412, - "kl": 0.0018787384033203125, + "completion_length": 2643.6875610351562, + "epoch": 0.06571428571428571, + "grad_norm": 0.28771933913230896, + "kl": 0.0011758804321289062, "learning_rate": 9.54457320834625e-07, - "loss": 0.1401, - "num_tokens": 15967587.0, - "reward": 0.5942223705351353, - "reward_std": 0.9110212624073029, - "rewards/cosine_scaled_reward": -0.03622216871008277, - "rewards/format_reward": 0.6666666567325592, + "loss": 0.0676, + "num_tokens": 15493570.0, + "reward": -0.19310491532087326, + "reward_std": 0.5526712536811829, + "rewards/cosine_scaled_reward": -0.09655245952308178, "step": 115 }, { "clip_ratio": 0.0, - "completion_length": 2412.041748046875, - "epoch": 0.13257142857142856, - "grad_norm": 0.2862938940525055, - "kl": 0.0075397491455078125, + "completion_length": 3344.7291870117188, + "epoch": 0.06628571428571428, + "grad_norm": 0.2425944209098816, + "kl": 0.0008716583251953125, "learning_rate": 9.530702921077358e-07, - "loss": 0.068, - "num_tokens": 16090931.0, - "reward": 0.35588742792606354, - "reward_std": 0.5233044624328613, - "rewards/cosine_scaled_reward": -0.07205628603696823, - "rewards/format_reward": 0.5000000074505806, + "loss": 0.0343, + "num_tokens": 15661161.0, + "reward": -0.7032309919595718, + "reward_std": 0.3479248844087124, + "rewards/cosine_scaled_reward": -0.351615484803915, "step": 116 }, { "clip_ratio": 0.0, - "completion_length": 3117.5208740234375, - "epoch": 0.1337142857142857, - "grad_norm": 0.17605924606323242, - "kl": 0.0014972686767578125, + "completion_length": 3084.4583740234375, + "epoch": 0.06685714285714285, + "grad_norm": 0.20971350371837616, + "kl": 0.0007829666137695312, "learning_rate": 9.516636183034564e-07, - "loss": 0.127, - "num_tokens": 16249248.0, - "reward": -0.08803762402385473, - "reward_std": 0.6678096652030945, - "rewards/cosine_scaled_reward": -0.18985214456915855, - "rewards/format_reward": 0.2916666679084301, + "loss": -0.0389, + "num_tokens": 15815755.0, + "reward": 0.0003270097076892853, + "reward_std": 0.43772435188293457, + "rewards/cosine_scaled_reward": 0.00016349367797374725, "step": 117 }, { "clip_ratio": 0.0, - "completion_length": 1823.6042175292969, - "epoch": 0.13485714285714287, - "grad_norm": 0.3110818564891815, - "kl": 0.0024127960205078125, + "completion_length": 2729.312530517578, + "epoch": 0.06742857142857143, + "grad_norm": 0.2613621950149536, + "kl": 0.001468658447265625, "learning_rate": 9.502373679810839e-07, - "loss": 0.0435, - "num_tokens": 16344899.0, - "reward": 0.8677586033008993, - "reward_std": 0.6636749655008316, - "rewards/cosine_scaled_reward": 0.058879293501377106, - "rewards/format_reward": 0.75, + "loss": 0.0463, + "num_tokens": 15952474.0, + "reward": -0.06591695547103882, + "reward_std": 0.39777015522122383, + "rewards/cosine_scaled_reward": -0.032958466559648514, "step": 118 }, { "clip_ratio": 0.0, - "completion_length": 1695.0000457763672, - "epoch": 0.136, - "grad_norm": 0.4480116665363312, - "kl": 0.002857208251953125, + "completion_length": 2081.7916717529297, + "epoch": 0.068, + "grad_norm": 0.4527161717414856, + "kl": 0.0013675689697265625, "learning_rate": 9.487916106540465e-07, - "loss": 0.2014, - "num_tokens": 16434299.0, - "reward": 0.3477291911840439, - "reward_std": 0.4658740572631359, - "rewards/cosine_scaled_reward": -0.16988542396575212, - "rewards/format_reward": 0.6874999925494194, + "loss": 0.0989, + "num_tokens": 16059348.0, + "reward": 0.35896405577659607, + "reward_std": 0.3922106046229601, + "rewards/cosine_scaled_reward": 0.17948202788829803, "step": 119 }, { "clip_ratio": 0.0, - "completion_length": 2670.5625610351562, - "epoch": 0.13714285714285715, - "grad_norm": 0.3484485149383545, - "kl": 0.0051021575927734375, + "completion_length": 2978.125, + "epoch": 0.06857142857142857, + "grad_norm": 0.2305285632610321, + "kl": 0.0010137557983398438, "learning_rate": 9.473264167865171e-07, - "loss": 0.1218, - "num_tokens": 16570394.0, - "reward": 0.19900877276086248, - "reward_std": 0.7815569303929806, - "rewards/cosine_scaled_reward": -0.17132895928807557, - "rewards/format_reward": 0.5416666641831398, + "loss": -0.0725, + "num_tokens": 16208058.0, + "reward": -0.6749820820987225, + "reward_std": 0.33936042711138725, + "rewards/cosine_scaled_reward": -0.33749102614820004, "step": 120 }, { "clip_ratio": 0.0, - "completion_length": 2795.354278564453, - "epoch": 0.1382857142857143, - "grad_norm": 0.26137691736221313, - "kl": 0.0018291473388671875, + "completion_length": 2772.125, + "epoch": 0.06914285714285714, + "grad_norm": 0.2650047242641449, + "kl": 0.0009822845458984375, "learning_rate": 9.458418577899774e-07, - "loss": 0.1001, - "num_tokens": 16712407.0, - "reward": 0.32655623741447926, - "reward_std": 0.5866650827229023, - "rewards/cosine_scaled_reward": -0.055471885949373245, - "rewards/format_reward": 0.4375000074505806, + "loss": -0.0085, + "num_tokens": 16347408.0, + "reward": -0.032290175557136536, + "reward_std": 0.4409598559141159, + "rewards/cosine_scaled_reward": -0.016145076602697372, "step": 121 }, { "clip_ratio": 0.0, - "completion_length": 2151.520965576172, - "epoch": 0.13942857142857143, - "grad_norm": 0.23685228824615479, - "kl": 0.0031890869140625, + "completion_length": 2094.479217529297, + "epoch": 0.06971428571428571, + "grad_norm": 0.24298027157783508, + "kl": 0.0012693405151367188, "learning_rate": 9.443380060197385e-07, - "loss": 0.0582, - "num_tokens": 16823474.0, - "reward": 0.03241742588579655, - "reward_std": 0.5829479023814201, - "rewards/cosine_scaled_reward": -0.28587461821734905, - "rewards/format_reward": 0.6041666716337204, + "loss": -0.0886, + "num_tokens": 16453067.0, + "reward": 0.5003323024138808, + "reward_std": 0.5531654357910156, + "rewards/cosine_scaled_reward": 0.2501661437563598, "step": 122 }, { "clip_ratio": 0.0, - "completion_length": 2567.7084350585938, - "epoch": 0.14057142857142857, - "grad_norm": 0.2046615034341812, - "kl": 0.0016498565673828125, + "completion_length": 1995.3958587646484, + "epoch": 0.07028571428571428, + "grad_norm": 0.2879078686237335, + "kl": 0.001132965087890625, "learning_rate": 9.428149347714143e-07, - "loss": 0.1672, - "num_tokens": 16954332.0, - "reward": 0.25633008778095245, - "reward_std": 0.868492528796196, - "rewards/cosine_scaled_reward": -0.13225161656737328, - "rewards/format_reward": 0.5208333358168602, + "loss": 0.069, + "num_tokens": 16554006.0, + "reward": -0.1732923611998558, + "reward_std": 0.6025716587901115, + "rewards/cosine_scaled_reward": -0.08664617873728275, "step": 123 }, { "clip_ratio": 0.0, - "completion_length": 2511.541748046875, - "epoch": 0.1417142857142857, - "grad_norm": 0.3218223750591278, - "kl": 0.0021820068359375, + "completion_length": 3321.0625, + "epoch": 0.07085714285714285, + "grad_norm": 0.18874195218086243, + "kl": 0.0006341934204101562, "learning_rate": 9.412727182773486e-07, - "loss": 0.0675, - "num_tokens": 17083004.0, - "reward": 0.7707539834082127, - "reward_std": 0.6489643305540085, - "rewards/cosine_scaled_reward": 0.11454363912343979, - "rewards/format_reward": 0.5416666753590107, + "loss": 0.0448, + "num_tokens": 16720257.0, + "reward": 0.0892313290387392, + "reward_std": 0.36883802339434624, + "rewards/cosine_scaled_reward": 0.0446156719699502, "step": 124 }, { "clip_ratio": 0.0, - "completion_length": 2831.3959350585938, - "epoch": 0.14285714285714285, - "grad_norm": 0.20118391513824463, - "kl": 0.0013217926025390625, + "completion_length": 1953.6458740234375, + "epoch": 0.07142857142857142, + "grad_norm": 0.34396979212760925, + "kl": 0.0025730133056640625, "learning_rate": 9.397114317029974e-07, - "loss": 0.1636, - "num_tokens": 17226939.0, - "reward": 0.01697085052728653, - "reward_std": 0.6512814313173294, - "rewards/cosine_scaled_reward": -0.18943122308701277, - "rewards/format_reward": 0.3958333283662796, + "loss": -0.0302, + "num_tokens": 16819864.0, + "reward": 0.04853908717632294, + "reward_std": 0.6620666459202766, + "rewards/cosine_scaled_reward": 0.024269558489322662, "step": 125 }, { "clip_ratio": 0.0, - "completion_length": 1912.6875457763672, - "epoch": 0.144, - "grad_norm": 0.3025752902030945, - "kl": 0.00395965576171875, + "completion_length": 1393.2708435058594, + "epoch": 0.072, + "grad_norm": 0.5013756155967712, + "kl": 0.011034011840820312, "learning_rate": 9.381311511432658e-07, - "loss": 0.0732, - "num_tokens": 17326566.0, - "reward": 0.29770641401410103, - "reward_std": 0.6499512940645218, - "rewards/cosine_scaled_reward": -0.1948967999778688, - "rewards/format_reward": 0.6875, + "loss": 0.1767, + "num_tokens": 16891889.0, + "reward": -0.013652913272380829, + "reward_std": 0.3805545046925545, + "rewards/cosine_scaled_reward": -0.006826456636190414, "step": 126 }, { "clip_ratio": 0.0, - "completion_length": 2401.1875610351562, - "epoch": 0.14514285714285713, - "grad_norm": 0.2488543838262558, - "kl": 0.001705169677734375, + "completion_length": 2377.8958435058594, + "epoch": 0.07257142857142856, + "grad_norm": 0.22264918684959412, + "kl": 0.0008573532104492188, "learning_rate": 9.36531953618799e-07, - "loss": 0.1578, - "num_tokens": 17449599.0, - "reward": 0.6098210737109184, - "reward_std": 0.6302774548530579, - "rewards/cosine_scaled_reward": 0.02366053406149149, - "rewards/format_reward": 0.5625000074505806, + "loss": 0.0667, + "num_tokens": 17012724.0, + "reward": 0.2236809842288494, + "reward_std": 0.4961891621351242, + "rewards/cosine_scaled_reward": 0.11184047814458609, "step": 127 }, { "clip_ratio": 0.0, - "completion_length": 2305.854248046875, - "epoch": 0.1462857142857143, - "grad_norm": 0.24932162463665009, - "kl": 0.00275421142578125, + "completion_length": 2564.7500610351562, + "epoch": 0.07314285714285715, + "grad_norm": 0.306539386510849, + "kl": 0.0009932518005371094, "learning_rate": 9.34913917072228e-07, - "loss": 0.1021, - "num_tokens": 17568806.0, - "reward": 0.21157516352832317, - "reward_std": 0.688478484749794, - "rewards/cosine_scaled_reward": -0.16504575312137604, - "rewards/format_reward": 0.5416666716337204, + "loss": -0.0337, + "num_tokens": 17141448.0, + "reward": -0.004740983247756958, + "reward_std": 0.3173178732395172, + "rewards/cosine_scaled_reward": -0.002370491623878479, "step": 128 }, { "clip_ratio": 0.0, - "completion_length": 1629.6250915527344, - "epoch": 0.14742857142857144, - "grad_norm": 0.31823137402534485, - "kl": 0.0034008026123046875, + "completion_length": 2082.166717529297, + "epoch": 0.07371428571428572, + "grad_norm": 0.3070489764213562, + "kl": 0.0011272430419921875, "learning_rate": 9.332771203643714e-07, - "loss": 0.0721, - "num_tokens": 17654918.0, - "reward": 0.7311810962855816, - "reward_std": 0.48786818608641624, - "rewards/cosine_scaled_reward": -0.009409455582499504, - "rewards/format_reward": 0.75, + "loss": -0.0082, + "num_tokens": 17247344.0, + "reward": -0.18208786100149155, + "reward_std": 0.5415353253483772, + "rewards/cosine_scaled_reward": -0.09104393050074577, "step": 129 }, { "clip_ratio": 0.0, - "completion_length": 2351.6250610351562, - "epoch": 0.14857142857142858, - "grad_norm": 0.22688433527946472, - "kl": 0.002315521240234375, + "completion_length": 1620.7291717529297, + "epoch": 0.07428571428571429, + "grad_norm": 0.3711329698562622, + "kl": 0.0028362274169921875, "learning_rate": 9.316216432703916e-07, - "loss": 0.1654, - "num_tokens": 17775866.0, - "reward": 0.1337121445685625, - "reward_std": 0.489741962403059, - "rewards/cosine_scaled_reward": -0.19356059283018112, - "rewards/format_reward": 0.5208333358168602, + "loss": 0.008, + "num_tokens": 17330167.0, + "reward": -0.04559193179011345, + "reward_std": 0.6725184172391891, + "rewards/cosine_scaled_reward": -0.022795964032411575, "step": 130 }, { "clip_ratio": 0.0, - "completion_length": 2512.2083740234375, - "epoch": 0.14971428571428572, - "grad_norm": 0.2089494913816452, - "kl": 0.002399444580078125, + "completion_length": 2907.8333587646484, + "epoch": 0.07485714285714286, + "grad_norm": 0.22048690915107727, + "kl": 0.0009622573852539062, "learning_rate": 9.299475664759068e-07, - "loss": 0.0292, - "num_tokens": 17904126.0, - "reward": 0.10406213253736496, - "reward_std": 0.556065134704113, - "rewards/cosine_scaled_reward": -0.22921893745660782, - "rewards/format_reward": 0.5624999962747097, + "loss": 0.0646, + "num_tokens": 17475575.0, + "reward": -0.6151376739144325, + "reward_std": 0.3845426104962826, + "rewards/cosine_scaled_reward": -0.30756882205605507, "step": 131 }, { "clip_ratio": 0.0, - "completion_length": 2390.916748046875, - "epoch": 0.15085714285714286, - "grad_norm": 0.25721561908721924, - "kl": 0.002170562744140625, + "completion_length": 2430.7083740234375, + "epoch": 0.07542857142857143, + "grad_norm": 0.22665299475193024, + "kl": 0.0009012222290039062, "learning_rate": 9.282549715730579e-07, - "loss": 0.1248, - "num_tokens": 18027062.0, - "reward": 0.03476526029407978, - "reward_std": 0.5311327800154686, - "rewards/cosine_scaled_reward": -0.22220071218907833, - "rewards/format_reward": 0.4791666641831398, + "loss": 0.0695, + "num_tokens": 17598141.0, + "reward": -0.2976334486156702, + "reward_std": 0.5366611061617732, + "rewards/cosine_scaled_reward": -0.14881671639159322, "step": 132 }, { "clip_ratio": 0.0, - "completion_length": 2429.3959350585938, - "epoch": 0.152, - "grad_norm": 0.24131183326244354, - "kl": 0.002513885498046875, + "completion_length": 2548.6458435058594, + "epoch": 0.076, + "grad_norm": 0.3216173052787781, + "kl": 0.0010004043579101562, "learning_rate": 9.265439410565328e-07, - "loss": 0.021, - "num_tokens": 18151617.0, - "reward": 0.32898006960749626, - "reward_std": 0.6633763536810875, - "rewards/cosine_scaled_reward": -0.15842663776129484, - "rewards/format_reward": 0.6458333432674408, + "loss": 0.0916, + "num_tokens": 17726752.0, + "reward": 0.17475611716508865, + "reward_std": 0.47678545862436295, + "rewards/cosine_scaled_reward": 0.08737805113196373, "step": 133 }, { "clip_ratio": 0.0, - "completion_length": 1656.7500457763672, - "epoch": 0.15314285714285714, - "grad_norm": 0.769141674041748, - "kl": 0.020517349243164062, + "completion_length": 3280.7500610351562, + "epoch": 0.07657142857142857, + "grad_norm": 0.17866647243499756, + "kl": 0.0007352828979492188, "learning_rate": 9.248145583195447e-07, - "loss": -0.092, - "num_tokens": 18238839.0, - "reward": 0.7126629631966352, - "reward_std": 0.7828906625509262, - "rewards/cosine_scaled_reward": -0.06033520896744449, - "rewards/format_reward": 0.8333333283662796, + "loss": -0.003, + "num_tokens": 17890468.0, + "reward": 0.4249248839914799, + "reward_std": 0.7319479957222939, + "rewards/cosine_scaled_reward": 0.21246242709457874, "step": 134 }, { "clip_ratio": 0.0, - "completion_length": 2022.3334045410156, - "epoch": 0.15428571428571428, - "grad_norm": 0.2641633152961731, - "kl": 0.002735137939453125, + "completion_length": 2719.062530517578, + "epoch": 0.07714285714285714, + "grad_norm": 0.34077975153923035, + "kl": 0.0010766983032226562, "learning_rate": 9.230669076497687e-07, - "loss": -0.0217, - "num_tokens": 18343621.0, - "reward": 0.44979627430438995, - "reward_std": 0.4202271206304431, - "rewards/cosine_scaled_reward": -0.0876018637791276, - "rewards/format_reward": 0.625, + "loss": 0.0249, + "num_tokens": 18027487.0, + "reward": -0.6892034411430359, + "reward_std": 0.43735230527818203, + "rewards/cosine_scaled_reward": -0.34460172057151794, "step": 135 }, { "clip_ratio": 0.0, - "completion_length": 1820.8958740234375, - "epoch": 0.15542857142857142, - "grad_norm": 0.40350502729415894, - "kl": 0.0032672882080078125, + "completion_length": 1582.0833587646484, + "epoch": 0.07771428571428571, + "grad_norm": 0.5860525965690613, + "kl": 0.0029449462890625, "learning_rate": 9.213010742252327e-07, - "loss": 0.1507, - "num_tokens": 18439412.0, - "reward": 0.5865317583084106, - "reward_std": 0.6874261423945427, - "rewards/cosine_scaled_reward": -0.07131745107471943, - "rewards/format_reward": 0.7291666641831398, + "loss": -0.1361, + "num_tokens": 18108455.0, + "reward": -0.3233466073870659, + "reward_std": 0.4398806467652321, + "rewards/cosine_scaled_reward": -0.1616733018308878, "step": 136 }, { "clip_ratio": 0.0, - "completion_length": 2143.875, - "epoch": 0.15657142857142858, - "grad_norm": 0.31186163425445557, - "kl": 0.003925323486328125, + "completion_length": 2537.8333435058594, + "epoch": 0.07828571428571429, + "grad_norm": 0.21415218710899353, + "kl": 0.000827789306640625, "learning_rate": 9.195171441101668e-07, - "loss": 0.0978, - "num_tokens": 18550394.0, - "reward": 0.24395663291215897, - "reward_std": 0.3794162981212139, - "rewards/cosine_scaled_reward": -0.1592716935556382, - "rewards/format_reward": 0.5625000055879354, + "loss": -0.0168, + "num_tokens": 18236475.0, + "reward": -0.059672433882951736, + "reward_std": 0.7180322706699371, + "rewards/cosine_scaled_reward": -0.02983621321618557, "step": 137 }, { "clip_ratio": 0.0, - "completion_length": 1822.8750457763672, - "epoch": 0.15771428571428572, - "grad_norm": 0.2998411953449249, - "kl": 0.0027942657470703125, + "completion_length": 2120.3750076293945, + "epoch": 0.07885714285714286, + "grad_norm": 0.29080891609191895, + "kl": 0.0012989044189453125, "learning_rate": 9.177152042508077e-07, - "loss": 0.0397, - "num_tokens": 18645914.0, - "reward": 0.8181012012064457, - "reward_std": 0.8359499275684357, - "rewards/cosine_scaled_reward": 0.04446728294715285, - "rewards/format_reward": 0.7291666716337204, + "loss": 0.0494, + "num_tokens": 18343401.0, + "reward": -0.08782655745744705, + "reward_std": 0.12043035682290792, + "rewards/cosine_scaled_reward": -0.04391326941549778, "step": 138 }, { "clip_ratio": 0.0, - "completion_length": 2293.8125610351562, - "epoch": 0.15885714285714286, - "grad_norm": 0.23238134384155273, - "kl": 0.0021343231201171875, + "completion_length": 2405.2291870117188, + "epoch": 0.07942857142857143, + "grad_norm": 0.22457921504974365, + "kl": 0.0009593963623046875, "learning_rate": 9.158953424711624e-07, - "loss": -0.0407, - "num_tokens": 18764201.0, - "reward": 0.44185843877494335, - "reward_std": 0.595511220395565, - "rewards/cosine_scaled_reward": -0.10198746342211962, - "rewards/format_reward": 0.645833333954215, + "loss": 0.0361, + "num_tokens": 18464648.0, + "reward": -0.41086670011281967, + "reward_std": 0.4968971386551857, + "rewards/cosine_scaled_reward": -0.20543334260582924, "step": 139 }, { "clip_ratio": 0.0, - "completion_length": 1963.7500305175781, - "epoch": 0.16, - "grad_norm": 0.32312485575675964, - "kl": 0.002685546875, + "completion_length": 2780.2291717529297, + "epoch": 0.08, + "grad_norm": 0.2105841189622879, + "kl": 0.0010890960693359375, "learning_rate": 9.140576474687263e-07, - "loss": -0.005, - "num_tokens": 18866069.0, - "reward": 0.7035277560353279, - "reward_std": 0.563681848347187, - "rewards/cosine_scaled_reward": -0.033652789890766144, - "rewards/format_reward": 0.7708333432674408, + "loss": 0.0146, + "num_tokens": 18604483.0, + "reward": 0.13771556317806244, + "reward_std": 0.30897051841020584, + "rewards/cosine_scaled_reward": 0.06885778903961182, "step": 140 }, { "clip_ratio": 0.0, - "completion_length": 1937.9792098999023, - "epoch": 0.16114285714285714, - "grad_norm": 0.336083322763443, - "kl": 0.002849578857421875, + "completion_length": 2002.2708740234375, + "epoch": 0.08057142857142857, + "grad_norm": 0.3650009334087372, + "kl": 0.002017974853515625, "learning_rate": 9.122022088101613e-07, - "loss": 0.0896, - "num_tokens": 18967072.0, - "reward": 0.6717538591474295, - "reward_std": 0.4805175382643938, - "rewards/cosine_scaled_reward": -0.007873063907027245, - "rewards/format_reward": 0.6874999962747097, + "loss": 0.0459, + "num_tokens": 18705872.0, + "reward": -0.379656158387661, + "reward_std": 0.5402833297848701, + "rewards/cosine_scaled_reward": -0.1898280642926693, "step": 141 }, { "clip_ratio": 0.0, - "completion_length": 1959.3125610351562, - "epoch": 0.16228571428571428, - "grad_norm": 0.3192788362503052, - "kl": 0.0032787322998046875, + "completion_length": 2488.7708740234375, + "epoch": 0.08114285714285714, + "grad_norm": 0.2864922285079956, + "kl": 0.001262664794921875, "learning_rate": 9.103291169269299e-07, - "loss": 0.0537, - "num_tokens": 19069219.0, - "reward": 0.5299720168113708, - "reward_std": 0.5246853530406952, - "rewards/cosine_scaled_reward": -0.026680664159357548, - "rewards/format_reward": 0.5833333283662796, + "loss": -0.0564, + "num_tokens": 18831261.0, + "reward": 0.011970575898885727, + "reward_std": 0.7028900012373924, + "rewards/cosine_scaled_reward": 0.0059852879494428635, "step": 142 }, { "clip_ratio": 0.0, - "completion_length": 2556.0000610351562, - "epoch": 0.16342857142857142, - "grad_norm": 0.23155175149440765, - "kl": 0.0023345947265625, + "completion_length": 2584.7084350585938, + "epoch": 0.08171428571428571, + "grad_norm": 0.3126732409000397, + "kl": 0.0011653900146484375, "learning_rate": 9.084384631108882e-07, - "loss": 0.0581, - "num_tokens": 19200019.0, - "reward": 0.5917239114642143, - "reward_std": 0.7362638562917709, - "rewards/cosine_scaled_reward": 0.014611944556236267, - "rewards/format_reward": 0.5625, + "loss": 0.0683, + "num_tokens": 18961471.0, + "reward": 0.24577251449227333, + "reward_std": 0.6674134684726596, + "rewards/cosine_scaled_reward": 0.12288625724613667, "step": 143 }, { "clip_ratio": 0.0, - "completion_length": 2280.416748046875, - "epoch": 0.16457142857142856, - "grad_norm": 0.27343228459358215, - "kl": 0.00260162353515625, + "completion_length": 2151.1041870117188, + "epoch": 0.08228571428571428, + "grad_norm": 0.37290921807289124, + "kl": 0.0016574859619140625, "learning_rate": 9.065303395098358e-07, - "loss": 0.1897, - "num_tokens": 19317651.0, - "reward": 0.27971339225769043, - "reward_std": 0.8346386849880219, - "rewards/cosine_scaled_reward": -0.17264331132173538, - "rewards/format_reward": 0.6250000111758709, + "loss": 0.0363, + "num_tokens": 19070388.0, + "reward": -0.1530948244035244, + "reward_std": 0.6015914604067802, + "rewards/cosine_scaled_reward": -0.076547397300601, "step": 144 }, { "clip_ratio": 0.0, - "completion_length": 1757.3125610351562, - "epoch": 0.1657142857142857, - "grad_norm": 0.31246596574783325, - "kl": 0.0029392242431640625, + "completion_length": 1968.2084197998047, + "epoch": 0.08285714285714285, + "grad_norm": 0.35119813680648804, + "kl": 0.002681732177734375, "learning_rate": 9.046048391230247e-07, - "loss": 0.1442, - "num_tokens": 19409574.0, - "reward": 0.28544900193810463, - "reward_std": 0.6451750323176384, - "rewards/cosine_scaled_reward": -0.23227551455056528, - "rewards/format_reward": 0.75, + "loss": -0.0567, + "num_tokens": 19169698.0, + "reward": -0.20278839766979218, + "reward_std": 0.47369804978370667, + "rewards/cosine_scaled_reward": -0.10139419510960579, "step": 145 }, { "clip_ratio": 0.0, - "completion_length": 1878.5000305175781, - "epoch": 0.16685714285714287, - "grad_norm": 0.27821478247642517, - "kl": 0.0026645660400390625, + "completion_length": 1638.3125457763672, + "epoch": 0.08342857142857144, + "grad_norm": 0.3501497507095337, + "kl": 0.0024051666259765625, "learning_rate": 9.026620557966279e-07, - "loss": -0.0058, - "num_tokens": 19507560.0, - "reward": 0.08767887763679028, - "reward_std": 0.2916584052145481, - "rewards/cosine_scaled_reward": -0.3311605527997017, - "rewards/format_reward": 0.75, + "loss": 0.009, + "num_tokens": 19253065.0, + "reward": 0.25620073080062866, + "reward_std": 0.8665501922369003, + "rewards/cosine_scaled_reward": 0.12810037285089493, "step": 146 }, { "clip_ratio": 0.0, - "completion_length": 1733.6250305175781, - "epoch": 0.168, - "grad_norm": 0.3348139822483063, - "kl": 0.002803802490234375, + "completion_length": 2041.5417175292969, + "epoch": 0.084, + "grad_norm": 0.29272228479385376, + "kl": 0.0031538009643554688, "learning_rate": 9.007020842191634e-07, - "loss": 0.0576, - "num_tokens": 19598754.0, - "reward": 0.6129366103559732, - "reward_std": 0.6426418013870716, - "rewards/cosine_scaled_reward": -0.07894838228821754, - "rewards/format_reward": 0.7708333283662796, + "loss": -0.0467, + "num_tokens": 19356735.0, + "reward": 0.015910595655441284, + "reward_std": 0.7093758508563042, + "rewards/cosine_scaled_reward": 0.007955307140946388, "step": 147 }, { "clip_ratio": 0.0, - "completion_length": 1734.5625457763672, - "epoch": 0.16914285714285715, - "grad_norm": 0.348442405462265, - "kl": 0.0042572021484375, + "completion_length": 1694.2500305175781, + "epoch": 0.08457142857142858, + "grad_norm": 0.36988532543182373, + "kl": 0.002410888671875, "learning_rate": 8.987250199168808e-07, - "loss": 0.0653, - "num_tokens": 19690329.0, - "reward": 0.09828764945268631, - "reward_std": 0.49646729975938797, - "rewards/cosine_scaled_reward": -0.32585618272423744, - "rewards/format_reward": 0.7500000111758709, + "loss": 0.0963, + "num_tokens": 19444131.0, + "reward": -0.25489600747823715, + "reward_std": 0.4693850204348564, + "rewards/cosine_scaled_reward": -0.12744800373911858, "step": 148 }, { "clip_ratio": 0.0, - "completion_length": 1818.1250915527344, - "epoch": 0.1702857142857143, - "grad_norm": 0.3159027695655823, - "kl": 0.003021240234375, + "completion_length": 1455.7917022705078, + "epoch": 0.08514285714285715, + "grad_norm": 0.3559054434299469, + "kl": 0.00274658203125, "learning_rate": 8.967309592491052e-07, - "loss": 0.1256, - "num_tokens": 19785447.0, - "reward": 0.9565939288586378, - "reward_std": 0.8120209276676178, - "rewards/cosine_scaled_reward": 0.10329693369567394, - "rewards/format_reward": 0.7499999925494194, + "loss": 0.0878, + "num_tokens": 19519601.0, + "reward": -0.10238776355981827, + "reward_std": 0.636756157502532, + "rewards/cosine_scaled_reward": -0.05119386687874794, "step": 149 }, { "clip_ratio": 0.0, - "completion_length": 1871.4584045410156, - "epoch": 0.17142857142857143, - "grad_norm": 0.30927422642707825, - "kl": 0.003932952880859375, + "completion_length": 2705.2500228881836, + "epoch": 0.08571428571428572, + "grad_norm": 0.3573072850704193, + "kl": 0.0050811767578125, "learning_rate": 8.9471999940354e-07, - "loss": 0.1203, - "num_tokens": 19883539.0, - "reward": 0.08374191913753748, - "reward_std": 0.42122264206409454, - "rewards/cosine_scaled_reward": -0.30187904462218285, - "rewards/format_reward": 0.6874999962747097, + "loss": 0.0188, + "num_tokens": 19655657.0, + "reward": -0.20403114520013332, + "reward_std": 0.5940973423421383, + "rewards/cosine_scaled_reward": -0.10201557632535696, "step": 150 }, { "clip_ratio": 0.0, - "completion_length": 1639.1459045410156, - "epoch": 0.17257142857142857, - "grad_norm": 0.31134167313575745, - "kl": 0.0028209686279296875, + "completion_length": 2180.812545776367, + "epoch": 0.08628571428571429, + "grad_norm": 0.2828698456287384, + "kl": 0.0015001296997070312, "learning_rate": 8.926922383915315e-07, - "loss": 0.0552, - "num_tokens": 19970774.0, - "reward": 0.5363836996257305, - "reward_std": 0.3387358784675598, - "rewards/cosine_scaled_reward": -0.11722482740879059, - "rewards/format_reward": 0.7708333283662796, + "loss": -0.0244, + "num_tokens": 19766528.0, + "reward": 0.11930293589830399, + "reward_std": 0.6097396910190582, + "rewards/cosine_scaled_reward": 0.05965147539973259, "step": 151 }, { "clip_ratio": 0.0, - "completion_length": 2605.3333740234375, - "epoch": 0.1737142857142857, - "grad_norm": 0.24991856515407562, - "kl": 0.0028591156005859375, + "completion_length": 2904.4166870117188, + "epoch": 0.08685714285714285, + "grad_norm": 0.26096129417419434, + "kl": 0.00130462646484375, "learning_rate": 8.906477750432903e-07, - "loss": 0.1381, - "num_tokens": 20104146.0, - "reward": 0.15796156786382198, - "reward_std": 0.6485603600740433, - "rewards/cosine_scaled_reward": -0.17101922258734703, - "rewards/format_reward": 0.5, + "loss": 0.1732, + "num_tokens": 19912480.0, + "reward": -0.3498671278357506, + "reward_std": 0.394152645021677, + "rewards/cosine_scaled_reward": -0.1749335676431656, "step": 152 }, { "clip_ratio": 0.0, - "completion_length": 2192.8333740234375, - "epoch": 0.17485714285714285, - "grad_norm": 0.2899620831012726, - "kl": 0.0022563934326171875, + "completion_length": 2408.5208740234375, + "epoch": 0.08742857142857142, + "grad_norm": 0.35171744227409363, + "kl": 0.0016956329345703125, "learning_rate": 8.88586709003076e-07, - "loss": -0.0387, - "num_tokens": 20217076.0, - "reward": 0.8120561987161636, - "reward_std": 0.8174077644944191, - "rewards/cosine_scaled_reward": 0.08311141841113567, - "rewards/format_reward": 0.6458333358168602, + "loss": 0.1131, + "num_tokens": 20034233.0, + "reward": 0.07599013298749924, + "reward_std": 0.41721872985363007, + "rewards/cosine_scaled_reward": 0.037995072081685066, "step": 153 }, { "clip_ratio": 0.0, - "completion_length": 2512.0625610351562, - "epoch": 0.176, - "grad_norm": 0.20951369404792786, - "kl": 0.001766204833984375, + "completion_length": 2731.3750610351562, + "epoch": 0.088, + "grad_norm": 0.26374199986457825, + "kl": 0.001861572265625, "learning_rate": 8.865091407243394e-07, - "loss": 0.1924, - "num_tokens": 20345389.0, - "reward": 0.715275889262557, - "reward_std": 0.9692183881998062, - "rewards/cosine_scaled_reward": 0.06597128417342901, - "rewards/format_reward": 0.5833333358168602, + "loss": -0.0558, + "num_tokens": 20171267.0, + "reward": -0.2839723117649555, + "reward_std": 0.6639672666788101, + "rewards/cosine_scaled_reward": -0.14198614470660686, "step": 154 }, { "clip_ratio": 0.0, - "completion_length": 2936.9376220703125, - "epoch": 0.17714285714285713, - "grad_norm": 0.22864274680614471, - "kl": 0.00200653076171875, + "completion_length": 2960.8958435058594, + "epoch": 0.08857142857142856, + "grad_norm": 0.27562668919563293, + "kl": 0.0019378662109375, "learning_rate": 8.844151714648274e-07, - "loss": 0.2167, - "num_tokens": 20494816.0, - "reward": -0.09320222213864326, - "reward_std": 0.5999226495623589, - "rewards/cosine_scaled_reward": -0.2132677833433263, - "rewards/format_reward": 0.3333333358168602, + "loss": -0.0435, + "num_tokens": 20318922.0, + "reward": -0.46872561052441597, + "reward_std": 0.42786915227770805, + "rewards/cosine_scaled_reward": -0.2343627940863371, "step": 155 }, { "clip_ratio": 0.0, - "completion_length": 2167.229217529297, - "epoch": 0.1782857142857143, - "grad_norm": 0.27711907029151917, - "kl": 0.003047943115234375, + "completion_length": 1580.7917404174805, + "epoch": 0.08914285714285715, + "grad_norm": 0.499097615480423, + "kl": 0.005889892578125, "learning_rate": 8.823049032816478e-07, - "loss": 0.126, - "num_tokens": 20606895.0, - "reward": 0.036196669563651085, - "reward_std": 0.3412875160574913, - "rewards/cosine_scaled_reward": -0.2944016717374325, - "rewards/format_reward": 0.625, + "loss": -0.0042, + "num_tokens": 20400236.0, + "reward": 0.6059545688331127, + "reward_std": 0.690888412296772, + "rewards/cosine_scaled_reward": 0.3029772713780403, "step": 156 }, { "clip_ratio": 0.0, - "completion_length": 2172.3333435058594, - "epoch": 0.17942857142857144, - "grad_norm": 0.29314175248146057, - "kl": 0.004207611083984375, + "completion_length": 2041.2708435058594, + "epoch": 0.08971428571428572, + "grad_norm": 0.4625336229801178, + "kl": 0.005961418151855469, "learning_rate": 8.801784390262943e-07, - "loss": 0.021, - "num_tokens": 20719723.0, - "reward": 0.9620806649327278, - "reward_std": 0.7838257402181625, - "rewards/cosine_scaled_reward": 0.1893736298661679, - "rewards/format_reward": 0.5833333283662796, + "loss": 0.1281, + "num_tokens": 20504961.0, + "reward": 0.08923859149217606, + "reward_std": 0.16652610152959824, + "rewards/cosine_scaled_reward": 0.04461930692195892, "step": 157 }, { "clip_ratio": 0.0, - "completion_length": 2577.8959350585938, - "epoch": 0.18057142857142858, - "grad_norm": 0.24172109365463257, - "kl": 0.003192901611328125, + "completion_length": 2572.312530517578, + "epoch": 0.09028571428571429, + "grad_norm": 0.25713205337524414, + "kl": 0.0018138885498046875, "learning_rate": 8.780358823396352e-07, - "loss": 0.0886, - "num_tokens": 20851364.0, - "reward": 0.09658388825482689, - "reward_std": 0.5904048159718513, - "rewards/cosine_scaled_reward": -0.23295804858207703, - "rewards/format_reward": 0.5625, + "loss": 0.0261, + "num_tokens": 20635176.0, + "reward": -0.24785784073174, + "reward_std": 0.6590173244476318, + "rewards/cosine_scaled_reward": -0.1239289166405797, "step": 158 }, { "clip_ratio": 0.0, - "completion_length": 1744.1459350585938, - "epoch": 0.18171428571428572, - "grad_norm": 0.28953787684440613, - "kl": 0.0030364990234375, + "completion_length": 3366.6875, + "epoch": 0.09085714285714286, + "grad_norm": 0.21494215726852417, + "kl": 0.00112152099609375, "learning_rate": 8.758773376468604e-07, - "loss": 0.1287, - "num_tokens": 20942583.0, - "reward": 0.7739498913288116, - "reward_std": 0.6133845373988152, - "rewards/cosine_scaled_reward": -0.008858396206051111, - "rewards/format_reward": 0.7916666716337204, + "loss": -0.0031, + "num_tokens": 20803401.0, + "reward": -0.47726341150701046, + "reward_std": 0.34933631122112274, + "rewards/cosine_scaled_reward": -0.23863169085234404, "step": 159 }, { "clip_ratio": 0.0, - "completion_length": 1813.291748046875, - "epoch": 0.18285714285714286, - "grad_norm": 0.28211480379104614, - "kl": 0.00348663330078125, + "completion_length": 3498.0208740234375, + "epoch": 0.09142857142857143, + "grad_norm": 0.19158728420734406, + "kl": 0.0010175704956054688, "learning_rate": 8.737029101523929e-07, - "loss": 0.1865, - "num_tokens": 21038333.0, - "reward": 0.3583908216096461, - "reward_std": 0.7735366523265839, - "rewards/cosine_scaled_reward": -0.1645545873325318, - "rewards/format_reward": 0.6875, + "loss": -0.0137, + "num_tokens": 20977630.0, + "reward": -0.23914687521755695, + "reward_std": 0.37878062576055527, + "rewards/cosine_scaled_reward": -0.11957343760877848, "step": 160 }, { "clip_ratio": 0.0, - "completion_length": 2659.5209350585938, - "epoch": 0.184, - "grad_norm": 0.2976882755756378, - "kl": 0.004169464111328125, + "completion_length": 2862.8959350585938, + "epoch": 0.092, + "grad_norm": 0.2647199034690857, + "kl": 0.002674102783203125, "learning_rate": 8.715127058347614e-07, - "loss": 0.1072, - "num_tokens": 21174012.0, - "reward": 0.19437413476407528, - "reward_std": 0.6651452034711838, - "rewards/cosine_scaled_reward": -0.16322960006073117, - "rewards/format_reward": 0.520833320915699, + "loss": 0.0488, + "num_tokens": 21120425.0, + "reward": -0.28584553534165025, + "reward_std": 0.7033149749040604, + "rewards/cosine_scaled_reward": -0.14292275649495423, "step": 161 }, { "clip_ratio": 0.0, - "completion_length": 1947.6458740234375, - "epoch": 0.18514285714285714, - "grad_norm": 0.2935653626918793, - "kl": 0.003204345703125, + "completion_length": 2344.0208740234375, + "epoch": 0.09257142857142857, + "grad_norm": 0.27368515729904175, + "kl": 0.0023193359375, "learning_rate": 8.693068314414344e-07, - "loss": 0.0734, - "num_tokens": 21275227.0, - "reward": 0.25041304528713226, - "reward_std": 0.5214099213480949, - "rewards/cosine_scaled_reward": -0.22896013781428337, - "rewards/format_reward": 0.7083333283662796, + "loss": 0.0967, + "num_tokens": 21238050.0, + "reward": 0.3530673161149025, + "reward_std": 0.8069795817136765, + "rewards/cosine_scaled_reward": 0.1765336561948061, "step": 162 }, { "clip_ratio": 0.0, - "completion_length": 1663.5833740234375, - "epoch": 0.18628571428571428, - "grad_norm": 0.4594109058380127, - "kl": 0.0039997100830078125, + "completion_length": 3091.416717529297, + "epoch": 0.09314285714285714, + "grad_norm": 0.2311529964208603, + "kl": 0.0021657943725585938, "learning_rate": 8.670853944836176e-07, - "loss": 0.0258, - "num_tokens": 21363017.0, - "reward": 0.5311151891946793, - "reward_std": 0.5461377911269665, - "rewards/cosine_scaled_reward": -0.07819239422678947, - "rewards/format_reward": 0.6875, + "loss": 0.0074, + "num_tokens": 21392414.0, + "reward": -0.3498072102665901, + "reward_std": 0.36920344084501266, + "rewards/cosine_scaled_reward": -0.17490360513329506, "step": 163 }, { "clip_ratio": 0.0, - "completion_length": 2301.812530517578, - "epoch": 0.18742857142857142, - "grad_norm": 0.33991318941116333, - "kl": 0.003398895263671875, + "completion_length": 3042.8126220703125, + "epoch": 0.09371428571428571, + "grad_norm": 0.22658418118953705, + "kl": 0.00127410888671875, "learning_rate": 8.648485032310144e-07, - "loss": 0.1005, - "num_tokens": 21480998.0, - "reward": 0.026660297065973282, - "reward_std": 0.3485817238688469, - "rewards/cosine_scaled_reward": -0.20541985146701336, - "rewards/format_reward": 0.4375, + "loss": 0.0962, + "num_tokens": 21544385.0, + "reward": 0.0001346580684185028, + "reward_std": 0.6785394810140133, + "rewards/cosine_scaled_reward": 6.732158362865448e-05, "step": 164 }, { "clip_ratio": 0.0, - "completion_length": 2174.479248046875, - "epoch": 0.18857142857142858, - "grad_norm": 0.3102648854255676, - "kl": 0.0030345916748046875, + "completion_length": 3391.354248046875, + "epoch": 0.09428571428571429, + "grad_norm": 0.2044445425271988, + "kl": 0.0012722015380859375, "learning_rate": 8.625962667065487e-07, - "loss": 0.1386, - "num_tokens": 21593905.0, - "reward": 0.4817277453839779, - "reward_std": 0.9409472942352295, - "rewards/cosine_scaled_reward": -0.07163612451404333, - "rewards/format_reward": 0.6250000074505806, + "loss": 0.0174, + "num_tokens": 21714214.0, + "reward": -0.43216387182474136, + "reward_std": 0.4510202333331108, + "rewards/cosine_scaled_reward": -0.2160819210112095, "step": 165 }, { "clip_ratio": 0.0, - "completion_length": 1578.7708740234375, - "epoch": 0.18971428571428572, - "grad_norm": 0.31517553329467773, - "kl": 0.0026702880859375, + "completion_length": 3543.7708740234375, + "epoch": 0.09485714285714286, + "grad_norm": 0.20412862300872803, + "kl": 0.0009775161743164062, "learning_rate": 8.603287946810513e-07, - "loss": 0.1302, - "num_tokens": 21677198.0, - "reward": 0.5695466273464262, - "reward_std": 0.2626556381583214, - "rewards/cosine_scaled_reward": -0.0902266874909401, - "rewards/format_reward": 0.75, + "loss": 0.0178, + "num_tokens": 21891155.0, + "reward": -0.5908009447157383, + "reward_std": 0.4042652491480112, + "rewards/cosine_scaled_reward": -0.2954004658386111, "step": 166 }, { "clip_ratio": 0.0, - "completion_length": 2458.0000610351562, - "epoch": 0.19085714285714286, - "grad_norm": 0.2574119567871094, - "kl": 0.0025386810302734375, + "completion_length": 1522.666732788086, + "epoch": 0.09542857142857143, + "grad_norm": 0.3152174651622772, + "kl": 0.00511932373046875, "learning_rate": 8.580461976679099e-07, - "loss": 0.1922, - "num_tokens": 21803210.0, - "reward": 0.16224310919642448, - "reward_std": 0.544936329126358, - "rewards/cosine_scaled_reward": -0.1584617868065834, - "rewards/format_reward": 0.47916666232049465, + "loss": -0.0191, + "num_tokens": 21969823.0, + "reward": 0.7545666880905628, + "reward_std": 0.8821780234575272, + "rewards/cosine_scaled_reward": 0.3772833216935396, "step": 167 }, { "clip_ratio": 0.0, - "completion_length": 1711.7291870117188, - "epoch": 0.192, - "grad_norm": 0.31856468319892883, - "kl": 0.00446319580078125, + "completion_length": 2335.8333587646484, + "epoch": 0.096, + "grad_norm": 0.33240070939064026, + "kl": 0.003902435302734375, "learning_rate": 8.557485869176825e-07, - "loss": -0.0714, - "num_tokens": 21892741.0, - "reward": 0.4072803081944585, - "reward_std": 0.6576224267482758, - "rewards/cosine_scaled_reward": -0.21302651334553957, - "rewards/format_reward": 0.8333333283662796, + "loss": 0.0588, + "num_tokens": 22087907.0, + "reward": -0.26911586057394743, + "reward_std": 0.5829970799386501, + "rewards/cosine_scaled_reward": -0.13455793377943337, "step": 168 }, { "clip_ratio": 0.0, - "completion_length": 2047.8750915527344, - "epoch": 0.19314285714285714, - "grad_norm": 0.3831358850002289, - "kl": 0.005199432373046875, + "completion_length": 1914.791748046875, + "epoch": 0.09657142857142857, + "grad_norm": 0.35050421953201294, + "kl": 0.005504608154296875, "learning_rate": 8.534360744126753e-07, - "loss": -0.0838, - "num_tokens": 21999667.0, - "reward": 1.2296884339302778, - "reward_std": 0.8351282328367233, - "rewards/cosine_scaled_reward": 0.22942754812538624, - "rewards/format_reward": 0.7708333358168602, + "loss": 0.1012, + "num_tokens": 22185385.0, + "reward": -0.21728778630495071, + "reward_std": 0.8187869340181351, + "rewards/cosine_scaled_reward": -0.10864389315247536, "step": 169 }, { "clip_ratio": 0.0, - "completion_length": 1698.7917175292969, - "epoch": 0.19428571428571428, - "grad_norm": 0.25520145893096924, - "kl": 0.00498199462890625, + "completion_length": 3486.1250610351562, + "epoch": 0.09714285714285714, + "grad_norm": 0.21712802350521088, + "kl": 0.0020456314086914062, "learning_rate": 8.511087728614862e-07, - "loss": 0.0237, - "num_tokens": 22089615.0, - "reward": 0.33604998141527176, - "reward_std": 0.4241115599870682, - "rewards/cosine_scaled_reward": -0.20697502605617046, - "rewards/format_reward": 0.75, + "loss": -0.0069, + "num_tokens": 22358287.0, + "reward": -0.2807197757065296, + "reward_std": 0.188842561095953, + "rewards/cosine_scaled_reward": -0.1403598841279745, "step": 170 }, { "clip_ratio": 0.0, - "completion_length": 1190.8541870117188, - "epoch": 0.19542857142857142, - "grad_norm": 0.35177505016326904, - "kl": 0.0034351348876953125, + "completion_length": 2643.2083435058594, + "epoch": 0.09771428571428571, + "grad_norm": 0.25833019614219666, + "kl": 0.0024633407592773438, "learning_rate": 8.487667956935087e-07, - "loss": 0.076, - "num_tokens": 22154102.0, - "reward": 0.4167235270142555, - "reward_std": 0.44075047969818115, - "rewards/cosine_scaled_reward": -0.23955491092056036, - "rewards/format_reward": 0.8958333283662796, + "loss": -0.1294, + "num_tokens": 22491605.0, + "reward": 0.01894190162420273, + "reward_std": 0.37402310594916344, + "rewards/cosine_scaled_reward": 0.009470956400036812, "step": 171 }, { "clip_ratio": 0.0, - "completion_length": 1618.5625305175781, - "epoch": 0.19657142857142856, - "grad_norm": 0.4714674949645996, - "kl": 0.01062774658203125, + "completion_length": 2838.8958740234375, + "epoch": 0.09828571428571428, + "grad_norm": 0.2890698313713074, + "kl": 0.0052337646484375, "learning_rate": 8.464102570534061e-07, - "loss": 0.1873, - "num_tokens": 22239383.0, - "reward": 0.2359453495591879, - "reward_std": 0.39740175753831863, - "rewards/cosine_scaled_reward": -0.24661065079271793, - "rewards/format_reward": 0.7291666641831398, + "loss": 0.0445, + "num_tokens": 22633668.0, + "reward": -0.39469024166464806, + "reward_std": 0.5306678973138332, + "rewards/cosine_scaled_reward": -0.19734511990100145, "step": 172 }, { "clip_ratio": 0.0, - "completion_length": 1911.3750457763672, - "epoch": 0.1977142857142857, - "grad_norm": 0.3989448845386505, - "kl": 0.003726959228515625, + "completion_length": 1792.7916870117188, + "epoch": 0.09885714285714285, + "grad_norm": 0.34095853567123413, + "kl": 0.0036678314208984375, "learning_rate": 8.440392717955475e-07, - "loss": 0.1805, - "num_tokens": 22339241.0, - "reward": 0.6558301709592342, - "reward_std": 0.5854435563087463, - "rewards/cosine_scaled_reward": -0.026251595467329025, - "rewards/format_reward": 0.7083333283662796, + "loss": 0.2055, + "num_tokens": 22724762.0, + "reward": -0.05847650859504938, + "reward_std": 0.7935373112559319, + "rewards/cosine_scaled_reward": -0.029238261049613357, "step": 173 }, { "clip_ratio": 0.0, - "completion_length": 1451.1250610351562, - "epoch": 0.19885714285714284, - "grad_norm": 0.2805970311164856, - "kl": 0.005706787109375, + "completion_length": 3167.6458740234375, + "epoch": 0.09942857142857142, + "grad_norm": 0.2256677895784378, + "kl": 0.00197601318359375, "learning_rate": 8.416539554784089e-07, - "loss": 0.2465, - "num_tokens": 22417001.0, - "reward": 0.3804206885397434, - "reward_std": 0.45497578382492065, - "rewards/cosine_scaled_reward": -0.2681229915469885, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0598, + "num_tokens": 22882653.0, + "reward": -0.525588646531105, + "reward_std": 0.44627620652318, + "rewards/cosine_scaled_reward": -0.2627943083643913, "step": 174 }, { "clip_ratio": 0.0, - "completion_length": 1849.6875457763672, - "epoch": 0.2, - "grad_norm": 0.2999878227710724, - "kl": 0.004802703857421875, + "completion_length": 1800.0000457763672, + "epoch": 0.1, + "grad_norm": 0.38881924748420715, + "kl": 0.008832931518554688, "learning_rate": 8.392544243589427e-07, - "loss": 0.109, - "num_tokens": 22513610.0, - "reward": 0.6369728110730648, - "reward_std": 0.5664872080087662, - "rewards/cosine_scaled_reward": -0.05651361867785454, - "rewards/format_reward": 0.75, + "loss": 0.0274, + "num_tokens": 22975461.0, + "reward": 0.41337180882692337, + "reward_std": 0.621935173869133, + "rewards/cosine_scaled_reward": 0.20668590441346169, "step": 175 }, { "clip_ratio": 0.0, - "completion_length": 1288.7916717529297, - "epoch": 0.20114285714285715, - "grad_norm": 0.40278714895248413, - "kl": 0.0055389404296875, + "completion_length": 2512.2084045410156, + "epoch": 0.10057142857142858, + "grad_norm": 0.24792905151844025, + "kl": 0.00196075439453125, "learning_rate": 8.368407953869103e-07, - "loss": 0.2272, - "num_tokens": 22583158.0, - "reward": 0.6769410446286201, - "reward_std": 0.5952249988913536, - "rewards/cosine_scaled_reward": -0.09902950003743172, - "rewards/format_reward": 0.875, + "loss": -0.0595, + "num_tokens": 23101543.0, + "reward": 0.18181858723983169, + "reward_std": 0.6874982379376888, + "rewards/cosine_scaled_reward": 0.09090929350350052, "step": 176 }, { "clip_ratio": 0.0, - "completion_length": 1662.1250305175781, - "epoch": 0.2022857142857143, - "grad_norm": 0.28330227732658386, - "kl": 0.00322723388671875, + "completion_length": 2633.8333740234375, + "epoch": 0.10114285714285715, + "grad_norm": 0.3130466043949127, + "kl": 0.001682281494140625, "learning_rate": 8.344131861991828e-07, - "loss": 0.1889, - "num_tokens": 22670926.0, - "reward": 0.352513425052166, - "reward_std": 0.5872365534305573, - "rewards/cosine_scaled_reward": -0.2299932837486267, - "rewards/format_reward": 0.8125, + "loss": 0.0074, + "num_tokens": 23234291.0, + "reward": 0.03136664628982544, + "reward_std": 0.5966363772749901, + "rewards/cosine_scaled_reward": 0.015683308243751526, "step": 177 }, { "clip_ratio": 0.0, - "completion_length": 1484.5208892822266, - "epoch": 0.20342857142857143, - "grad_norm": 0.39071711897850037, - "kl": 0.005340576171875, + "completion_length": 3336.229248046875, + "epoch": 0.10171428571428572, + "grad_norm": 0.17930154502391815, + "kl": 0.0008959770202636719, "learning_rate": 8.319717151140072e-07, - "loss": 0.2975, - "num_tokens": 22749749.0, - "reward": 1.0806448608636856, - "reward_std": 0.36274878680706024, - "rewards/cosine_scaled_reward": 0.12365575134754181, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0608, + "num_tokens": 23400322.0, + "reward": -0.12859635055065155, + "reward_std": 0.7636773735284805, + "rewards/cosine_scaled_reward": -0.06429817155003548, "step": 178 }, { "clip_ratio": 0.0, - "completion_length": 1618.3125610351562, - "epoch": 0.20457142857142857, - "grad_norm": 0.29320603609085083, - "kl": 0.003803253173828125, + "completion_length": 2887.4583435058594, + "epoch": 0.10228571428571429, + "grad_norm": 0.24900512397289276, + "kl": 0.0021228790283203125, "learning_rate": 8.295165011252396e-07, - "loss": 0.2516, - "num_tokens": 22835786.0, - "reward": 0.607761038467288, - "reward_std": 0.7367214486002922, - "rewards/cosine_scaled_reward": -0.11278617009520531, - "rewards/format_reward": 0.8333333283662796, + "loss": -0.0046, + "num_tokens": 23545100.0, + "reward": -0.3140909820795059, + "reward_std": 0.46572665125131607, + "rewards/cosine_scaled_reward": -0.15704548731446266, "step": 179 }, { "clip_ratio": 0.0, - "completion_length": 1773.6250610351562, - "epoch": 0.2057142857142857, - "grad_norm": 0.2515128254890442, - "kl": 0.003505706787109375, + "completion_length": 2308.4583435058594, + "epoch": 0.10285714285714286, + "grad_norm": 0.29636770486831665, + "kl": 0.0054950714111328125, "learning_rate": 8.270476638965461e-07, - "loss": 0.0654, - "num_tokens": 22928852.0, - "reward": 0.419520135037601, - "reward_std": 0.6586542651057243, - "rewards/cosine_scaled_reward": -0.1964899403974414, - "rewards/format_reward": 0.8125, + "loss": -0.0668, + "num_tokens": 23662218.0, + "reward": -0.2494659647345543, + "reward_std": 0.7115907035768032, + "rewards/cosine_scaled_reward": -0.124732980504632, "step": 180 }, { "clip_ratio": 0.0, - "completion_length": 1947.354248046875, - "epoch": 0.20685714285714285, - "grad_norm": 0.25565311312675476, - "kl": 0.0033130645751953125, + "completion_length": 2766.729232788086, + "epoch": 0.10342857142857143, + "grad_norm": 0.23259904980659485, + "kl": 0.005695343017578125, "learning_rate": 8.245653237555705e-07, - "loss": 0.0849, - "num_tokens": 23030425.0, - "reward": 0.3168973168358207, - "reward_std": 0.6736587360501289, - "rewards/cosine_scaled_reward": -0.20613467320799828, - "rewards/format_reward": 0.7291666567325592, + "loss": -0.0073, + "num_tokens": 23801273.0, + "reward": -0.15345774590969086, + "reward_std": 0.5345512442290783, + "rewards/cosine_scaled_reward": -0.07672888785600662, "step": 181 }, { "clip_ratio": 0.0, - "completion_length": 1263.9375457763672, - "epoch": 0.208, - "grad_norm": 0.33973386883735657, - "kl": 0.00634002685546875, + "completion_length": 3253.6458740234375, + "epoch": 0.104, + "grad_norm": 0.23866048455238342, + "kl": 0.0014324188232421875, "learning_rate": 8.220696016880687e-07, - "loss": 0.0838, - "num_tokens": 23098498.0, - "reward": 0.6266386806964874, - "reward_std": 0.1899520792067051, - "rewards/cosine_scaled_reward": -0.12418065406382084, - "rewards/format_reward": 0.875, + "loss": 0.0049, + "num_tokens": 23964000.0, + "reward": -0.5079273246228695, + "reward_std": 0.6336031965911388, + "rewards/cosine_scaled_reward": -0.2539636502042413, "step": 182 }, { "clip_ratio": 0.0, - "completion_length": 2050.9584350585938, - "epoch": 0.20914285714285713, - "grad_norm": 0.382978230714798, - "kl": 0.004489898681640625, + "completion_length": 2357.2291870117188, + "epoch": 0.10457142857142857, + "grad_norm": 0.31123295426368713, + "kl": 0.00495147705078125, "learning_rate": 8.195606193320136e-07, - "loss": 0.2785, - "num_tokens": 23204696.0, - "reward": 0.052440449595451355, - "reward_std": 0.3785889223217964, - "rewards/cosine_scaled_reward": -0.3175297752022743, - "rewards/format_reward": 0.6875000149011612, + "loss": 0.1906, + "num_tokens": 24084575.0, + "reward": -0.5395753756165504, + "reward_std": 0.2657657843083143, + "rewards/cosine_scaled_reward": -0.2697876766324043, "step": 183 }, { "clip_ratio": 0.0, - "completion_length": 1581.916748046875, - "epoch": 0.2102857142857143, - "grad_norm": 0.29861006140708923, - "kl": 0.0050201416015625, + "completion_length": 1866.9792175292969, + "epoch": 0.10514285714285715, + "grad_norm": 0.3134949803352356, + "kl": 0.00649261474609375, "learning_rate": 8.170384989716657e-07, - "loss": 0.0973, - "num_tokens": 23288308.0, - "reward": 1.0160801857709885, - "reward_std": 0.9725418835878372, - "rewards/cosine_scaled_reward": 0.07054009102284908, - "rewards/format_reward": 0.875, + "loss": -0.0113, + "num_tokens": 24179290.0, + "reward": -0.09402483701705933, + "reward_std": 0.6055268943309784, + "rewards/cosine_scaled_reward": -0.047012414783239365, "step": 184 }, { "clip_ratio": 0.0, - "completion_length": 1973.0625610351562, - "epoch": 0.21142857142857144, - "grad_norm": 0.2891089916229248, - "kl": 0.005382537841796875, + "completion_length": 2454.770835876465, + "epoch": 0.10571428571428572, + "grad_norm": 0.44388100504875183, + "kl": 0.001873016357421875, "learning_rate": 8.145033635316128e-07, - "loss": 0.1131, - "num_tokens": 23390791.0, - "reward": 0.47157616540789604, - "reward_std": 0.8375265002250671, - "rewards/cosine_scaled_reward": -0.12879525747848675, - "rewards/format_reward": 0.7291666641831398, + "loss": 0.0984, + "num_tokens": 24301895.0, + "reward": -0.41640862822532654, + "reward_std": 0.4775813100859523, + "rewards/cosine_scaled_reward": -0.20820431411266327, "step": 185 }, { "clip_ratio": 0.0, - "completion_length": 1952.8959197998047, - "epoch": 0.21257142857142858, - "grad_norm": 0.2916695475578308, - "kl": 0.004917144775390625, + "completion_length": 2527.3750915527344, + "epoch": 0.10628571428571429, + "grad_norm": 0.22966258227825165, + "kl": 0.00251007080078125, "learning_rate": 8.119553365707802e-07, - "loss": 0.122, - "num_tokens": 23492504.0, - "reward": 0.6901115328073502, - "reward_std": 0.6558038257062435, - "rewards/cosine_scaled_reward": -0.019527582451701164, - "rewards/format_reward": 0.7291666716337204, + "loss": 0.0335, + "num_tokens": 24431105.0, + "reward": 0.3885076344013214, + "reward_std": 0.8602791130542755, + "rewards/cosine_scaled_reward": 0.1942538060247898, "step": 186 }, { "clip_ratio": 0.0, - "completion_length": 1829.1041870117188, - "epoch": 0.21371428571428572, - "grad_norm": 0.26562783122062683, - "kl": 0.005084991455078125, + "completion_length": 3435.3958740234375, + "epoch": 0.10685714285714286, + "grad_norm": 0.22783932089805603, + "kl": 0.001224517822265625, "learning_rate": 8.093945422764069e-07, - "loss": 0.1011, - "num_tokens": 23589427.0, - "reward": 1.0519462451338768, - "reward_std": 0.8125685751438141, - "rewards/cosine_scaled_reward": 0.09888977278023958, - "rewards/format_reward": 0.8541666716337204, + "loss": -0.0104, + "num_tokens": 24602028.0, + "reward": -0.35544631630182266, + "reward_std": 0.3423473574221134, + "rewards/cosine_scaled_reward": -0.17772315442562103, "step": 187 }, { "clip_ratio": 0.0, - "completion_length": 1543.3541870117188, - "epoch": 0.21485714285714286, - "grad_norm": 0.28853264451026917, - "kl": 0.0056972503662109375, + "completion_length": 2700.3125610351562, + "epoch": 0.10742857142857143, + "grad_norm": 0.2991270422935486, + "kl": 0.003947257995605469, "learning_rate": 8.068211054579943e-07, - "loss": 0.1731, - "num_tokens": 23671032.0, - "reward": 0.8557066395878792, - "reward_std": 0.9914733618497849, - "rewards/cosine_scaled_reward": 0.032019972801208496, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.1736, + "num_tokens": 24738075.0, + "reward": 0.23966709151864052, + "reward_std": 0.7574465498328209, + "rewards/cosine_scaled_reward": 0.11983353085815907, "step": 188 }, { "clip_ratio": 0.0, - "completion_length": 1163.1042022705078, - "epoch": 0.216, - "grad_norm": 0.3592197000980377, - "kl": 0.006622314453125, + "completion_length": 3063.5001220703125, + "epoch": 0.108, + "grad_norm": 0.21379563212394714, + "kl": 0.0010519027709960938, "learning_rate": 8.04235151541222e-07, - "loss": 0.1053, - "num_tokens": 23734703.0, - "reward": 0.6029985174536705, - "reward_std": 0.4228066951036453, - "rewards/cosine_scaled_reward": -0.16725075244903564, - "rewards/format_reward": 0.9375, + "loss": 0.0489, + "num_tokens": 24892311.0, + "reward": -0.047808293253183365, + "reward_std": 0.9623388051986694, + "rewards/cosine_scaled_reward": -0.023904146626591682, "step": 189 }, { "clip_ratio": 0.0, - "completion_length": 1680.9584197998047, - "epoch": 0.21714285714285714, - "grad_norm": 0.2913980782032013, - "kl": 0.00589752197265625, + "completion_length": 3313.4583740234375, + "epoch": 0.10857142857142857, + "grad_norm": 0.20801162719726562, + "kl": 0.001056671142578125, "learning_rate": 8.01636806561836e-07, - "loss": 0.1223, - "num_tokens": 23823057.0, - "reward": 0.45007142052054405, - "reward_std": 0.5233389809727669, - "rewards/cosine_scaled_reward": -0.191630975343287, - "rewards/format_reward": 0.8333333283662796, + "loss": 0.1006, + "num_tokens": 25058197.0, + "reward": -0.4445067085325718, + "reward_std": 0.3523157760500908, + "rewards/cosine_scaled_reward": -0.2222533505409956, "step": 190 }, { "clip_ratio": 0.0, - "completion_length": 1796.8333740234375, - "epoch": 0.21828571428571428, - "grad_norm": 0.29226669669151306, - "kl": 0.00363922119140625, + "completion_length": 2248.000045776367, + "epoch": 0.10914285714285714, + "grad_norm": 0.45326536893844604, + "kl": 0.003253936767578125, "learning_rate": 7.990261971595048e-07, - "loss": 0.1848, - "num_tokens": 23917189.0, - "reward": 0.3450898602604866, - "reward_std": 0.7155111283063889, - "rewards/cosine_scaled_reward": -0.202455073595047, - "rewards/format_reward": 0.7499999962747097, + "loss": 0.0083, + "num_tokens": 25171705.0, + "reward": -0.3526854105293751, + "reward_std": 0.45292405039072037, + "rewards/cosine_scaled_reward": -0.17634270247071981, "step": 191 }, { "clip_ratio": 0.0, - "completion_length": 2041.4583740234375, - "epoch": 0.21942857142857142, - "grad_norm": 0.38081520795822144, - "kl": 0.006256103515625, + "completion_length": 2677.645896911621, + "epoch": 0.10971428571428571, + "grad_norm": 0.26036742329597473, + "kl": 0.008434295654296875, "learning_rate": 7.964034505716476e-07, - "loss": 0.0669, - "num_tokens": 24022817.0, - "reward": 0.24144138023257256, - "reward_std": 0.6051021218299866, - "rewards/cosine_scaled_reward": -0.17094597034156322, - "rewards/format_reward": 0.5833333432674408, + "loss": 0.0529, + "num_tokens": 25305920.0, + "reward": 0.11686116084456444, + "reward_std": 0.6998385563492775, + "rewards/cosine_scaled_reward": 0.0584305664524436, "step": 192 }, { "clip_ratio": 0.0, - "completion_length": 1747.8125305175781, - "epoch": 0.22057142857142858, - "grad_norm": 0.41225576400756836, - "kl": 0.00653076171875, + "completion_length": 3415.604248046875, + "epoch": 0.11028571428571429, + "grad_norm": 0.17603135108947754, + "kl": 0.0010662078857421875, "learning_rate": 7.93768694627233e-07, - "loss": 0.0997, - "num_tokens": 24114524.0, - "reward": 0.5901122093200684, - "reward_std": 0.49861879646778107, - "rewards/cosine_scaled_reward": -0.07994389347732067, - "rewards/format_reward": 0.7500000074505806, + "loss": 0.0194, + "num_tokens": 25476889.0, + "reward": -0.3952821143902838, + "reward_std": 0.296985674649477, + "rewards/cosine_scaled_reward": -0.1976410405477509, "step": 193 }, { "clip_ratio": 0.0, - "completion_length": 1069.3958587646484, - "epoch": 0.22171428571428572, - "grad_norm": 0.36986562609672546, - "kl": 0.0089263916015625, + "completion_length": 2293.4167098999023, + "epoch": 0.11085714285714286, + "grad_norm": 0.3645835220813751, + "kl": 0.006656646728515625, "learning_rate": 7.911220577405484e-07, - "loss": 0.1094, - "num_tokens": 24173427.0, - "reward": 1.4875798523426056, - "reward_std": 0.6242154352366924, - "rewards/cosine_scaled_reward": 0.28545656986534595, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0665, + "num_tokens": 25592205.0, + "reward": 0.070842613466084, + "reward_std": 0.631361898034811, + "rewards/cosine_scaled_reward": 0.03542129183188081, "step": 194 }, { "clip_ratio": 0.0, - "completion_length": 1081.3750305175781, - "epoch": 0.22285714285714286, - "grad_norm": 0.3711891770362854, - "kl": 0.00726318359375, + "completion_length": 2965.6458740234375, + "epoch": 0.11142857142857143, + "grad_norm": 0.24365754425525665, + "kl": 0.0016307830810546875, "learning_rate": 7.884636689049422e-07, - "loss": 0.2086, - "num_tokens": 24233181.0, - "reward": 1.04611661657691, - "reward_std": 0.5401172246783972, - "rewards/cosine_scaled_reward": 0.05430831015110016, - "rewards/format_reward": 0.9375, + "loss": 0.0612, + "num_tokens": 25740496.0, + "reward": -0.3104175217449665, + "reward_std": 0.5912666730582714, + "rewards/cosine_scaled_reward": -0.15520876459777355, "step": 195 }, { "clip_ratio": 0.0, - "completion_length": 1984.6459045410156, - "epoch": 0.224, - "grad_norm": 0.3469684422016144, - "kl": 0.0052490234375, + "completion_length": 2565.3750228881836, + "epoch": 0.112, + "grad_norm": 0.2600879669189453, + "kl": 0.00702667236328125, "learning_rate": 7.857936576865356e-07, - "loss": 0.1803, - "num_tokens": 24336388.0, - "reward": 0.38754457980394363, - "reward_std": 0.5967799983918667, - "rewards/cosine_scaled_reward": -0.13956104964017868, - "rewards/format_reward": 0.666666679084301, + "loss": -0.0485, + "num_tokens": 25869826.0, + "reward": -0.014622047543525696, + "reward_std": 0.34931765496730804, + "rewards/cosine_scaled_reward": -0.007311023771762848, "step": 196 }, { "clip_ratio": 0.0, - "completion_length": 1723.8334045410156, - "epoch": 0.22514285714285714, - "grad_norm": 0.3208644986152649, - "kl": 0.005466461181640625, + "completion_length": 2068.354202270508, + "epoch": 0.11257142857142857, + "grad_norm": 0.48206502199172974, + "kl": 0.0059070587158203125, "learning_rate": 7.831121542179086e-07, - "loss": 0.1018, - "num_tokens": 24427034.0, - "reward": 0.13043999671936035, - "reward_std": 0.36407894641160965, - "rewards/cosine_scaled_reward": -0.2785299941897392, - "rewards/format_reward": 0.6875000055879354, + "loss": 0.1763, + "num_tokens": 25974819.0, + "reward": -0.04546727240085602, + "reward_std": 0.39029328897595406, + "rewards/cosine_scaled_reward": -0.022733643651008606, "step": 197 }, { "clip_ratio": 0.0, - "completion_length": 2133.6875610351562, - "epoch": 0.22628571428571428, - "grad_norm": 0.26503321528434753, - "kl": 0.00514984130859375, + "completion_length": 1956.2708740234375, + "epoch": 0.11314285714285714, + "grad_norm": 0.46816733479499817, + "kl": 0.0056858062744140625, "learning_rate": 7.804192891917571e-07, - "loss": 0.0816, - "num_tokens": 24537677.0, - "reward": 0.20322639122605324, - "reward_std": 0.41373175010085106, - "rewards/cosine_scaled_reward": -0.22130347788333893, - "rewards/format_reward": 0.6458333432674408, + "loss": 0.0688, + "num_tokens": 26075332.0, + "reward": 0.2702416032552719, + "reward_std": 0.3293491117656231, + "rewards/cosine_scaled_reward": 0.13512080535292625, "step": 198 }, { "clip_ratio": 0.0, - "completion_length": 1842.7500915527344, - "epoch": 0.22742857142857142, - "grad_norm": 0.3147067427635193, - "kl": 0.00490570068359375, + "completion_length": 3000.666748046875, + "epoch": 0.11371428571428571, + "grad_norm": 0.2764574885368347, + "kl": 0.0014142990112304688, "learning_rate": 7.777151938545235e-07, - "loss": 0.116, - "num_tokens": 24634271.0, - "reward": 0.8186136335134506, - "reward_std": 0.7792095839977264, - "rewards/cosine_scaled_reward": 0.013473461382091045, - "rewards/format_reward": 0.7916666716337204, + "loss": -0.0134, + "num_tokens": 26224608.0, + "reward": 0.04078002646565437, + "reward_std": 0.5825254544615746, + "rewards/cosine_scaled_reward": 0.020389998331665993, "step": 199 }, { "clip_ratio": 0.0, - "completion_length": 1465.3750305175781, - "epoch": 0.22857142857142856, - "grad_norm": 0.4409931004047394, - "kl": 0.00621795654296875, + "completion_length": 2605.0625915527344, + "epoch": 0.11428571428571428, + "grad_norm": 0.386100172996521, + "kl": 0.003597259521484375, "learning_rate": 7.75e-07, - "loss": 0.2345, - "num_tokens": 24712667.0, - "reward": 0.9040451645851135, - "reward_std": 0.6532056555151939, - "rewards/cosine_scaled_reward": 0.06660587899386883, - "rewards/format_reward": 0.7708333358168602, + "loss": 0.0811, + "num_tokens": 26356503.0, + "reward": -0.17602870613336563, + "reward_std": 0.5460017845034599, + "rewards/cosine_scaled_reward": -0.08801434934139252, "step": 200 }, { "clip_ratio": 0.0, - "completion_length": 1399.9791870117188, - "epoch": 0.2297142857142857, - "grad_norm": 4.2830705642700195, - "kl": 0.08713531494140625, + "completion_length": 2886.8958740234375, + "epoch": 0.11485714285714285, + "grad_norm": 0.25738444924354553, + "kl": 0.0018377304077148438, "learning_rate": 7.72273839962904e-07, - "loss": 0.1038, - "num_tokens": 24788044.0, - "reward": 0.5020628832280636, - "reward_std": 0.5367235317826271, - "rewards/cosine_scaled_reward": -0.1552185484324582, - "rewards/format_reward": 0.8125, + "loss": -0.0744, + "num_tokens": 26501878.0, + "reward": -0.44855231791734695, + "reward_std": 0.4905274584889412, + "rewards/cosine_scaled_reward": -0.22427614964544773, "step": 201 }, { "clip_ratio": 0.0, - "completion_length": 1097.625015258789, - "epoch": 0.23085714285714284, - "grad_norm": 0.4227047562599182, - "kl": 0.0122528076171875, + "completion_length": 2285.625045776367, + "epoch": 0.11542857142857142, + "grad_norm": 0.32238876819610596, + "kl": 0.0074291229248046875, "learning_rate": 7.695368466124296e-07, - "loss": 0.1053, - "num_tokens": 24848728.0, - "reward": 0.8473174124956131, - "reward_std": 0.7071373090147972, - "rewards/cosine_scaled_reward": -0.02425796026363969, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.0146, + "num_tokens": 26616928.0, + "reward": 0.08226745203137398, + "reward_std": 0.9037965089082718, + "rewards/cosine_scaled_reward": 0.041133725084364414, "step": 202 }, { "clip_ratio": 0.0, - "completion_length": 1515.2292175292969, - "epoch": 0.232, - "grad_norm": 0.41127878427505493, - "kl": 0.01018524169921875, + "completion_length": 3112.8333740234375, + "epoch": 0.116, + "grad_norm": 0.1928633451461792, + "kl": 0.00113677978515625, "learning_rate": 7.667891533457718e-07, - "loss": 0.2108, - "num_tokens": 24929409.0, - "reward": 0.7710594609379768, - "reward_std": 0.7223998121917248, - "rewards/cosine_scaled_reward": 0.010529719293117523, - "rewards/format_reward": 0.7499999925494194, + "loss": -0.026, + "num_tokens": 26772284.0, + "reward": 0.10418719984591007, + "reward_std": 0.6046584714204073, + "rewards/cosine_scaled_reward": 0.052093599922955036, "step": 203 }, { "clip_ratio": 0.0, - "completion_length": 1023.0417175292969, - "epoch": 0.23314285714285715, - "grad_norm": 0.44631218910217285, - "kl": 0.009815216064453125, + "completion_length": 2544.8958740234375, + "epoch": 0.11657142857142858, + "grad_norm": 0.28456971049308777, + "kl": 0.0077342987060546875, "learning_rate": 7.640308940816239e-07, - "loss": 0.225, - "num_tokens": 24986189.0, - "reward": 1.2192229256033897, - "reward_std": 0.7845951393246651, - "rewards/cosine_scaled_reward": 0.16169476276263595, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.0073, + "num_tokens": 26900391.0, + "reward": -0.42413394153118134, + "reward_std": 0.3923846688121557, + "rewards/cosine_scaled_reward": -0.21206695958971977, "step": 204 }, { "clip_ratio": 0.0, - "completion_length": 1646.1875305175781, - "epoch": 0.2342857142857143, - "grad_norm": 0.2805160880088806, - "kl": 0.00618743896484375, + "completion_length": 2439.0208587646484, + "epoch": 0.11714285714285715, + "grad_norm": 0.2515351176261902, + "kl": 0.003063201904296875, "learning_rate": 7.612622032536507e-07, - "loss": 0.1993, - "num_tokens": 25073390.0, - "reward": 0.5499276574701071, - "reward_std": 0.8229625821113586, - "rewards/cosine_scaled_reward": -0.14170285500586033, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0386, + "num_tokens": 27022972.0, + "reward": -0.41302137821912766, + "reward_std": 0.5369090847671032, + "rewards/cosine_scaled_reward": -0.20651067793369293, "step": 205 }, { "clip_ratio": 0.0, - "completion_length": 1825.0209045410156, - "epoch": 0.23542857142857143, - "grad_norm": 0.2909686267375946, - "kl": 0.00579071044921875, + "completion_length": 2899.2500915527344, + "epoch": 0.11771428571428572, + "grad_norm": 0.24134834110736847, + "kl": 0.0021295547485351562, "learning_rate": 7.584832158039378e-07, - "loss": 0.04, - "num_tokens": 25169739.0, - "reward": 0.30743252485990524, - "reward_std": 0.7692845687270164, - "rewards/cosine_scaled_reward": -0.2212837437982671, - "rewards/format_reward": 0.75, + "loss": 0.0364, + "num_tokens": 27167764.0, + "reward": -0.4214736092835665, + "reward_std": 0.4597649797797203, + "rewards/cosine_scaled_reward": -0.21073680510744452, "step": 206 }, { "clip_ratio": 0.0, - "completion_length": 1912.0000915527344, - "epoch": 0.23657142857142857, - "grad_norm": 0.27535000443458557, - "kl": 0.004474639892578125, + "completion_length": 2665.9583587646484, + "epoch": 0.11828571428571429, + "grad_norm": 0.27409258484840393, + "kl": 0.005873680114746094, "learning_rate": 7.556940671764124e-07, - "loss": 0.0423, - "num_tokens": 25269513.0, - "reward": 0.8390000090003014, - "reward_std": 0.8653023391962051, - "rewards/cosine_scaled_reward": 0.02366666356101632, - "rewards/format_reward": 0.7916666567325592, + "loss": 0.0453, + "num_tokens": 27301262.0, + "reward": -0.13723902963101864, + "reward_std": 0.5910945013165474, + "rewards/cosine_scaled_reward": -0.06861951481550932, "step": 207 }, { "clip_ratio": 0.0, - "completion_length": 1695.3333892822266, - "epoch": 0.2377142857142857, - "grad_norm": 0.31623345613479614, - "kl": 0.00894927978515625, + "completion_length": 2065.4583740234375, + "epoch": 0.11885714285714286, + "grad_norm": 0.3946564793586731, + "kl": 0.018975257873535156, "learning_rate": 7.528948933102438e-07, - "loss": 0.0569, - "num_tokens": 25358791.0, - "reward": 0.2730532819405198, - "reward_std": 0.2860656566917896, - "rewards/cosine_scaled_reward": -0.23847334645688534, - "rewards/format_reward": 0.7499999925494194, + "loss": 0.0449, + "num_tokens": 27406500.0, + "reward": 0.5526407100260258, + "reward_std": 0.3575515812262893, + "rewards/cosine_scaled_reward": 0.2763203550130129, "step": 208 }, { "clip_ratio": 0.0, - "completion_length": 900.3750305175781, - "epoch": 0.23885714285714285, - "grad_norm": 0.42924708127975464, - "kl": 0.00959014892578125, + "completion_length": 2564.06254196167, + "epoch": 0.11942857142857143, + "grad_norm": 0.36452898383140564, + "kl": 0.015535354614257812, "learning_rate": 7.500858306332172e-07, - "loss": 0.132, - "num_tokens": 25409989.0, - "reward": 0.6640197485685349, - "reward_std": 0.662564605474472, - "rewards/cosine_scaled_reward": -0.11590681597590446, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.0593, + "num_tokens": 27536463.0, + "reward": 0.1716044805943966, + "reward_std": 0.2996017700061202, + "rewards/cosine_scaled_reward": 0.08580224774777889, "step": 209 }, { "clip_ratio": 0.0, - "completion_length": 1257.7708740234375, - "epoch": 0.24, - "grad_norm": 0.6309426426887512, - "kl": 0.0152740478515625, + "completion_length": 2995.9376220703125, + "epoch": 0.12, + "grad_norm": 0.22107850015163422, + "kl": 0.001888275146484375, "learning_rate": 7.472670160550848e-07, - "loss": 0.2302, - "num_tokens": 25478378.0, - "reward": 0.9711966030299664, - "reward_std": 0.5971670504659414, - "rewards/cosine_scaled_reward": 0.0585149209946394, - "rewards/format_reward": 0.8541666567325592, + "loss": 0.0598, + "num_tokens": 27686388.0, + "reward": -0.5479435250163078, + "reward_std": 0.3200632855296135, + "rewards/cosine_scaled_reward": -0.2739717550575733, "step": 210 }, { "clip_ratio": 0.0, - "completion_length": 2222.7084350585938, - "epoch": 0.24114285714285713, - "grad_norm": 0.26984068751335144, - "kl": 0.0057525634765625, + "completion_length": 2689.4791717529297, + "epoch": 0.12057142857142857, + "grad_norm": 0.2601456940174103, + "kl": 0.005894660949707031, "learning_rate": 7.444385869608921e-07, - "loss": 0.1591, - "num_tokens": 25593546.0, - "reward": 0.15845571644604206, - "reward_std": 0.6340156942605972, - "rewards/cosine_scaled_reward": -0.2124387975782156, - "rewards/format_reward": 0.5833333283662796, + "loss": 0.0078, + "num_tokens": 27821411.0, + "reward": -0.06815922260284424, + "reward_std": 0.5627651736140251, + "rewards/cosine_scaled_reward": -0.03407961130142212, "step": 211 }, { "clip_ratio": 0.0, - "completion_length": 1195.187515258789, - "epoch": 0.2422857142857143, - "grad_norm": 0.37575727701187134, - "kl": 0.0077056884765625, + "completion_length": 3288.8959350585938, + "epoch": 0.12114285714285715, + "grad_norm": 0.25300368666648865, + "kl": 0.0014171600341796875, "learning_rate": 7.416006812042827e-07, - "loss": 0.1568, - "num_tokens": 25658679.0, - "reward": 0.7064365767873824, - "reward_std": 0.46855687722563744, - "rewards/cosine_scaled_reward": -0.07386504206806421, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.0609, + "num_tokens": 27984318.0, + "reward": 0.17400704324245453, + "reward_std": 0.720283254981041, + "rewards/cosine_scaled_reward": 0.08700351975858212, "step": 212 }, { "clip_ratio": 0.0, - "completion_length": 2643.291748046875, - "epoch": 0.24342857142857144, - "grad_norm": 0.26660388708114624, - "kl": 0.0048313140869140625, + "completion_length": 2185.479217529297, + "epoch": 0.12171428571428572, + "grad_norm": 0.28523921966552734, + "kl": 0.0068721771240234375, "learning_rate": 7.387534371007797e-07, - "loss": 0.064, - "num_tokens": 25793729.0, - "reward": -0.02674592100083828, - "reward_std": 0.3547811508178711, - "rewards/cosine_scaled_reward": -0.2112896330654621, - "rewards/format_reward": 0.3958333320915699, + "loss": 0.004, + "num_tokens": 28094729.0, + "reward": 0.2066820189356804, + "reward_std": 0.7146667540073395, + "rewards/cosine_scaled_reward": 0.10334100387990475, "step": 213 }, { "clip_ratio": 0.0, - "completion_length": 1974.1459350585938, - "epoch": 0.24457142857142858, - "grad_norm": 0.275295615196228, - "kl": 0.004917144775390625, + "completion_length": 2242.687545776367, + "epoch": 0.12228571428571429, + "grad_norm": 0.4000585079193115, + "kl": 0.008604049682617188, "learning_rate": 7.358969934210438e-07, - "loss": 0.077, - "num_tokens": 25896684.0, - "reward": 0.5545921013690531, - "reward_std": 0.7209105789661407, - "rewards/cosine_scaled_reward": -0.08728731423616409, - "rewards/format_reward": 0.7291666567325592, + "loss": 0.1593, + "num_tokens": 28208882.0, + "reward": -0.39366581034846604, + "reward_std": 0.4671928398311138, + "rewards/cosine_scaled_reward": -0.19683289778186008, "step": 214 }, { "clip_ratio": 0.0, - "completion_length": 1460.9792175292969, - "epoch": 0.24571428571428572, - "grad_norm": 0.3583175539970398, - "kl": 0.00732421875, + "completion_length": 2578.375030517578, + "epoch": 0.12285714285714286, + "grad_norm": 0.26247313618659973, + "kl": 0.005038261413574219, "learning_rate": 7.330314893841101e-07, - "loss": 0.0764, - "num_tokens": 25974851.0, - "reward": 0.6321092396974564, - "reward_std": 0.7664843499660492, - "rewards/cosine_scaled_reward": -0.12144538667052984, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.1202, + "num_tokens": 28338476.0, + "reward": -0.4542321562767029, + "reward_std": 0.4457564279437065, + "rewards/cosine_scaled_reward": -0.2271160762757063, "step": 215 }, { "clip_ratio": 0.0, - "completion_length": 1925.8958435058594, - "epoch": 0.24685714285714286, - "grad_norm": 0.27199554443359375, - "kl": 0.00620269775390625, + "completion_length": 836.9583511352539, + "epoch": 0.12342857142857143, + "grad_norm": 0.6995976567268372, + "kl": 0.0274505615234375, "learning_rate": 7.301570646506027e-07, - "loss": 0.1422, - "num_tokens": 26075400.0, - "reward": 0.2661210894584656, - "reward_std": 0.5832830742001534, - "rewards/cosine_scaled_reward": -0.22110612504184246, - "rewards/format_reward": 0.708333320915699, + "loss": 0.3925, + "num_tokens": 28384326.0, + "reward": 1.4526022970676422, + "reward_std": 0.5801738128066063, + "rewards/cosine_scaled_reward": 0.7263011261820793, "step": 216 }, { "clip_ratio": 0.0, - "completion_length": 1680.729248046875, - "epoch": 0.248, - "grad_norm": 0.36763909459114075, - "kl": 0.00833892822265625, + "completion_length": 3564.6458740234375, + "epoch": 0.124, + "grad_norm": 0.18853364884853363, + "kl": 0.0008840560913085938, "learning_rate": 7.27273859315928e-07, - "loss": 0.0663, - "num_tokens": 26163863.0, - "reward": 0.5651724338531494, - "reward_std": 0.47575872763991356, - "rewards/cosine_scaled_reward": -0.10283046402037144, - "rewards/format_reward": 0.7708333283662796, + "loss": 0.0079, + "num_tokens": 28562401.0, + "reward": -0.1634646449238062, + "reward_std": 0.4835543856024742, + "rewards/cosine_scaled_reward": -0.0817323224619031, "step": 217 }, { "clip_ratio": 0.0, - "completion_length": 1642.354248046875, - "epoch": 0.24914285714285714, - "grad_norm": 0.3406772017478943, - "kl": 0.006591796875, + "completion_length": 2188.062545776367, + "epoch": 0.12457142857142857, + "grad_norm": 0.3206729292869568, + "kl": 0.007763862609863281, "learning_rate": 7.243820139034464e-07, - "loss": 0.0695, - "num_tokens": 26251186.0, - "reward": 0.955289987847209, - "reward_std": 0.7435199301689863, - "rewards/cosine_scaled_reward": 0.08181164413690567, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0321, + "num_tokens": 28673212.0, + "reward": -0.21583660691976547, + "reward_std": 0.4847491458058357, + "rewards/cosine_scaled_reward": -0.10791829600930214, "step": 218 }, { "clip_ratio": 0.0, - "completion_length": 980.8542175292969, - "epoch": 0.2502857142857143, - "grad_norm": 0.378128319978714, - "kl": 0.00731658935546875, + "completion_length": 1354.5625762939453, + "epoch": 0.12514285714285714, + "grad_norm": 0.7568894624710083, + "kl": 0.0127105712890625, "learning_rate": 7.214816693576234e-07, - "loss": 0.2817, - "num_tokens": 26305803.0, - "reward": 0.768341101706028, - "reward_std": 0.45224541425704956, - "rewards/cosine_scaled_reward": -0.0949961468577385, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.2123, + "num_tokens": 28743739.0, + "reward": 0.5299456119537354, + "reward_std": 0.6226745247840881, + "rewards/cosine_scaled_reward": 0.2649728059768677, "step": 219 }, { "clip_ratio": 0.0, - "completion_length": 1017.9166717529297, - "epoch": 0.25142857142857145, - "grad_norm": 0.3980640172958374, - "kl": 0.0093231201171875, + "completion_length": 2359.729217529297, + "epoch": 0.12571428571428572, + "grad_norm": 0.2767828404903412, + "kl": 0.0012798309326171875, "learning_rate": 7.185729670371604e-07, - "loss": 0.2144, - "num_tokens": 26362757.0, - "reward": 0.5230312906205654, - "reward_std": 0.6670053526759148, - "rewards/cosine_scaled_reward": -0.1968176942318678, - "rewards/format_reward": 0.9166666567325592, + "loss": -0.0269, + "num_tokens": 28863102.0, + "reward": 0.4944605454802513, + "reward_std": 0.5324547663331032, + "rewards/cosine_scaled_reward": 0.24723027274012566, "step": 220 }, { "clip_ratio": 0.0, - "completion_length": 1961.1459350585938, - "epoch": 0.25257142857142856, - "grad_norm": 0.3665303885936737, - "kl": 0.00670623779296875, + "completion_length": 1487.9791717529297, + "epoch": 0.12628571428571428, + "grad_norm": 0.5411548614501953, + "kl": 0.02813243865966797, "learning_rate": 7.156560487081051e-07, - "loss": 0.2889, - "num_tokens": 26465760.0, - "reward": 0.3678629584610462, - "reward_std": 0.5251259803771973, - "rewards/cosine_scaled_reward": -0.11815185844898224, - "rewards/format_reward": 0.6041666641831398, + "loss": 0.1265, + "num_tokens": 28939709.0, + "reward": 0.3169099148362875, + "reward_std": 0.6918673776090145, + "rewards/cosine_scaled_reward": 0.15845494996756315, "step": 221 }, { "clip_ratio": 0.0, - "completion_length": 2460.3334350585938, - "epoch": 0.2537142857142857, - "grad_norm": 0.24931471049785614, - "kl": 0.00376129150390625, + "completion_length": 1893.2708740234375, + "epoch": 0.12685714285714286, + "grad_norm": 0.4862631857395172, + "kl": 0.016809463500976562, "learning_rate": 7.127310565369415e-07, - "loss": 0.257, - "num_tokens": 26592070.0, - "reward": -0.09669354045763612, - "reward_std": 0.49049024283885956, - "rewards/cosine_scaled_reward": -0.30876342952251434, - "rewards/format_reward": 0.5208333283662796, + "loss": 0.1021, + "num_tokens": 29035914.0, + "reward": -0.06954247504472733, + "reward_std": 0.5530165806412697, + "rewards/cosine_scaled_reward": -0.03477124497294426, "step": 222 }, { "clip_ratio": 0.0, - "completion_length": 1533.3333892822266, - "epoch": 0.25485714285714284, - "grad_norm": 0.36577993631362915, - "kl": 0.0063323974609375, + "completion_length": 2748.5625, + "epoch": 0.12742857142857142, + "grad_norm": 0.30448853969573975, + "kl": 0.005116462707519531, "learning_rate": 7.097981330836616e-07, - "loss": 0.1167, - "num_tokens": 26673644.0, - "reward": 0.7940644323825836, - "reward_std": 0.6795159354805946, - "rewards/cosine_scaled_reward": -0.05088444147258997, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.0692, + "num_tokens": 29173857.0, + "reward": -0.2174304649233818, + "reward_std": 0.4413216896355152, + "rewards/cosine_scaled_reward": -0.1087152324616909, "step": 223 }, { "clip_ratio": 0.0, - "completion_length": 2049.979248046875, - "epoch": 0.256, - "grad_norm": 0.28553715348243713, - "kl": 0.00672149658203125, + "completion_length": 3276.4166870117188, + "epoch": 0.128, + "grad_norm": 0.18361811339855194, + "kl": 0.0010099411010742188, "learning_rate": 7.068574212948169e-07, - "loss": 0.0225, - "num_tokens": 26779861.0, - "reward": 0.3970666974782944, - "reward_std": 0.25915637239813805, - "rewards/cosine_scaled_reward": -0.10354999452829361, - "rewards/format_reward": 0.6041666641831398, + "loss": 0.0613, + "num_tokens": 29336993.0, + "reward": 0.06714647263288498, + "reward_std": 0.6574617028236389, + "rewards/cosine_scaled_reward": 0.033573225140571594, "step": 224 }, { "clip_ratio": 0.0, - "completion_length": 1318.2708740234375, - "epoch": 0.2571428571428571, - "grad_norm": 0.35073599219322205, - "kl": 0.00748443603515625, + "completion_length": 2702.625030517578, + "epoch": 0.12857142857142856, + "grad_norm": 0.2830963730812073, + "kl": 0.002536773681640625, "learning_rate": 7.039090644965509e-07, - "loss": 0.1601, - "num_tokens": 26851604.0, - "reward": 1.1711989641189575, - "reward_std": 0.7675672024488449, - "rewards/cosine_scaled_reward": 0.16893283650279045, - "rewards/format_reward": 0.8333333432674408, + "loss": -0.0118, + "num_tokens": 29473259.0, + "reward": -0.10639284551143646, + "reward_std": 0.5328052043914795, + "rewards/cosine_scaled_reward": -0.05319641903042793, "step": 225 }, { "clip_ratio": 0.0, - "completion_length": 1755.7083435058594, - "epoch": 0.2582857142857143, - "grad_norm": 0.448542058467865, - "kl": 0.009063720703125, + "completion_length": 1861.312515258789, + "epoch": 0.12914285714285714, + "grad_norm": 0.35156944394111633, + "kl": 0.015664100646972656, "learning_rate": 7.009532063876148e-07, - "loss": 0.1154, - "num_tokens": 26943744.0, - "reward": 0.5059840455651283, - "reward_std": 0.397355318069458, - "rewards/cosine_scaled_reward": -0.10117465630173683, - "rewards/format_reward": 0.708333333954215, + "loss": 0.0346, + "num_tokens": 29568650.0, + "reward": 0.3703688979148865, + "reward_std": 0.33065274357795715, + "rewards/cosine_scaled_reward": 0.18518445640802383, "step": 226 }, { "clip_ratio": 0.0, - "completion_length": 1148.0416870117188, - "epoch": 0.25942857142857145, - "grad_norm": 0.40192481875419617, - "kl": 0.01007080078125, + "completion_length": 1838.8125305175781, + "epoch": 0.12971428571428573, + "grad_norm": 0.3623930513858795, + "kl": 0.016778945922851562, "learning_rate": 6.979899910323624e-07, - "loss": 0.1993, - "num_tokens": 27006674.0, - "reward": 0.7949583828449249, - "reward_std": 0.5842397212982178, - "rewards/cosine_scaled_reward": -0.07127081975340843, - "rewards/format_reward": 0.9375, + "loss": 0.1119, + "num_tokens": 29662589.0, + "reward": 0.3070980906486511, + "reward_std": 0.5127528607845306, + "rewards/cosine_scaled_reward": 0.15354903042316437, "step": 227 }, { "clip_ratio": 0.0, - "completion_length": 1674.8125305175781, - "epoch": 0.26057142857142856, - "grad_norm": 0.3176382780075073, - "kl": 0.00914764404296875, + "completion_length": 1082.6041717529297, + "epoch": 0.13028571428571428, + "grad_norm": 0.5364865064620972, + "kl": 0.018810272216796875, "learning_rate": 6.950195628537299e-07, - "loss": 0.171, - "num_tokens": 27095561.0, - "reward": 0.23410499235615134, - "reward_std": 0.3607608489692211, - "rewards/cosine_scaled_reward": -0.24753085523843765, - "rewards/format_reward": 0.7291666716337204, + "loss": 0.0448, + "num_tokens": 29719618.0, + "reward": 0.6576689593493938, + "reward_std": 0.6490252837538719, + "rewards/cosine_scaled_reward": 0.3288344731554389, "step": 228 }, { "clip_ratio": 0.0, - "completion_length": 1639.3333740234375, - "epoch": 0.26171428571428573, - "grad_norm": 0.35383015871047974, - "kl": 0.01419830322265625, + "completion_length": 2231.6458740234375, + "epoch": 0.13085714285714287, + "grad_norm": 0.30130934715270996, + "kl": 0.009735107421875, "learning_rate": 6.920420666261961e-07, - "loss": 0.1877, - "num_tokens": 27181737.0, - "reward": 0.4472636952996254, - "reward_std": 0.5543514788150787, - "rewards/cosine_scaled_reward": -0.13053483422845602, - "rewards/format_reward": 0.7083333358168602, + "loss": -0.0144, + "num_tokens": 29833121.0, + "reward": -0.20264260238036513, + "reward_std": 0.4636539947241545, + "rewards/cosine_scaled_reward": -0.10132130864076316, "step": 229 }, { "clip_ratio": 0.0, - "completion_length": 1613.6042022705078, - "epoch": 0.26285714285714284, - "grad_norm": 0.3959270119667053, - "kl": 0.007358551025390625, + "completion_length": 2730.1041870117188, + "epoch": 0.13142857142857142, + "grad_norm": 0.36225757002830505, + "kl": 0.0050067901611328125, "learning_rate": 6.890576474687263e-07, - "loss": 0.1748, - "num_tokens": 27267662.0, - "reward": 0.7049806490540504, - "reward_std": 0.637917771935463, - "rewards/cosine_scaled_reward": -0.04334301874041557, - "rewards/format_reward": 0.7916666716337204, + "loss": -0.1013, + "num_tokens": 29971474.0, + "reward": 0.15790478512644768, + "reward_std": 0.900160625576973, + "rewards/cosine_scaled_reward": 0.07895238231867552, "step": 230 }, { "clip_ratio": 0.0, - "completion_length": 2350.979217529297, - "epoch": 0.264, - "grad_norm": 0.29473647475242615, - "kl": 0.006885528564453125, + "completion_length": 2743.7083740234375, + "epoch": 0.132, + "grad_norm": 0.21701067686080933, + "kl": 0.0045013427734375, "learning_rate": 6.860664508377001e-07, - "loss": 0.1554, - "num_tokens": 27388519.0, - "reward": 0.6861059963703156, - "reward_std": 0.5922371260821819, - "rewards/cosine_scaled_reward": 0.009719666093587875, - "rewards/format_reward": 0.6666666641831398, + "loss": 0.0773, + "num_tokens": 30108548.0, + "reward": -0.1372587690129876, + "reward_std": 0.5728438459336758, + "rewards/cosine_scaled_reward": -0.06862937705591321, "step": 231 }, { "clip_ratio": 0.0, - "completion_length": 1402.4167022705078, - "epoch": 0.2651428571428571, - "grad_norm": 0.354436457157135, - "kl": 0.01007080078125, + "completion_length": 2708.0833740234375, + "epoch": 0.13257142857142856, + "grad_norm": 0.48722752928733826, + "kl": 0.011949539184570312, "learning_rate": 6.83068622519821e-07, - "loss": 0.1694, - "num_tokens": 27464409.0, - "reward": 0.5453539118170738, - "reward_std": 0.7636468224227428, - "rewards/cosine_scaled_reward": -0.13357303908560425, - "rewards/format_reward": 0.8125, + "loss": -0.0551, + "num_tokens": 30244260.0, + "reward": 0.018355626612901688, + "reward_std": 0.46119677275419235, + "rewards/cosine_scaled_reward": 0.009177813306450844, "step": 232 }, { "clip_ratio": 0.0, - "completion_length": 1941.7291870117188, - "epoch": 0.2662857142857143, - "grad_norm": 0.36628639698028564, - "kl": 0.0104217529296875, + "completion_length": 3555.2708740234375, + "epoch": 0.13314285714285715, + "grad_norm": 0.1710912138223648, + "kl": 0.0009517669677734375, "learning_rate": 6.800643086250121e-07, - "loss": 0.132, - "num_tokens": 27565634.0, - "reward": 0.8026084899902344, - "reward_std": 0.9171203821897507, - "rewards/cosine_scaled_reward": 0.015887574292719364, - "rewards/format_reward": 0.7708333283662796, + "loss": 0.0049, + "num_tokens": 30421249.0, + "reward": -0.30245864391326904, + "reward_std": 0.2900640666484833, + "rewards/cosine_scaled_reward": -0.15122931078076363, "step": 233 }, { "clip_ratio": 0.0, - "completion_length": 1735.4583740234375, - "epoch": 0.2674285714285714, - "grad_norm": 0.2988748848438263, - "kl": 0.00730133056640625, + "completion_length": 3123.9375610351562, + "epoch": 0.1337142857142857, + "grad_norm": 0.19327852129936218, + "kl": 0.00162506103515625, "learning_rate": 6.770536555792944e-07, - "loss": 0.2224, - "num_tokens": 27657318.0, - "reward": 0.6515710987150669, - "reward_std": 0.6439172849059105, - "rewards/cosine_scaled_reward": -0.07004780881106853, - "rewards/format_reward": 0.7916666567325592, + "loss": 0.0671, + "num_tokens": 30578182.0, + "reward": -0.25677131395787, + "reward_std": 0.6208681277930737, + "rewards/cosine_scaled_reward": -0.128385656978935, "step": 234 }, { "clip_ratio": 0.0, - "completion_length": 1488.208366394043, - "epoch": 0.26857142857142857, - "grad_norm": 0.3993307948112488, - "kl": 0.01326751708984375, + "completion_length": 1931.9792175292969, + "epoch": 0.13428571428571429, + "grad_norm": 0.46432214975357056, + "kl": 0.024862289428710938, "learning_rate": 6.740368101176495e-07, - "loss": 0.1033, - "num_tokens": 27735910.0, - "reward": 0.8183648958802223, - "reward_std": 0.5113412290811539, - "rewards/cosine_scaled_reward": 0.03418244048953056, - "rewards/format_reward": 0.75, + "loss": 0.0701, + "num_tokens": 30677037.0, + "reward": 0.25881527364254, + "reward_std": 0.3069061152637005, + "rewards/cosine_scaled_reward": 0.12940763682127, "step": 235 }, { "clip_ratio": 0.0, - "completion_length": 1141.8333587646484, - "epoch": 0.26971428571428574, - "grad_norm": 0.3268232047557831, - "kl": 0.00972747802734375, + "completion_length": 1024.5208740234375, + "epoch": 0.13485714285714287, + "grad_norm": 0.551213264465332, + "kl": 0.0158843994140625, "learning_rate": 6.710139192768694e-07, - "loss": 0.209, - "num_tokens": 27798722.0, - "reward": 0.45677755028009415, - "reward_std": 0.5210263505578041, - "rewards/cosine_scaled_reward": -0.22994457185268402, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.2681, + "num_tokens": 30732298.0, + "reward": 0.17941563576459885, + "reward_std": 0.547722615301609, + "rewards/cosine_scaled_reward": 0.08970782160758972, "step": 236 }, { "clip_ratio": 0.0, - "completion_length": 1664.7500305175781, - "epoch": 0.27085714285714285, - "grad_norm": 0.33243250846862793, - "kl": 0.0086517333984375, + "completion_length": 1741.9791870117188, + "epoch": 0.13542857142857143, + "grad_norm": 0.5294864773750305, + "kl": 0.012974739074707031, "learning_rate": 6.679851303883891e-07, - "loss": 0.078, - "num_tokens": 27887030.0, - "reward": 0.7787259165197611, - "reward_std": 0.5983692929148674, - "rewards/cosine_scaled_reward": -0.01688704453408718, - "rewards/format_reward": 0.8125, + "loss": 0.35, + "num_tokens": 30821337.0, + "reward": -0.2729988917708397, + "reward_std": 0.39208219945430756, + "rewards/cosine_scaled_reward": -0.13649944216012955, "step": 237 }, { "clip_ratio": 0.0, - "completion_length": 1015.4167175292969, - "epoch": 0.272, - "grad_norm": 0.4665866792201996, - "kl": 0.0143890380859375, + "completion_length": 2661.2708740234375, + "epoch": 0.136, + "grad_norm": 0.26540040969848633, + "kl": 0.008977890014648438, "learning_rate": 6.649505910711058e-07, - "loss": 0.1004, - "num_tokens": 27944128.0, - "reward": 0.5497091813012958, - "reward_std": 0.5992827862501144, - "rewards/cosine_scaled_reward": -0.18347875276231207, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.1287, + "num_tokens": 30955702.0, + "reward": -0.24963749200105667, + "reward_std": 0.5366819277405739, + "rewards/cosine_scaled_reward": -0.12481874227523804, "step": 238 }, { "clip_ratio": 0.0, - "completion_length": 1973.1458740234375, - "epoch": 0.27314285714285713, - "grad_norm": 0.25983497500419617, - "kl": 0.007293701171875, + "completion_length": 2663.8958740234375, + "epoch": 0.13657142857142857, + "grad_norm": 0.5480060577392578, + "kl": 0.012514114379882812, "learning_rate": 6.619104492241847e-07, - "loss": 0.1291, - "num_tokens": 28047701.0, - "reward": 0.27186793461441994, - "reward_std": 0.6742007434368134, - "rewards/cosine_scaled_reward": -0.21823271550238132, - "rewards/format_reward": 0.708333320915699, + "loss": 0.1785, + "num_tokens": 31089365.0, + "reward": -0.02775234915316105, + "reward_std": 0.7676347196102142, + "rewards/cosine_scaled_reward": -0.013876182027161121, "step": 239 }, { "clip_ratio": 0.0, - "completion_length": 1144.4166870117188, - "epoch": 0.2742857142857143, - "grad_norm": 0.3183096945285797, - "kl": 0.0117340087890625, + "completion_length": 2176.229248046875, + "epoch": 0.13714285714285715, + "grad_norm": 0.28263622522354126, + "kl": 0.003826141357421875, "learning_rate": 6.588648530198504e-07, - "loss": 0.1353, - "num_tokens": 28110835.0, - "reward": 1.299223653972149, - "reward_std": 0.5905436016619205, - "rewards/cosine_scaled_reward": 0.17044511064887047, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0034, + "num_tokens": 31199812.0, + "reward": -0.4636247009038925, + "reward_std": 0.3770550861954689, + "rewards/cosine_scaled_reward": -0.23181235045194626, "step": 240 }, { "clip_ratio": 0.0, - "completion_length": 1381.416732788086, - "epoch": 0.2754285714285714, - "grad_norm": 0.4551909565925598, - "kl": 0.014190673828125, + "completion_length": 1963.645839691162, + "epoch": 0.1377142857142857, + "grad_norm": 0.5095245838165283, + "kl": 0.02035999298095703, "learning_rate": 6.558139508961654e-07, - "loss": 0.0289, - "num_tokens": 28185639.0, - "reward": 0.633112620562315, - "reward_std": 0.5124102905392647, - "rewards/cosine_scaled_reward": -0.08969368692487478, - "rewards/format_reward": 0.8125, + "loss": 0.0311, + "num_tokens": 31299575.0, + "reward": 0.524737037718296, + "reward_std": 0.8398763090372086, + "rewards/cosine_scaled_reward": 0.2623685207217932, "step": 241 }, { "clip_ratio": 0.0, - "completion_length": 1745.875015258789, - "epoch": 0.2765714285714286, - "grad_norm": 0.46845531463623047, - "kl": 0.00977325439453125, + "completion_length": 3584.0, + "epoch": 0.1382857142857143, + "grad_norm": 0.21239568293094635, + "kl": 0.0011444091796875, "learning_rate": 6.527578915497951e-07, - "loss": 0.354, - "num_tokens": 28277157.0, - "reward": 0.6752857603132725, - "reward_std": 0.7641724795103073, - "rewards/cosine_scaled_reward": -0.047773787286132574, - "rewards/format_reward": 0.7708333432674408, + "loss": 0.0, + "num_tokens": 31477739.0, + "reward": -0.3269985783845186, + "reward_std": 0.2089610155671835, + "rewards/cosine_scaled_reward": -0.1634992891922593, "step": 242 }, { "clip_ratio": 0.0, - "completion_length": 1423.1875610351562, - "epoch": 0.2777142857142857, - "grad_norm": 0.3185717463493347, - "kl": 0.0092926025390625, + "completion_length": 2510.937530517578, + "epoch": 0.13885714285714285, + "grad_norm": 0.32022956013679504, + "kl": 0.00478363037109375, "learning_rate": 6.496968239287603e-07, - "loss": 0.2263, - "num_tokens": 28352916.0, - "reward": 0.8118701353669167, - "reward_std": 0.5183029696345329, - "rewards/cosine_scaled_reward": -0.01073160395026207, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0972, + "num_tokens": 31603616.0, + "reward": -0.44686760660260916, + "reward_std": 0.3707558251917362, + "rewards/cosine_scaled_reward": -0.22343379561789334, "step": 243 }, { "clip_ratio": 0.0, - "completion_length": 1071.0417175292969, - "epoch": 0.27885714285714286, - "grad_norm": 4.240310192108154, - "kl": 0.1209869384765625, + "completion_length": 2716.604248046875, + "epoch": 0.13942857142857143, + "grad_norm": 0.3231089115142822, + "kl": 0.0021686553955078125, "learning_rate": 6.466308972251785e-07, - "loss": 0.2218, - "num_tokens": 28412534.0, - "reward": 1.1517661362886429, - "reward_std": 0.9977184236049652, - "rewards/cosine_scaled_reward": 0.12796638230793178, - "rewards/format_reward": 0.8958333432674408, + "loss": -0.0145, + "num_tokens": 31740217.0, + "reward": -0.5142710842192173, + "reward_std": 0.6101813912391663, + "rewards/cosine_scaled_reward": -0.25713553559035063, "step": 244 }, { "clip_ratio": 0.0, - "completion_length": 1106.6042022705078, - "epoch": 0.28, - "grad_norm": 0.5300337076187134, - "kl": 0.0133056640625, + "completion_length": 2832.625030517578, + "epoch": 0.14, + "grad_norm": 0.26809558272361755, + "kl": 0.0017642974853515625, "learning_rate": 6.435602608679916e-07, - "loss": 0.1105, - "num_tokens": 28472587.0, - "reward": 0.7246537022292614, - "reward_std": 0.5131946355104446, - "rewards/cosine_scaled_reward": -0.08558984659612179, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.0903, + "num_tokens": 31881463.0, + "reward": -0.1661408469080925, + "reward_std": 0.7787005566060543, + "rewards/cosine_scaled_reward": -0.08307042345404625, "step": 245 }, { "clip_ratio": 0.0, - "completion_length": 1154.0833740234375, - "epoch": 0.28114285714285714, - "grad_norm": 0.45345908403396606, - "kl": 0.0103302001953125, + "completion_length": 2692.8541717529297, + "epoch": 0.14057142857142857, + "grad_norm": 0.2669066786766052, + "kl": 0.009328842163085938, "learning_rate": 6.404850645156841e-07, - "loss": 0.183, - "num_tokens": 28536143.0, - "reward": 0.912123791873455, - "reward_std": 0.47968992590904236, - "rewards/cosine_scaled_reward": -0.023104790598154068, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0709, + "num_tokens": 32016624.0, + "reward": -0.19150156527757645, + "reward_std": 0.509680725634098, + "rewards/cosine_scaled_reward": -0.09575077798217535, "step": 246 }, { "clip_ratio": 0.0, - "completion_length": 768.0625228881836, - "epoch": 0.2822857142857143, - "grad_norm": 0.527489423751831, - "kl": 0.018798828125, + "completion_length": 2888.9375, + "epoch": 0.14114285714285715, + "grad_norm": 0.217579185962677, + "kl": 0.0030298233032226562, "learning_rate": 6.374054580489873e-07, - "loss": 0.0279, - "num_tokens": 28580654.0, - "reward": 1.2265305668115616, - "reward_std": 0.6787078976631165, - "rewards/cosine_scaled_reward": 0.1340985968708992, - "rewards/format_reward": 0.9583333432674408, + "loss": -0.1061, + "num_tokens": 32162217.0, + "reward": 0.31709786131978035, + "reward_std": 0.7779325805604458, + "rewards/cosine_scaled_reward": 0.15854893065989017, "step": 247 }, { "clip_ratio": 0.0, - "completion_length": 1808.3333358764648, - "epoch": 0.2834285714285714, - "grad_norm": 0.4756445288658142, - "kl": 0.011627197265625, + "completion_length": 1323.1041870117188, + "epoch": 0.1417142857142857, + "grad_norm": 0.6014187335968018, + "kl": 0.03565692901611328, "learning_rate": 6.343215915635761e-07, - "loss": 0.1571, - "num_tokens": 28675230.0, - "reward": 0.4981997571885586, - "reward_std": 0.6461528465151787, - "rewards/cosine_scaled_reward": -0.08423344511538744, - "rewards/format_reward": 0.6666666828095913, + "loss": 0.0756, + "num_tokens": 32231006.0, + "reward": 0.6246988326311111, + "reward_std": 0.35837205685675144, + "rewards/cosine_scaled_reward": 0.3123493976891041, "step": 248 }, { "clip_ratio": 0.0, - "completion_length": 1773.5000762939453, - "epoch": 0.2845714285714286, - "grad_norm": 0.31437787413597107, - "kl": 0.007415771484375, + "completion_length": 2613.6458740234375, + "epoch": 0.1422857142857143, + "grad_norm": 0.29411664605140686, + "kl": 0.0029964447021484375, "learning_rate": 6.31233615362752e-07, - "loss": 0.1652, - "num_tokens": 28769256.0, - "reward": 0.6388058252632618, - "reward_std": 0.7867574747651815, - "rewards/cosine_scaled_reward": -0.03476376552134752, - "rewards/format_reward": 0.708333333954215, + "loss": 0.2094, + "num_tokens": 32362737.0, + "reward": -0.4900950863957405, + "reward_std": 0.5070604905486107, + "rewards/cosine_scaled_reward": -0.24504754319787025, "step": 249 }, { "clip_ratio": 0.0, - "completion_length": 931.2083587646484, - "epoch": 0.2857142857142857, - "grad_norm": 0.41349363327026367, - "kl": 0.0152130126953125, + "completion_length": 2731.5208740234375, + "epoch": 0.14285714285714285, + "grad_norm": 0.35469648241996765, + "kl": 0.017116546630859375, "learning_rate": 6.281416799501187e-07, - "loss": 0.0753, - "num_tokens": 28821742.0, - "reward": 0.7699981704354286, - "reward_std": 0.718483492732048, - "rewards/cosine_scaled_reward": -0.09416760504245758, - "rewards/format_reward": 0.9583333283662796, + "loss": -0.0102, + "num_tokens": 32499598.0, + "reward": -0.21433956921100616, + "reward_std": 0.520557913929224, + "rewards/cosine_scaled_reward": -0.10716977342963219, "step": 250 }, { "clip_ratio": 0.0, - "completion_length": 2199.6876220703125, - "epoch": 0.28685714285714287, - "grad_norm": 0.2921055555343628, - "kl": 0.0113372802734375, + "completion_length": 1934.125015258789, + "epoch": 0.14342857142857143, + "grad_norm": 0.3160950839519501, + "kl": 0.009817123413085938, "learning_rate": 6.25045936022246e-07, - "loss": 0.0523, - "num_tokens": 28935739.0, - "reward": 0.542440053075552, - "reward_std": 0.9616198837757111, - "rewards/cosine_scaled_reward": -0.041279987432062626, - "rewards/format_reward": 0.6250000149011612, + "loss": 0.127, + "num_tokens": 32598400.0, + "reward": -0.07275501638650894, + "reward_std": 0.4598498921841383, + "rewards/cosine_scaled_reward": -0.03637750819325447, "step": 251 }, { "clip_ratio": 0.0, - "completion_length": 1409.8542022705078, - "epoch": 0.288, - "grad_norm": 0.34936532378196716, - "kl": 0.011505126953125, + "completion_length": 2064.5834045410156, + "epoch": 0.144, + "grad_norm": 0.3414526581764221, + "kl": 0.00769805908203125, "learning_rate": 6.219465344613258e-07, - "loss": 0.123, - "num_tokens": 29011548.0, - "reward": 0.8992012739181519, - "reward_std": 0.5171293690800667, - "rewards/cosine_scaled_reward": -0.00873270258307457, - "rewards/format_reward": 0.9166666567325592, + "loss": 0.0573, + "num_tokens": 32703140.0, + "reward": -0.3573624864220619, + "reward_std": 0.4409428536891937, + "rewards/cosine_scaled_reward": -0.17868124693632126, "step": 252 }, { "clip_ratio": 0.0, - "completion_length": 1621.0417022705078, - "epoch": 0.28914285714285715, - "grad_norm": 0.3845634460449219, - "kl": 0.0136260986328125, + "completion_length": 2815.083335876465, + "epoch": 0.14457142857142857, + "grad_norm": 0.3612518012523651, + "kl": 0.007879257202148438, "learning_rate": 6.188436263278172e-07, - "loss": 0.2043, - "num_tokens": 29097182.0, - "reward": 0.47796724177896976, - "reward_std": 0.4307953789830208, - "rewards/cosine_scaled_reward": -0.11518305912613869, - "rewards/format_reward": 0.7083333432674408, + "loss": -0.0483, + "num_tokens": 32843784.0, + "reward": -0.22203051671385765, + "reward_std": 0.31091106310486794, + "rewards/cosine_scaled_reward": -0.11101526208221912, "step": 253 }, { "clip_ratio": 0.0, - "completion_length": 1132.7917175292969, - "epoch": 0.29028571428571426, - "grad_norm": 0.3748434782028198, - "kl": 0.0124969482421875, + "completion_length": 2616.479248046875, + "epoch": 0.14514285714285713, + "grad_norm": 0.3855963945388794, + "kl": 0.011937141418457031, "learning_rate": 6.157373628530852e-07, - "loss": 0.1782, - "num_tokens": 29158930.0, - "reward": 1.4391566216945648, - "reward_std": 0.6073151230812073, - "rewards/cosine_scaled_reward": 0.2924949713051319, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.1246, + "num_tokens": 32975375.0, + "reward": 0.18154123798012733, + "reward_std": 0.5934804044663906, + "rewards/cosine_scaled_reward": 0.09077062457799911, "step": 254 }, { "clip_ratio": 0.0, - "completion_length": 1166.7500610351562, - "epoch": 0.2914285714285714, - "grad_norm": 0.3888883888721466, - "kl": 0.013397216796875, + "completion_length": 2412.333335876465, + "epoch": 0.1457142857142857, + "grad_norm": 0.34006571769714355, + "kl": 0.01013946533203125, "learning_rate": 6.126278954320294e-07, - "loss": 0.1089, - "num_tokens": 29223376.0, - "reward": 0.8761316128075123, - "reward_std": 0.4091811142861843, - "rewards/cosine_scaled_reward": 0.010982461273670197, - "rewards/format_reward": 0.8541666716337204, + "loss": -0.0014, + "num_tokens": 33097635.0, + "reward": 0.23491641879081726, + "reward_std": 0.5346547961235046, + "rewards/cosine_scaled_reward": 0.11745821312069893, "step": 255 }, { "clip_ratio": 0.0, - "completion_length": 863.0833587646484, - "epoch": 0.2925714285714286, - "grad_norm": 0.37044182419776917, - "kl": 0.0126190185546875, + "completion_length": 3411.3541870117188, + "epoch": 0.1462857142857143, + "grad_norm": 0.19116432964801788, + "kl": 0.0012416839599609375, "learning_rate": 6.095153756157051e-07, - "loss": 0.0904, - "num_tokens": 29273426.0, - "reward": 0.9074563533067703, - "reward_std": 0.49417800083756447, - "rewards/cosine_scaled_reward": -0.03585517778992653, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0547, + "num_tokens": 33267932.0, + "reward": -0.4405994936823845, + "reward_std": 0.43207642808556557, + "rewards/cosine_scaled_reward": -0.22029974684119225, "step": 256 }, { "clip_ratio": 0.0, - "completion_length": 1247.6875305175781, - "epoch": 0.2937142857142857, - "grad_norm": 0.2904569208621979, - "kl": 0.011871337890625, + "completion_length": 2541.4583435058594, + "epoch": 0.14685714285714285, + "grad_norm": 0.3120291233062744, + "kl": 0.0090484619140625, "learning_rate": 6.06399955103937e-07, - "loss": 0.2652, - "num_tokens": 29341499.0, - "reward": 0.43778856843709946, - "reward_std": 0.5745215713977814, - "rewards/cosine_scaled_reward": -0.22902238368988037, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.0088, + "num_tokens": 33395922.0, + "reward": -0.40020735282450914, + "reward_std": 0.6490463241934776, + "rewards/cosine_scaled_reward": -0.20010367268696427, "step": 257 }, { "clip_ratio": 0.0, - "completion_length": 1685.5833435058594, - "epoch": 0.2948571428571429, - "grad_norm": 0.366291880607605, - "kl": 0.013092041015625, + "completion_length": 1818.3958587646484, + "epoch": 0.14742857142857144, + "grad_norm": 0.6874107122421265, + "kl": 0.04048919677734375, "learning_rate": 6.032817857379256e-07, - "loss": 0.3003, - "num_tokens": 29430639.0, - "reward": 0.5727702639997005, - "reward_std": 0.6911112070083618, - "rewards/cosine_scaled_reward": -0.06778154894709587, - "rewards/format_reward": 0.7083333432674408, + "loss": 0.1263, + "num_tokens": 33488953.0, + "reward": 0.3148918077349663, + "reward_std": 0.6025057537481189, + "rewards/cosine_scaled_reward": 0.15744590386748314, "step": 258 }, { "clip_ratio": 0.0, - "completion_length": 1004.8750457763672, - "epoch": 0.296, - "grad_norm": 0.4196613132953644, - "kl": 0.012115478515625, + "completion_length": 2497.2916870117188, + "epoch": 0.148, + "grad_norm": 0.3090062141418457, + "kl": 0.0051898956298828125, "learning_rate": 6.001610194928464e-07, - "loss": 0.1882, - "num_tokens": 29486475.0, - "reward": 0.6432907655835152, - "reward_std": 0.41963067930191755, - "rewards/cosine_scaled_reward": -0.12627129815518856, - "rewards/format_reward": 0.8958333432674408, + "loss": -0.0475, + "num_tokens": 33614655.0, + "reward": -0.1998734101653099, + "reward_std": 0.5551594458520412, + "rewards/cosine_scaled_reward": -0.09993669763207436, "step": 259 }, { "clip_ratio": 0.0, - "completion_length": 1489.8333740234375, - "epoch": 0.29714285714285715, - "grad_norm": 0.34775781631469727, - "kl": 0.0118255615234375, + "completion_length": 3108.2291870117188, + "epoch": 0.14857142857142858, + "grad_norm": 0.20045077800750732, + "kl": 0.00176239013671875, "learning_rate": 5.97037808470444e-07, - "loss": 0.1441, - "num_tokens": 29566183.0, - "reward": 0.4704531617462635, - "reward_std": 0.6600965559482574, - "rewards/cosine_scaled_reward": -0.191856749355793, - "rewards/format_reward": 0.8541666716337204, + "loss": -0.095, + "num_tokens": 33770126.0, + "reward": -0.24349116533994675, + "reward_std": 0.4448055624961853, + "rewards/cosine_scaled_reward": -0.12174558266997337, "step": 260 }, { "clip_ratio": 0.0, - "completion_length": 1136.9792022705078, - "epoch": 0.29828571428571427, - "grad_norm": 0.34792360663414, - "kl": 0.01123046875, + "completion_length": 3407.041748046875, + "epoch": 0.14914285714285713, + "grad_norm": 0.20637670159339905, + "kl": 0.0013484954833984375, "learning_rate": 5.939123048916173e-07, - "loss": 0.2938, - "num_tokens": 29628600.0, - "reward": 0.5166485756635666, - "reward_std": 0.40996433794498444, - "rewards/cosine_scaled_reward": -0.20000904146581888, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0355, + "num_tokens": 33939832.0, + "reward": -0.44314368814229965, + "reward_std": 0.4703112803399563, + "rewards/cosine_scaled_reward": -0.22157184407114983, "step": 261 }, { "clip_ratio": 0.0, - "completion_length": 1239.2917175292969, - "epoch": 0.29942857142857143, - "grad_norm": 0.3884727358818054, - "kl": 0.017578125, + "completion_length": 2691.125045776367, + "epoch": 0.14971428571428572, + "grad_norm": 0.2881263196468353, + "kl": 0.00722503662109375, "learning_rate": 5.907846610890011e-07, - "loss": 0.0502, - "num_tokens": 29696036.0, - "reward": 0.5863418951630592, - "reward_std": 0.3798971250653267, - "rewards/cosine_scaled_reward": -0.14432908222079277, - "rewards/format_reward": 0.875, + "loss": 0.0162, + "num_tokens": 34074154.0, + "reward": 0.07448863238096237, + "reward_std": 0.6951295547187328, + "rewards/cosine_scaled_reward": 0.03724431432783604, "step": 262 }, { "clip_ratio": 0.0, - "completion_length": 709.4375152587891, - "epoch": 0.30057142857142854, - "grad_norm": 0.4851728677749634, - "kl": 0.0172271728515625, + "completion_length": 2771.6041717529297, + "epoch": 0.15028571428571427, + "grad_norm": 0.35452720522880554, + "kl": 0.0109100341796875, "learning_rate": 5.87655029499542e-07, - "loss": 0.173, - "num_tokens": 29738195.0, - "reward": 0.806241512298584, - "reward_std": 0.5233523100614548, - "rewards/cosine_scaled_reward": -0.08646258153021336, - "rewards/format_reward": 0.9791666716337204, + "loss": -0.012, + "num_tokens": 34213515.0, + "reward": -0.10311572067439556, + "reward_std": 0.3219672627747059, + "rewards/cosine_scaled_reward": -0.05155786033719778, "step": 263 }, { "clip_ratio": 0.0, - "completion_length": 1195.7916870117188, - "epoch": 0.3017142857142857, - "grad_norm": 0.3379260301589966, - "kl": 0.00963592529296875, + "completion_length": 3548.666748046875, + "epoch": 0.15085714285714286, + "grad_norm": 0.20419248938560486, + "kl": 0.0011997222900390625, "learning_rate": 5.845235626570683e-07, - "loss": 0.2216, - "num_tokens": 29803975.0, - "reward": 0.7059077769517899, - "reward_std": 0.5995725318789482, - "rewards/cosine_scaled_reward": -0.1262128073722124, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0187, + "num_tokens": 34389839.0, + "reward": -0.3663049042224884, + "reward_std": 0.6324282512068748, + "rewards/cosine_scaled_reward": -0.1831524483859539, "step": 264 }, { "clip_ratio": 0.0, - "completion_length": 1640.9583740234375, - "epoch": 0.3028571428571429, - "grad_norm": 0.3161636292934418, - "kl": 0.01184844970703125, + "completion_length": 3260.1875, + "epoch": 0.15142857142857144, + "grad_norm": 0.23959365487098694, + "kl": 0.0013904571533203125, "learning_rate": 5.813904131848564e-07, - "loss": 0.2132, - "num_tokens": 29890727.0, - "reward": 0.43143167346715927, - "reward_std": 0.5859523415565491, - "rewards/cosine_scaled_reward": -0.12803417071700096, - "rewards/format_reward": 0.6875, + "loss": -0.0252, + "num_tokens": 34552136.0, + "reward": -0.2764420807361603, + "reward_std": 0.49818455800414085, + "rewards/cosine_scaled_reward": -0.13822103291749954, "step": 265 }, { "clip_ratio": 0.0, - "completion_length": 1334.6666870117188, - "epoch": 0.304, - "grad_norm": 3.1876046657562256, - "kl": 0.0460357666015625, + "completion_length": 2710.437530517578, + "epoch": 0.152, + "grad_norm": 0.32057079672813416, + "kl": 0.0076694488525390625, "learning_rate": 5.78255733788191e-07, - "loss": 0.2019, - "num_tokens": 29962909.0, - "reward": 0.8172680884599686, - "reward_std": 0.3814220707863569, - "rewards/cosine_scaled_reward": -0.008032636949792504, - "rewards/format_reward": 0.8333333283662796, + "loss": -0.0256, + "num_tokens": 34688285.0, + "reward": -0.3459478300064802, + "reward_std": 0.25663014128804207, + "rewards/cosine_scaled_reward": -0.17297391314059496, "step": 266 }, { "clip_ratio": 0.0, - "completion_length": 1512.7500305175781, - "epoch": 0.30514285714285716, - "grad_norm": 0.33352527022361755, - "kl": 0.013641357421875, + "completion_length": 1547.208396911621, + "epoch": 0.15257142857142858, + "grad_norm": 0.4645328223705292, + "kl": 0.012868881225585938, "learning_rate": 5.751196772469237e-07, - "loss": 0.1679, - "num_tokens": 30043741.0, - "reward": 0.7953452169895172, - "reward_std": 0.5917237177491188, - "rewards/cosine_scaled_reward": 0.0018392521888017654, - "rewards/format_reward": 0.7916666865348816, + "loss": 0.113, + "num_tokens": 34767999.0, + "reward": -0.11789099872112274, + "reward_std": 0.5525611527264118, + "rewards/cosine_scaled_reward": -0.05894550122320652, "step": 267 }, { "clip_ratio": 0.0, - "completion_length": 1813.7709045410156, - "epoch": 0.3062857142857143, - "grad_norm": 0.3294363021850586, - "kl": 0.0094451904296875, + "completion_length": 2182.375, + "epoch": 0.15314285714285714, + "grad_norm": 0.3756212294101715, + "kl": 0.010714530944824219, "learning_rate": 5.71982396408026e-07, - "loss": 0.2892, - "num_tokens": 30138998.0, - "reward": 0.4046551361680031, - "reward_std": 0.4766751229763031, - "rewards/cosine_scaled_reward": -0.16225576400756836, - "rewards/format_reward": 0.7291666641831398, + "loss": 0.1363, + "num_tokens": 34878669.0, + "reward": -0.077213354408741, + "reward_std": 0.57977394759655, + "rewards/cosine_scaled_reward": -0.0386066734790802, "step": 268 }, { "clip_ratio": 0.0, - "completion_length": 905.3750152587891, - "epoch": 0.30742857142857144, - "grad_norm": 0.43302249908447266, - "kl": 0.01812744140625, + "completion_length": 3128.6251220703125, + "epoch": 0.15371428571428572, + "grad_norm": 0.1624217927455902, + "kl": 0.0017490386962890625, "learning_rate": 5.688440441781398e-07, - "loss": 0.3599, - "num_tokens": 30190574.0, - "reward": 0.7200521975755692, - "reward_std": 0.552587129175663, - "rewards/cosine_scaled_reward": -0.10872390307486057, - "rewards/format_reward": 0.9375000149011612, + "loss": -0.0542, + "num_tokens": 35034243.0, + "reward": -0.375546395778656, + "reward_std": 0.2080207783728838, + "rewards/cosine_scaled_reward": -0.18777319695800543, "step": 269 }, { "clip_ratio": 0.0, - "completion_length": 885.4583435058594, - "epoch": 0.30857142857142855, - "grad_norm": 0.470708429813385, - "kl": 0.0196990966796875, + "completion_length": 2074.270839691162, + "epoch": 0.15428571428571428, + "grad_norm": 0.48845088481903076, + "kl": 0.02855682373046875, "learning_rate": 5.657047735161255e-07, - "loss": 0.0815, - "num_tokens": 30240552.0, - "reward": 1.292167842388153, - "reward_std": 0.6577414199709892, - "rewards/cosine_scaled_reward": 0.15650059608742595, - "rewards/format_reward": 0.9791666716337204, + "loss": -0.1012, + "num_tokens": 35139796.0, + "reward": 0.26241855323314667, + "reward_std": 0.4325704537332058, + "rewards/cosine_scaled_reward": 0.13120926916599274, "step": 270 }, { "clip_ratio": 0.0, - "completion_length": 1603.4792175292969, - "epoch": 0.3097142857142857, - "grad_norm": 0.3854743242263794, - "kl": 0.014801025390625, + "completion_length": 2039.5416870117188, + "epoch": 0.15485714285714286, + "grad_norm": 0.27519840002059937, + "kl": 0.004604339599609375, "learning_rate": 5.625647374256061e-07, - "loss": 0.087, - "num_tokens": 30325325.0, - "reward": 0.9050909653306007, - "reward_std": 0.64243184030056, - "rewards/cosine_scaled_reward": 0.04629545658826828, - "rewards/format_reward": 0.8125, + "loss": -0.1227, + "num_tokens": 35244654.0, + "reward": -0.13348775170743465, + "reward_std": 0.5455809384584427, + "rewards/cosine_scaled_reward": -0.06674387538805604, "step": 271 }, { "clip_ratio": 0.0, - "completion_length": 1576.1458740234375, - "epoch": 0.31085714285714283, - "grad_norm": 0.391513854265213, - "kl": 0.01590728759765625, + "completion_length": 2399.0625915527344, + "epoch": 0.15542857142857142, + "grad_norm": 0.3173102140426636, + "kl": 0.0048809051513671875, "learning_rate": 5.594240889475106e-07, - "loss": 0.1901, - "num_tokens": 30409218.0, - "reward": 0.4028028752654791, - "reward_std": 0.659964844584465, - "rewards/cosine_scaled_reward": -0.20484856329858303, - "rewards/format_reward": 0.8125, + "loss": 0.1177, + "num_tokens": 35365593.0, + "reward": -0.08320139348506927, + "reward_std": 0.7119211666285992, + "rewards/cosine_scaled_reward": -0.04160069301724434, "step": 272 }, { "clip_ratio": 0.0, - "completion_length": 1670.6875610351562, - "epoch": 0.312, - "grad_norm": 0.6100649833679199, - "kl": 0.013885498046875, + "completion_length": 3164.75, + "epoch": 0.156, + "grad_norm": 0.22468267381191254, + "kl": 0.001804351806640625, "learning_rate": 5.562829811526154e-07, - "loss": 0.2061, - "num_tokens": 30497661.0, - "reward": 0.19328145054169, - "reward_std": 0.6281792521476746, - "rewards/cosine_scaled_reward": -0.2679426074028015, - "rewards/format_reward": 0.7291666567325592, + "loss": 0.0294, + "num_tokens": 35523117.0, + "reward": -0.3206188827753067, + "reward_std": 0.5574362277984619, + "rewards/cosine_scaled_reward": -0.16030943393707275, "step": 273 }, { "clip_ratio": 0.0, - "completion_length": 1347.3333435058594, - "epoch": 0.31314285714285717, - "grad_norm": 0.38863444328308105, - "kl": 0.0164031982421875, + "completion_length": 2586.1666870117188, + "epoch": 0.15657142857142858, + "grad_norm": 0.3104928731918335, + "kl": 0.0055084228515625, "learning_rate": 5.531415671340826e-07, - "loss": 0.0935, - "num_tokens": 30570181.0, - "reward": 0.7795020919293165, - "reward_std": 0.7149695008993149, - "rewards/cosine_scaled_reward": -0.026915639638900757, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.0949, + "num_tokens": 35653757.0, + "reward": -0.13973749428987503, + "reward_std": 0.3990135118365288, + "rewards/cosine_scaled_reward": -0.06986876204609871, "step": 274 }, { "clip_ratio": 0.0, - "completion_length": 2051.1875610351562, - "epoch": 0.3142857142857143, - "grad_norm": 0.4029337763786316, - "kl": 0.015407562255859375, + "completion_length": 2072.541717529297, + "epoch": 0.15714285714285714, + "grad_norm": 0.41220641136169434, + "kl": 0.013563156127929688, "learning_rate": 5.5e-07, - "loss": 0.2663, - "num_tokens": 30677122.0, - "reward": 0.3835557587444782, - "reward_std": 0.8576947599649429, - "rewards/cosine_scaled_reward": -0.14155545644462109, - "rewards/format_reward": 0.6666666641831398, + "loss": 0.0363, + "num_tokens": 35759299.0, + "reward": -0.4124254733324051, + "reward_std": 0.5271046534180641, + "rewards/cosine_scaled_reward": -0.20621273759752512, "step": 275 }, { "clip_ratio": 0.0, - "completion_length": 747.5625305175781, - "epoch": 0.31542857142857145, - "grad_norm": 0.8817548751831055, - "kl": 0.039520263671875, + "completion_length": 1188.1250457763672, + "epoch": 0.15771428571428572, + "grad_norm": 0.4833068251609802, + "kl": 0.0210723876953125, "learning_rate": 5.468584328659172e-07, - "loss": 0.203, - "num_tokens": 30720433.0, - "reward": 1.04302117228508, - "reward_std": 0.6926102489233017, - "rewards/cosine_scaled_reward": 0.031927239149808884, - "rewards/format_reward": 0.9791666716337204, + "loss": -0.0615, + "num_tokens": 35822281.0, + "reward": 0.5711349472403526, + "reward_std": 0.7510395795106888, + "rewards/cosine_scaled_reward": 0.28556746058166027, "step": 276 }, { "clip_ratio": 0.0, - "completion_length": 1000.7083740234375, - "epoch": 0.31657142857142856, - "grad_norm": 0.44100356101989746, - "kl": 0.020416259765625, + "completion_length": 2676.9375915527344, + "epoch": 0.15828571428571428, + "grad_norm": 0.28584277629852295, + "kl": 0.0041713714599609375, "learning_rate": 5.437170188473847e-07, - "loss": 0.4464, - "num_tokens": 30776447.0, - "reward": 0.8042115196585655, - "reward_std": 0.6335554867982864, - "rewards/cosine_scaled_reward": -0.03539424831978977, - "rewards/format_reward": 0.875, + "loss": 0.0217, + "num_tokens": 35956750.0, + "reward": 0.05125083029270172, + "reward_std": 0.6981733292341232, + "rewards/cosine_scaled_reward": 0.025625411421060562, "step": 277 }, { "clip_ratio": 0.0, - "completion_length": 1254.1250610351562, - "epoch": 0.3177142857142857, - "grad_norm": 0.4421320855617523, - "kl": 0.02032470703125, + "completion_length": 2614.9375, + "epoch": 0.15885714285714286, + "grad_norm": 0.29526305198669434, + "kl": 0.002593994140625, "learning_rate": 5.405759110524894e-07, - "loss": 0.0181, - "num_tokens": 30844427.0, - "reward": 0.9023200869560242, - "reward_std": 0.6986501328647137, - "rewards/cosine_scaled_reward": 0.024076687172055244, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.0156, + "num_tokens": 36088627.0, + "reward": -0.3907754272222519, + "reward_std": 0.5090431272983551, + "rewards/cosine_scaled_reward": -0.19538771361112595, "step": 278 }, { "clip_ratio": 0.0, - "completion_length": 1248.6458435058594, - "epoch": 0.31885714285714284, - "grad_norm": 0.43398356437683105, - "kl": 0.023956298828125, + "completion_length": 2388.791748046875, + "epoch": 0.15942857142857142, + "grad_norm": 0.32363101840019226, + "kl": 0.0035810470581054688, "learning_rate": 5.37435262574394e-07, - "loss": 0.1313, - "num_tokens": 30912714.0, - "reward": 0.7318704128265381, - "reward_std": 0.6384659558534622, - "rewards/cosine_scaled_reward": -0.08198146149516106, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.1454, + "num_tokens": 36209013.0, + "reward": 0.351345656439662, + "reward_std": 0.7279459312558174, + "rewards/cosine_scaled_reward": 0.1756728133186698, "step": 279 }, { "clip_ratio": 0.0, - "completion_length": 1238.8333740234375, - "epoch": 0.32, - "grad_norm": 0.34043627977371216, - "kl": 0.0171966552734375, + "completion_length": 1682.5, + "epoch": 0.16, + "grad_norm": 0.4550730884075165, + "kl": 0.0157623291015625, "learning_rate": 5.342952264838747e-07, - "loss": 0.1864, - "num_tokens": 30979336.0, - "reward": 0.9366099536418915, - "reward_std": 0.5328906774520874, - "rewards/cosine_scaled_reward": -0.0004450604319572449, - "rewards/format_reward": 0.9375000149011612, + "loss": -0.0517, + "num_tokens": 36295233.0, + "reward": 0.032998040318489075, + "reward_std": 0.4106169492006302, + "rewards/cosine_scaled_reward": 0.016499027609825134, "step": 280 }, { "clip_ratio": 0.0, - "completion_length": 1022.8125457763672, - "epoch": 0.3211428571428571, - "grad_norm": 0.3274538218975067, - "kl": 0.021148681640625, + "completion_length": 2025.9166793823242, + "epoch": 0.16057142857142856, + "grad_norm": 0.4892213046550751, + "kl": 0.01910400390625, "learning_rate": 5.311559558218603e-07, - "loss": 0.2272, - "num_tokens": 31037209.0, - "reward": 1.0622368827462196, - "reward_std": 0.5683299601078033, - "rewards/cosine_scaled_reward": 0.07278510555624962, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.2053, + "num_tokens": 36398009.0, + "reward": 0.2391605954617262, + "reward_std": 0.40038658678531647, + "rewards/cosine_scaled_reward": 0.1195802828297019, "step": 281 }, { "clip_ratio": 0.0, - "completion_length": 856.2708435058594, - "epoch": 0.3222857142857143, - "grad_norm": 0.3970337212085724, - "kl": 0.0233154296875, + "completion_length": 2755.166717529297, + "epoch": 0.16114285714285714, + "grad_norm": 0.3091196119785309, + "kl": 0.00545501708984375, "learning_rate": 5.28017603591974e-07, - "loss": 0.3198, - "num_tokens": 31086464.0, - "reward": 1.0004925429821014, - "reward_std": 0.6593362241983414, - "rewards/cosine_scaled_reward": 0.01066293753683567, - "rewards/format_reward": 0.9791666716337204, + "loss": -0.0154, + "num_tokens": 36536653.0, + "reward": -0.05713912099599838, + "reward_std": 0.6415699534118176, + "rewards/cosine_scaled_reward": -0.028569556772708893, "step": 282 }, { "clip_ratio": 0.0, - "completion_length": 1168.979232788086, - "epoch": 0.32342857142857145, - "grad_norm": 0.7218369841575623, - "kl": 0.037445068359375, + "completion_length": 2594.9583587646484, + "epoch": 0.16171428571428573, + "grad_norm": 0.3001720607280731, + "kl": 0.009159088134765625, "learning_rate": 5.248803227530763e-07, - "loss": 0.2112, - "num_tokens": 31150315.0, - "reward": 0.5269430354237556, - "reward_std": 0.39331691712141037, - "rewards/cosine_scaled_reward": -0.14277848601341248, - "rewards/format_reward": 0.8124999850988388, + "loss": 0.1765, + "num_tokens": 36666971.0, + "reward": -0.5184054747223854, + "reward_std": 0.3386564552783966, + "rewards/cosine_scaled_reward": -0.2592027336359024, "step": 283 }, { "clip_ratio": 0.0, - "completion_length": 1338.3750305175781, - "epoch": 0.32457142857142857, - "grad_norm": 0.3623667359352112, - "kl": 0.01910400390625, + "completion_length": 2470.2291717529297, + "epoch": 0.16228571428571428, + "grad_norm": 0.43797507882118225, + "kl": 0.0067348480224609375, "learning_rate": 5.21744266211809e-07, - "loss": 0.2846, - "num_tokens": 31222393.0, - "reward": 0.6486207991838455, - "reward_std": 0.5024735182523727, - "rewards/cosine_scaled_reward": -0.1131896236911416, - "rewards/format_reward": 0.875, + "loss": 0.0402, + "num_tokens": 36791950.0, + "reward": 0.5365435220301151, + "reward_std": 0.8577289432287216, + "rewards/cosine_scaled_reward": 0.26827176474034786, "step": 284 }, { "clip_ratio": 0.0, - "completion_length": 1468.0833740234375, - "epoch": 0.32571428571428573, - "grad_norm": 0.31146690249443054, - "kl": 0.0211334228515625, + "completion_length": 3468.2709350585938, + "epoch": 0.16285714285714287, + "grad_norm": 0.20558501780033112, + "kl": 0.0013113021850585938, "learning_rate": 5.186095868151436e-07, - "loss": 0.3163, - "num_tokens": 31300721.0, - "reward": 0.5346148237586021, - "reward_std": 0.5083964094519615, - "rewards/cosine_scaled_reward": -0.1597759248688817, - "rewards/format_reward": 0.8541666567325592, + "loss": 0.059, + "num_tokens": 36964079.0, + "reward": -0.32812320441007614, + "reward_std": 0.5332767590880394, + "rewards/cosine_scaled_reward": -0.16406160034239292, "step": 285 }, { "clip_ratio": 0.0, - "completion_length": 920.1250305175781, - "epoch": 0.32685714285714285, - "grad_norm": 0.3465576767921448, - "kl": 0.02032470703125, + "completion_length": 2288.3125610351562, + "epoch": 0.16342857142857142, + "grad_norm": 0.24818028509616852, + "kl": 0.002147674560546875, "learning_rate": 5.154764373429315e-07, - "loss": 0.1612, - "num_tokens": 31352723.0, - "reward": 1.0635754466056824, - "reward_std": 0.8085414916276932, - "rewards/cosine_scaled_reward": 0.04220437444746494, - "rewards/format_reward": 0.9791666716337204, + "loss": -0.0275, + "num_tokens": 37080458.0, + "reward": 0.6154246423393488, + "reward_std": 0.8250385522842407, + "rewards/cosine_scaled_reward": 0.30771232303231955, "step": 286 }, { "clip_ratio": 0.0, - "completion_length": 915.2500152587891, - "epoch": 0.328, - "grad_norm": 0.4792618751525879, - "kl": 0.031646728515625, + "completion_length": 2368.520896911621, + "epoch": 0.164, + "grad_norm": 0.2976325750350952, + "kl": 0.007648468017578125, "learning_rate": 5.123449705004581e-07, - "loss": 0.3252, - "num_tokens": 31404635.0, - "reward": 0.9221690893173218, - "reward_std": 0.3884202316403389, - "rewards/cosine_scaled_reward": 0.002751174382865429, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0946, + "num_tokens": 37200447.0, + "reward": 0.4257568195462227, + "reward_std": 0.9012775421142578, + "rewards/cosine_scaled_reward": 0.21287840977311134, "step": 287 }, { "clip_ratio": 0.0, - "completion_length": 839.7291946411133, - "epoch": 0.3291428571428571, - "grad_norm": 1.0952292680740356, - "kl": 0.05023193359375, + "completion_length": 2584.395896911621, + "epoch": 0.16457142857142856, + "grad_norm": 0.26449525356292725, + "kl": 0.0072422027587890625, "learning_rate": 5.09215338910999e-07, - "loss": 0.1517, - "num_tokens": 31452514.0, - "reward": 0.9844385534524918, - "reward_std": 0.5249290615320206, - "rewards/cosine_scaled_reward": 0.002635940443724394, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0002, + "num_tokens": 37330510.0, + "reward": -0.43290044367313385, + "reward_std": 0.4573391415178776, + "rewards/cosine_scaled_reward": -0.21645020693540573, "step": 288 }, { "clip_ratio": 0.0, - "completion_length": 1021.3125305175781, - "epoch": 0.3302857142857143, - "grad_norm": 0.4906879961490631, - "kl": 0.023193359375, + "completion_length": 2852.7083740234375, + "epoch": 0.16514285714285715, + "grad_norm": 0.23969526588916779, + "kl": 0.0036468505859375, "learning_rate": 5.060876951083828e-07, - "loss": 0.2844, - "num_tokens": 31509289.0, - "reward": 0.7608919851481915, - "reward_std": 0.28182800114154816, - "rewards/cosine_scaled_reward": -0.08830402046442032, - "rewards/format_reward": 0.9375, + "loss": 0.1021, + "num_tokens": 37473044.0, + "reward": -0.14536024630069733, + "reward_std": 0.7508940920233727, + "rewards/cosine_scaled_reward": -0.07268010824918747, "step": 289 }, { "clip_ratio": 0.0, - "completion_length": 1111.2500305175781, - "epoch": 0.3314285714285714, - "grad_norm": 0.490951806306839, - "kl": 0.023193359375, + "completion_length": 2355.1250915527344, + "epoch": 0.1657142857142857, + "grad_norm": 0.2979152202606201, + "kl": 0.0029964447021484375, "learning_rate": 5.02962191529556e-07, - "loss": 0.0739, - "num_tokens": 31570423.0, - "reward": 1.42685866355896, - "reward_std": 0.8060109168291092, - "rewards/cosine_scaled_reward": 0.2655126517638564, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.1276, + "num_tokens": 37591598.0, + "reward": -0.27352139353752136, + "reward_std": 0.5863078981637955, + "rewards/cosine_scaled_reward": -0.13676068745553493, "step": 290 }, { "clip_ratio": 0.0, - "completion_length": 605.0625228881836, - "epoch": 0.3325714285714286, - "grad_norm": 0.6173872351646423, - "kl": 0.031097412109375, + "completion_length": 3103.0833740234375, + "epoch": 0.1662857142857143, + "grad_norm": 0.24183134734630585, + "kl": 0.0017833709716796875, "learning_rate": 4.998389805071536e-07, - "loss": 0.2537, - "num_tokens": 31607386.0, - "reward": 0.8903323113918304, - "reward_std": 0.38973837345838547, - "rewards/cosine_scaled_reward": -0.05483385222032666, - "rewards/format_reward": 1.0, + "loss": 0.0017, + "num_tokens": 37746126.0, + "reward": -0.4938964769244194, + "reward_std": 0.20535914227366447, + "rewards/cosine_scaled_reward": -0.24694822914898396, "step": 291 }, { "clip_ratio": 0.0, - "completion_length": 1337.9167175292969, - "epoch": 0.33371428571428574, - "grad_norm": 0.41129082441329956, - "kl": 0.025390625, + "completion_length": 2742.666748046875, + "epoch": 0.16685714285714287, + "grad_norm": 0.3063388764858246, + "kl": 0.00244903564453125, "learning_rate": 4.967182142620745e-07, - "loss": 0.177, - "num_tokens": 31679484.0, - "reward": 0.4600318595767021, - "reward_std": 0.3473619818687439, - "rewards/cosine_scaled_reward": -0.19706740230321884, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.0973, + "num_tokens": 37883798.0, + "reward": 0.00018896162509918213, + "reward_std": 0.8789166212081909, + "rewards/cosine_scaled_reward": 9.44770872592926e-05, "step": 292 }, { "clip_ratio": 0.0, - "completion_length": 1204.5833587646484, - "epoch": 0.33485714285714285, - "grad_norm": 0.5244786739349365, - "kl": 0.02227783203125, + "completion_length": 1510.7708587646484, + "epoch": 0.16742857142857143, + "grad_norm": 0.48416784405708313, + "kl": 0.020442962646484375, "learning_rate": 4.93600044896063e-07, - "loss": 0.2838, - "num_tokens": 31744744.0, - "reward": 0.4529779404401779, - "reward_std": 0.5236243791878223, - "rewards/cosine_scaled_reward": -0.20059436932206154, - "rewards/format_reward": 0.8541666716337204, + "loss": -0.0533, + "num_tokens": 37961991.0, + "reward": 0.5087592005729675, + "reward_std": 0.6853836588561535, + "rewards/cosine_scaled_reward": 0.2543795704841614, "step": 293 }, { "clip_ratio": 0.0, - "completion_length": 1033.0416870117188, - "epoch": 0.336, - "grad_norm": 0.7969145774841309, - "kl": 0.0338134765625, + "completion_length": 3275.375, + "epoch": 0.168, + "grad_norm": 0.2511613368988037, + "kl": 0.0014057159423828125, "learning_rate": 4.904846243842949e-07, - "loss": 0.1185, - "num_tokens": 31802316.0, - "reward": 1.1573065668344498, - "reward_std": 0.7508738189935684, - "rewards/cosine_scaled_reward": 0.10990326898172498, - "rewards/format_reward": 0.9375, + "loss": 0.0496, + "num_tokens": 38125461.0, + "reward": -0.48528067022562027, + "reward_std": 0.555925726890564, + "rewards/cosine_scaled_reward": -0.24264032766222954, "step": 294 }, { "clip_ratio": 0.0, - "completion_length": 1818.1875762939453, - "epoch": 0.33714285714285713, - "grad_norm": 0.3968656361103058, - "kl": 0.019561767578125, + "completion_length": 3450.3334350585938, + "epoch": 0.16857142857142857, + "grad_norm": 0.2048538625240326, + "kl": 0.001056671142578125, "learning_rate": 4.873721045679706e-07, - "loss": 0.0126, - "num_tokens": 31897539.0, - "reward": 0.5907609835267067, - "reward_std": 0.6056300327181816, - "rewards/cosine_scaled_reward": -0.0275361780077219, - "rewards/format_reward": 0.6458333283662796, + "loss": 0.0367, + "num_tokens": 38297365.0, + "reward": -0.2943028609151952, + "reward_std": 0.4288143813610077, + "rewards/cosine_scaled_reward": -0.1471514304575976, "step": 295 }, { "clip_ratio": 0.0, - "completion_length": 863.5416793823242, - "epoch": 0.3382857142857143, - "grad_norm": 0.47012004256248474, - "kl": 0.0283203125, + "completion_length": 3004.9166870117188, + "epoch": 0.16914285714285715, + "grad_norm": 0.2455970197916031, + "kl": 0.001949310302734375, "learning_rate": 4.842626371469149e-07, - "loss": 0.0947, - "num_tokens": 31946135.0, - "reward": 1.1826601028442383, - "reward_std": 0.742069885134697, - "rewards/cosine_scaled_reward": 0.11216334369964898, - "rewards/format_reward": 0.9583333283662796, + "loss": 0.0845, + "num_tokens": 38447913.0, + "reward": -0.4560352563858032, + "reward_std": 0.5781249962747097, + "rewards/cosine_scaled_reward": -0.22801762074232101, "step": 296 }, { "clip_ratio": 0.0, - "completion_length": 1349.1250305175781, - "epoch": 0.3394285714285714, - "grad_norm": 0.41933736205101013, - "kl": 0.022247314453125, + "completion_length": 2529.437545776367, + "epoch": 0.1697142857142857, + "grad_norm": 0.29431456327438354, + "kl": 0.008335113525390625, "learning_rate": 4.811563736721829e-07, - "loss": 0.4068, - "num_tokens": 32018543.0, - "reward": 0.6420099958777428, - "reward_std": 0.4163365215063095, - "rewards/cosine_scaled_reward": -0.1060783602297306, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.0079, + "num_tokens": 38575026.0, + "reward": 0.16207153722643852, + "reward_std": 0.46869122236967087, + "rewards/cosine_scaled_reward": 0.08103577420115471, "step": 297 }, { "clip_ratio": 0.0, - "completion_length": 1557.1458740234375, - "epoch": 0.3405714285714286, - "grad_norm": 0.3282775282859802, - "kl": 0.01898193359375, + "completion_length": 1855.666732788086, + "epoch": 0.1702857142857143, + "grad_norm": 0.5596351623535156, + "kl": 0.015949249267578125, "learning_rate": 4.780534655386743e-07, - "loss": 0.1917, - "num_tokens": 32101620.0, - "reward": 0.8522088825702667, - "reward_std": 0.7961080744862556, - "rewards/cosine_scaled_reward": -0.011395589681342244, - "rewards/format_reward": 0.8749999850988388, + "loss": -0.1605, + "num_tokens": 38670062.0, + "reward": -0.20284654013812542, + "reward_std": 0.5799939371645451, + "rewards/cosine_scaled_reward": -0.10142326634377241, "step": 298 }, { "clip_ratio": 0.0, - "completion_length": 1234.8333587646484, - "epoch": 0.3417142857142857, - "grad_norm": 0.4244450330734253, - "kl": 0.02215576171875, + "completion_length": 2124.8333740234375, + "epoch": 0.17085714285714285, + "grad_norm": 0.49901139736175537, + "kl": 0.0066089630126953125, "learning_rate": 4.749540639777539e-07, - "loss": 0.4339, - "num_tokens": 32168944.0, - "reward": 0.5344287008047104, - "reward_std": 0.5752788633108139, - "rewards/cosine_scaled_reward": -0.18070233054459095, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.2655, + "num_tokens": 38778390.0, + "reward": -0.23837678879499435, + "reward_std": 0.5732098147273064, + "rewards/cosine_scaled_reward": -0.11918839067220688, "step": 299 }, { "clip_ratio": 0.0, - "completion_length": 1201.8125305175781, - "epoch": 0.34285714285714286, - "grad_norm": 0.412088543176651, - "kl": 0.021697998046875, + "completion_length": 2721.2709045410156, + "epoch": 0.17142857142857143, + "grad_norm": 0.27538150548934937, + "kl": 0.001575469970703125, "learning_rate": 4.7185832004988133e-07, - "loss": 0.1095, - "num_tokens": 32235523.0, - "reward": 1.3541258573532104, - "reward_std": 0.2665238669142127, - "rewards/cosine_scaled_reward": 0.24997958540916443, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.0042, + "num_tokens": 38915167.0, + "reward": -0.44970532320439816, + "reward_std": 0.338998232036829, + "rewards/cosine_scaled_reward": -0.22485265973955393, "step": 300 }, { "clip_ratio": 0.0, - "completion_length": 1400.1667175292969, - "epoch": 0.344, - "grad_norm": 0.35744866728782654, - "kl": 0.0169830322265625, + "completion_length": 2737.937545776367, + "epoch": 0.172, + "grad_norm": 0.36887478828430176, + "kl": 0.017746925354003906, "learning_rate": 4.68766384637248e-07, - "loss": 0.1998, - "num_tokens": 32310705.0, - "reward": 0.8010849542915821, - "reward_std": 0.6691881567239761, - "rewards/cosine_scaled_reward": -0.005707542411983013, - "rewards/format_reward": 0.8125000074505806, + "loss": 0.0646, + "num_tokens": 39053920.0, + "reward": -0.25950850173830986, + "reward_std": 0.16469359770417213, + "rewards/cosine_scaled_reward": -0.12975424528121948, "step": 301 }, { "clip_ratio": 0.0, - "completion_length": 1336.312515258789, - "epoch": 0.34514285714285714, - "grad_norm": 0.8253682255744934, - "kl": 0.028411865234375, + "completion_length": 2430.8958740234375, + "epoch": 0.17257142857142857, + "grad_norm": 0.29716700315475464, + "kl": 0.012701034545898438, "learning_rate": 4.656784084364238e-07, - "loss": 0.2364, - "num_tokens": 32382504.0, - "reward": 0.816130556166172, - "reward_std": 0.3864128515124321, - "rewards/cosine_scaled_reward": -0.01901806378737092, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.0492, + "num_tokens": 39176351.0, + "reward": 0.08118153735995293, + "reward_std": 0.26172393187880516, + "rewards/cosine_scaled_reward": 0.04059076961129904, "step": 302 }, { "clip_ratio": 0.0, - "completion_length": 1049.1458740234375, - "epoch": 0.3462857142857143, - "grad_norm": 0.49328041076660156, - "kl": 0.031890869140625, + "completion_length": 2375.4166870117188, + "epoch": 0.17314285714285715, + "grad_norm": 0.29471853375434875, + "kl": 0.00960540771484375, "learning_rate": 4.6259454195101267e-07, - "loss": 0.0389, - "num_tokens": 32440591.0, - "reward": 1.1552183032035828, - "reward_std": 0.6544827744364738, - "rewards/cosine_scaled_reward": 0.10885915206745267, - "rewards/format_reward": 0.9375, + "loss": 0.0308, + "num_tokens": 39296539.0, + "reward": -0.1111888438463211, + "reward_std": 0.5966824367642403, + "rewards/cosine_scaled_reward": -0.05559442937374115, "step": 303 }, { "clip_ratio": 0.0, - "completion_length": 922.5417022705078, - "epoch": 0.3474285714285714, - "grad_norm": 0.36159005761146545, - "kl": 0.0200042724609375, + "completion_length": 3561.3958740234375, + "epoch": 0.1737142857142857, + "grad_norm": 0.20573833584785461, + "kl": 0.0010528564453125, "learning_rate": 4.59514935484316e-07, - "loss": 0.2868, - "num_tokens": 32492841.0, - "reward": 1.4484843313694, - "reward_std": 0.9271421581506729, - "rewards/cosine_scaled_reward": 0.23465881869196892, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0075, + "num_tokens": 39473918.0, + "reward": -0.32946310192346573, + "reward_std": 0.5383296981453896, + "rewards/cosine_scaled_reward": -0.16473154537379742, "step": 304 }, { "clip_ratio": 0.0, - "completion_length": 1809.0000305175781, - "epoch": 0.3485714285714286, - "grad_norm": 0.3415130078792572, - "kl": 0.01605224609375, + "completion_length": 2468.7084045410156, + "epoch": 0.1742857142857143, + "grad_norm": 0.2480940818786621, + "kl": 0.0030117034912109375, "learning_rate": 4.5643973913200837e-07, - "loss": 0.2458, - "num_tokens": 32587287.0, - "reward": 0.5845005512237549, - "reward_std": 0.6893949508666992, - "rewards/cosine_scaled_reward": -0.08274972066283226, - "rewards/format_reward": 0.75, + "loss": 0.0284, + "num_tokens": 39598176.0, + "reward": 0.03428598493337631, + "reward_std": 0.5898709297180176, + "rewards/cosine_scaled_reward": 0.01714298501610756, "step": 305 }, { "clip_ratio": 0.0, - "completion_length": 1085.083396911621, - "epoch": 0.3497142857142857, - "grad_norm": 0.44499897956848145, - "kl": 0.023040771484375, + "completion_length": 1914.5209045410156, + "epoch": 0.17485714285714285, + "grad_norm": 0.4037153124809265, + "kl": 0.009237289428710938, "learning_rate": 4.5336910277482155e-07, - "loss": 0.1673, - "num_tokens": 32646685.0, - "reward": 1.1248981356620789, - "reward_std": 0.6042323708534241, - "rewards/cosine_scaled_reward": 0.10411571276199538, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.1171, + "num_tokens": 39695629.0, + "reward": 0.6262711547315121, + "reward_std": 0.7676540613174438, + "rewards/cosine_scaled_reward": 0.31313556246459484, "step": 306 }, { "clip_ratio": 0.0, - "completion_length": 1154.5000305175781, - "epoch": 0.35085714285714287, - "grad_norm": 0.44520094990730286, - "kl": 0.020751953125, + "completion_length": 3471.2084350585938, + "epoch": 0.17542857142857143, + "grad_norm": 0.18108604848384857, + "kl": 0.0008935928344726562, "learning_rate": 4.503031760712397e-07, - "loss": 0.2247, - "num_tokens": 32710717.0, - "reward": 1.0021317303180695, - "reward_std": 0.6453195139765739, - "rewards/cosine_scaled_reward": 0.05314922146499157, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.0372, + "num_tokens": 39868547.0, + "reward": -0.5682974830269814, + "reward_std": 0.362131267786026, + "rewards/cosine_scaled_reward": -0.28414873220026493, "step": 307 }, { "clip_ratio": 0.0, - "completion_length": 1123.9375305175781, - "epoch": 0.352, - "grad_norm": 0.6290622353553772, - "kl": 0.0402679443359375, + "completion_length": 1946.3333435058594, + "epoch": 0.176, + "grad_norm": 0.427402138710022, + "kl": 0.012170791625976562, "learning_rate": 4.4724210845020494e-07, - "loss": 0.2587, - "num_tokens": 32772196.0, - "reward": 0.933015450835228, - "reward_std": 0.6842742636799812, - "rewards/cosine_scaled_reward": 0.029007713310420513, - "rewards/format_reward": 0.875, + "loss": 0.0688, + "num_tokens": 39967107.0, + "reward": 0.6227611526846886, + "reward_std": 0.776022270321846, + "rewards/cosine_scaled_reward": 0.3113805763423443, "step": 308 }, { "clip_ratio": 0.0, - "completion_length": 1053.7708435058594, - "epoch": 0.35314285714285715, - "grad_norm": 0.42671680450439453, - "kl": 0.019866943359375, + "completion_length": 3468.5833740234375, + "epoch": 0.17657142857142857, + "grad_norm": 0.21143701672554016, + "kl": 0.0014905929565429688, "learning_rate": 4.441860491038345e-07, - "loss": 0.1434, - "num_tokens": 32830427.0, - "reward": 0.898878924548626, - "reward_std": 0.8900427669286728, - "rewards/cosine_scaled_reward": 0.011939452961087227, - "rewards/format_reward": 0.875, + "loss": 0.0234, + "num_tokens": 40140067.0, + "reward": -0.3016166687011719, + "reward_std": 0.5722797363996506, + "rewards/cosine_scaled_reward": -0.15080832783132792, "step": 309 }, { "clip_ratio": 0.0, - "completion_length": 1383.520866394043, - "epoch": 0.35428571428571426, - "grad_norm": 0.40926042199134827, - "kl": 0.020233154296875, + "completion_length": 2890.5834350585938, + "epoch": 0.17714285714285713, + "grad_norm": 0.22804640233516693, + "kl": 0.0019989013671875, "learning_rate": 4.4113514698014953e-07, - "loss": 0.0119, - "num_tokens": 32904654.0, - "reward": 0.7741856873035431, - "reward_std": 0.43527317326515913, - "rewards/cosine_scaled_reward": 0.01209283946081996, - "rewards/format_reward": 0.75, + "loss": -0.0995, + "num_tokens": 40285223.0, + "reward": 0.00922529399394989, + "reward_std": 0.8645202741026878, + "rewards/cosine_scaled_reward": 0.004612648859620094, "step": 310 }, { "clip_ratio": 0.0, - "completion_length": 1021.2917327880859, - "epoch": 0.3554285714285714, - "grad_norm": 0.6428707838058472, - "kl": 0.036163330078125, + "completion_length": 2540.4792098999023, + "epoch": 0.1777142857142857, + "grad_norm": 0.4666746258735657, + "kl": 0.009735107421875, "learning_rate": 4.3808955077581546e-07, - "loss": 0.2168, - "num_tokens": 32961134.0, - "reward": 1.0443171262741089, - "reward_std": 0.5101570673286915, - "rewards/cosine_scaled_reward": 0.0742418859153986, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.0154, + "num_tokens": 40413298.0, + "reward": -0.29096972569823265, + "reward_std": 0.3306393250823021, + "rewards/cosine_scaled_reward": -0.14548486191779375, "step": 311 }, { "clip_ratio": 0.0, - "completion_length": 1167.8333892822266, - "epoch": 0.3565714285714286, - "grad_norm": 0.3679564297199249, - "kl": 0.0187530517578125, + "completion_length": 2994.437530517578, + "epoch": 0.1782857142857143, + "grad_norm": 0.35905346274375916, + "kl": 0.0028743743896484375, "learning_rate": 4.350494089288943e-07, - "loss": 0.177, - "num_tokens": 33024690.0, - "reward": 0.8535246178507805, - "reward_std": 0.7677148133516312, - "rewards/cosine_scaled_reward": -0.021154375048354268, - "rewards/format_reward": 0.8958333283662796, + "loss": -0.0335, + "num_tokens": 40562971.0, + "reward": -0.363456416875124, + "reward_std": 0.3647861182689667, + "rewards/cosine_scaled_reward": -0.18172819539904594, "step": 312 }, { "clip_ratio": 0.0, - "completion_length": 1486.0833587646484, - "epoch": 0.3577142857142857, - "grad_norm": 0.5498233437538147, - "kl": 0.0181732177734375, + "completion_length": 2380.0833587646484, + "epoch": 0.17885714285714285, + "grad_norm": 0.4951138198375702, + "kl": 0.0026788711547851562, "learning_rate": 4.3201486961161093e-07, - "loss": 0.2557, - "num_tokens": 33104086.0, - "reward": 0.9443192332983017, - "reward_std": 0.4855539873242378, - "rewards/cosine_scaled_reward": 0.06590958125889301, - "rewards/format_reward": 0.8125, + "loss": -0.0585, + "num_tokens": 40683791.0, + "reward": -0.10693264147266746, + "reward_std": 0.5804431773722172, + "rewards/cosine_scaled_reward": -0.05346631887368858, "step": 313 }, { "clip_ratio": 0.0, - "completion_length": 951.8958740234375, - "epoch": 0.3588571428571429, - "grad_norm": 0.4268878698348999, - "kl": 0.020538330078125, + "completion_length": 2596.604248046875, + "epoch": 0.17942857142857144, + "grad_norm": 0.3246864974498749, + "kl": 0.0043659210205078125, "learning_rate": 4.2898608072313045e-07, - "loss": 0.1463, - "num_tokens": 33157841.0, - "reward": 0.9409515410661697, - "reward_std": 0.5662648305296898, - "rewards/cosine_scaled_reward": 0.012142402119934559, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.2706, + "num_tokens": 40814932.0, + "reward": 0.4242757335305214, + "reward_std": 1.1075529158115387, + "rewards/cosine_scaled_reward": 0.21213785372674465, "step": 314 }, { "clip_ratio": 0.0, - "completion_length": 922.3125610351562, - "epoch": 0.36, - "grad_norm": 0.5434188842773438, - "kl": 0.02447509765625, + "completion_length": 2499.791778564453, + "epoch": 0.18, + "grad_norm": 0.27629679441452026, + "kl": 0.0028667449951171875, "learning_rate": 4.2596318988235037e-07, - "loss": 0.17, - "num_tokens": 33210182.0, - "reward": 0.8172921747900546, - "reward_std": 0.4631008133292198, - "rewards/cosine_scaled_reward": -0.04968724772334099, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0475, + "num_tokens": 40940766.0, + "reward": -0.3353143408894539, + "reward_std": 0.637674517929554, + "rewards/cosine_scaled_reward": -0.16765715926885605, "step": 315 }, { "clip_ratio": 0.0, - "completion_length": 1805.6458435058594, - "epoch": 0.36114285714285715, - "grad_norm": 0.3704501688480377, - "kl": 0.01605224609375, + "completion_length": 3229.0416870117188, + "epoch": 0.18057142857142858, + "grad_norm": 0.23208899796009064, + "kl": 0.001750946044921875, "learning_rate": 4.2294634442070553e-07, - "loss": 0.2939, - "num_tokens": 33305451.0, - "reward": 0.3478560894727707, - "reward_std": 0.5816800445318222, - "rewards/cosine_scaled_reward": -0.19065529108047485, - "rewards/format_reward": 0.7291666567325592, + "loss": -0.0575, + "num_tokens": 41101688.0, + "reward": -0.3000589460134506, + "reward_std": 0.4578623231500387, + "rewards/cosine_scaled_reward": -0.15002946369349957, "step": 316 }, { "clip_ratio": 0.0, - "completion_length": 1177.6666870117188, - "epoch": 0.36228571428571427, - "grad_norm": 0.35206273198127747, - "kl": 0.018798828125, + "completion_length": 2577.125030517578, + "epoch": 0.18114285714285713, + "grad_norm": 0.232590913772583, + "kl": 0.0039806365966796875, "learning_rate": 4.1993569137498776e-07, - "loss": 0.1731, - "num_tokens": 33370553.0, - "reward": 1.243408765643835, - "reward_std": 0.6171375811100006, - "rewards/cosine_scaled_reward": 0.1633710116147995, - "rewards/format_reward": 0.9166666716337204, + "loss": -0.0006, + "num_tokens": 41231234.0, + "reward": 0.20818227902054787, + "reward_std": 0.40563616156578064, + "rewards/cosine_scaled_reward": 0.10409114044159651, "step": 317 }, { "clip_ratio": 0.0, - "completion_length": 1134.6875381469727, - "epoch": 0.36342857142857143, - "grad_norm": 0.5754601359367371, - "kl": 0.0250244140625, + "completion_length": 2359.3750610351562, + "epoch": 0.18171428571428572, + "grad_norm": 0.26052311062812805, + "kl": 0.003261566162109375, "learning_rate": 4.1693137748017915e-07, - "loss": 0.198, - "num_tokens": 33432614.0, - "reward": 0.9843797795474529, - "reward_std": 0.5465483516454697, - "rewards/cosine_scaled_reward": 0.06510654278099537, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.0403, + "num_tokens": 41349608.0, + "reward": 0.13113708421587944, + "reward_std": 0.5818060413002968, + "rewards/cosine_scaled_reward": 0.06556854210793972, "step": 318 }, { "clip_ratio": 0.0, - "completion_length": 1850.2083740234375, - "epoch": 0.36457142857142855, - "grad_norm": 0.3118242621421814, - "kl": 0.01446533203125, + "completion_length": 2511.3541870117188, + "epoch": 0.18228571428571427, + "grad_norm": 0.31813371181488037, + "kl": 0.0041961669921875, "learning_rate": 4.1393354916230005e-07, - "loss": 0.1195, - "num_tokens": 33529392.0, - "reward": 0.3020896166563034, - "reward_std": 0.48611459136009216, - "rewards/cosine_scaled_reward": -0.21353853773325682, - "rewards/format_reward": 0.7291666716337204, + "loss": 0.0482, + "num_tokens": 41476213.0, + "reward": 0.3437543660402298, + "reward_std": 0.48744695633649826, + "rewards/cosine_scaled_reward": 0.1718771643936634, "step": 319 }, { "clip_ratio": 0.0, - "completion_length": 1210.4791870117188, - "epoch": 0.3657142857142857, - "grad_norm": 0.35057225823402405, - "kl": 0.0148773193359375, + "completion_length": 2694.854232788086, + "epoch": 0.18285714285714286, + "grad_norm": 0.23167455196380615, + "kl": 0.00571441650390625, "learning_rate": 4.1094235253127374e-07, - "loss": 0.1352, - "num_tokens": 33595565.0, - "reward": 0.7223537564277649, - "reward_std": 0.7105579674243927, - "rewards/cosine_scaled_reward": -0.07632312597706914, - "rewards/format_reward": 0.875, + "loss": 0.0731, + "num_tokens": 41612898.0, + "reward": 0.04213899374008179, + "reward_std": 0.5815633870661259, + "rewards/cosine_scaled_reward": 0.021069496870040894, "step": 320 }, { "clip_ratio": 0.0, - "completion_length": 1327.3334045410156, - "epoch": 0.3668571428571429, - "grad_norm": 0.4181998074054718, - "kl": 0.022186279296875, + "completion_length": 3584.0, + "epoch": 0.18342857142857144, + "grad_norm": 0.19942978024482727, + "kl": 0.00109100341796875, "learning_rate": 4.079579333738039e-07, - "loss": 0.1917, - "num_tokens": 33667215.0, - "reward": 1.0562772899866104, - "reward_std": 0.5822824612259865, - "rewards/cosine_scaled_reward": 0.038555288687348366, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0, + "num_tokens": 41791110.0, + "reward": -0.4013543911278248, + "reward_std": 0.21282335743308067, + "rewards/cosine_scaled_reward": -0.2006771918386221, "step": 321 }, { "clip_ratio": 0.0, - "completion_length": 1351.7083587646484, - "epoch": 0.368, - "grad_norm": 0.3627634644508362, - "kl": 0.0172576904296875, + "completion_length": 2271.437545776367, + "epoch": 0.184, + "grad_norm": 0.33824867010116577, + "kl": 0.013680458068847656, "learning_rate": 4.0498043714627006e-07, - "loss": 0.2456, - "num_tokens": 33740323.0, - "reward": 0.9927564784884453, - "reward_std": 0.689672015607357, - "rewards/cosine_scaled_reward": 0.10054487735033035, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0733, + "num_tokens": 41905971.0, + "reward": -0.16819965280592442, + "reward_std": 0.7291267365217209, + "rewards/cosine_scaled_reward": -0.08409981848672032, "step": 322 }, { "clip_ratio": 0.0, - "completion_length": 1161.5833435058594, - "epoch": 0.36914285714285716, - "grad_norm": 0.4092625677585602, - "kl": 0.021759033203125, + "completion_length": 2732.0416717529297, + "epoch": 0.18457142857142858, + "grad_norm": 0.3594345450401306, + "kl": 0.011373519897460938, "learning_rate": 4.020100089676376e-07, - "loss": 0.2509, - "num_tokens": 33804041.0, - "reward": 1.174657016992569, - "reward_std": 0.748102068901062, - "rewards/cosine_scaled_reward": 0.12899513231241144, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0273, + "num_tokens": 42042989.0, + "reward": -0.20197953283786774, + "reward_std": 0.6335406377911568, + "rewards/cosine_scaled_reward": -0.10098976641893387, "step": 323 }, { "clip_ratio": 0.0, - "completion_length": 1409.6042175292969, - "epoch": 0.3702857142857143, - "grad_norm": 0.3942246735095978, - "kl": 0.016510009765625, + "completion_length": 1872.7292022705078, + "epoch": 0.18514285714285714, + "grad_norm": 0.28887686133384705, + "kl": 0.0067596435546875, "learning_rate": 3.9904679361238526e-07, - "loss": 0.2506, - "num_tokens": 33879412.0, - "reward": 0.6940434277057648, - "reward_std": 0.6117554008960724, - "rewards/cosine_scaled_reward": -0.12172829359769821, - "rewards/format_reward": 0.9375000149011612, + "loss": -0.096, + "num_tokens": 42138424.0, + "reward": -0.19923657178878784, + "reward_std": 0.6053799614310265, + "rewards/cosine_scaled_reward": -0.09961828961968422, "step": 324 }, { "clip_ratio": 0.0, - "completion_length": 1158.0208740234375, - "epoch": 0.37142857142857144, - "grad_norm": 0.41832220554351807, - "kl": 0.021728515625, + "completion_length": 2770.062545776367, + "epoch": 0.18571428571428572, + "grad_norm": 0.3336716890335083, + "kl": 0.003292083740234375, "learning_rate": 3.9609093550344907e-07, - "loss": 0.2276, - "num_tokens": 33942629.0, - "reward": 0.961421325802803, - "reward_std": 0.5547241270542145, - "rewards/cosine_scaled_reward": 0.011960638221353292, - "rewards/format_reward": 0.9375, + "loss": 0.0489, + "num_tokens": 42277711.0, + "reward": -0.5184944495558739, + "reward_std": 0.2977650426328182, + "rewards/cosine_scaled_reward": -0.25924722105264664, "step": 325 }, { "clip_ratio": 0.0, - "completion_length": 2292.2708740234375, - "epoch": 0.37257142857142855, - "grad_norm": 0.28062522411346436, - "kl": 0.0122222900390625, + "completion_length": 1340.2916717529297, + "epoch": 0.18628571428571428, + "grad_norm": 0.6301169991493225, + "kl": 0.020992279052734375, "learning_rate": 3.931425787051832e-07, - "loss": 0.2198, - "num_tokens": 34061160.0, - "reward": 0.520195122808218, - "reward_std": 0.6474234536290169, - "rewards/cosine_scaled_reward": -0.0732357781380415, - "rewards/format_reward": 0.6666666641831398, + "loss": 0.0975, + "num_tokens": 42347565.0, + "reward": -0.12958931922912598, + "reward_std": 0.37651485204696655, + "rewards/cosine_scaled_reward": -0.06479465775191784, "step": 326 }, { "clip_ratio": 0.0, - "completion_length": 1594.6042022705078, - "epoch": 0.3737142857142857, - "grad_norm": 0.3291863203048706, - "kl": 0.018585205078125, + "completion_length": 2393.8333587646484, + "epoch": 0.18685714285714286, + "grad_norm": 0.34227386116981506, + "kl": 0.010608673095703125, "learning_rate": 3.902018669163384e-07, - "loss": 0.2308, - "num_tokens": 34145765.0, - "reward": 1.1332450732588768, - "reward_std": 0.7591084539890289, - "rewards/cosine_scaled_reward": 0.1603725180029869, - "rewards/format_reward": 0.8125, + "loss": 0.2647, + "num_tokens": 42467905.0, + "reward": -0.35234155505895615, + "reward_std": 0.40670277923345566, + "rewards/cosine_scaled_reward": -0.17617077007889748, "step": 327 }, { "clip_ratio": 0.0, - "completion_length": 1420.0833892822266, - "epoch": 0.37485714285714283, - "grad_norm": 0.3587169647216797, - "kl": 0.0157318115234375, + "completion_length": 2259.1250228881836, + "epoch": 0.18742857142857142, + "grad_norm": 0.4104394018650055, + "kl": 0.013408660888671875, "learning_rate": 3.872689434630585e-07, - "loss": 0.1698, - "num_tokens": 34221987.0, - "reward": 0.8290468156337738, - "reward_std": 0.7251327261328697, - "rewards/cosine_scaled_reward": -0.012559936614707112, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.0312, + "num_tokens": 42581863.0, + "reward": -0.2085840255022049, + "reward_std": 0.539259634912014, + "rewards/cosine_scaled_reward": -0.10429202765226364, "step": 328 }, { "clip_ratio": 0.0, - "completion_length": 1996.2916870117188, - "epoch": 0.376, - "grad_norm": 0.2455548644065857, - "kl": 0.012237548828125, + "completion_length": 3043.4375610351562, + "epoch": 0.188, + "grad_norm": 0.2447911649942398, + "kl": 0.0016994476318359375, "learning_rate": 3.843439512918949e-07, - "loss": 0.1534, - "num_tokens": 34326347.0, - "reward": 0.6604045107960701, - "reward_std": 0.5404551178216934, - "rewards/cosine_scaled_reward": -0.05521441251039505, - "rewards/format_reward": 0.7708333358168602, + "loss": 0.0559, + "num_tokens": 42734560.0, + "reward": 0.12149301916360855, + "reward_std": 1.0036925375461578, + "rewards/cosine_scaled_reward": 0.0607465049251914, "step": 329 }, { "clip_ratio": 0.0, - "completion_length": 1714.5417785644531, - "epoch": 0.37714285714285717, - "grad_norm": 0.3259330689907074, - "kl": 0.016571044921875, + "completion_length": 1997.8541717529297, + "epoch": 0.18857142857142858, + "grad_norm": 0.4679599106311798, + "kl": 0.019840240478515625, "learning_rate": 3.8142703296283953e-07, - "loss": 0.259, - "num_tokens": 34416559.0, - "reward": 0.49699069559574127, - "reward_std": 0.7413402050733566, - "rewards/cosine_scaled_reward": -0.13692133501172066, - "rewards/format_reward": 0.7708333283662796, + "loss": 0.0583, + "num_tokens": 42836877.0, + "reward": 0.17294897139072418, + "reward_std": 0.8219190053641796, + "rewards/cosine_scaled_reward": 0.0864744782447815, "step": 330 }, { "clip_ratio": 0.0, - "completion_length": 1363.2500610351562, - "epoch": 0.3782857142857143, - "grad_norm": 0.3899250626564026, - "kl": 0.018096923828125, + "completion_length": 1952.9167289733887, + "epoch": 0.18914285714285714, + "grad_norm": 0.45652568340301514, + "kl": 0.020021438598632812, "learning_rate": 3.785183306423767e-07, - "loss": 0.2887, - "num_tokens": 34489507.0, - "reward": 0.6278744414448738, - "reward_std": 0.6622165739536285, - "rewards/cosine_scaled_reward": -0.12356280605308712, - "rewards/format_reward": 0.875, + "loss": 0.0989, + "num_tokens": 42936137.0, + "reward": 0.13750150427222252, + "reward_std": 0.17961042560636997, + "rewards/cosine_scaled_reward": 0.06875075213611126, "step": 331 }, { "clip_ratio": 0.0, - "completion_length": 1064.4792175292969, - "epoch": 0.37942857142857145, - "grad_norm": 0.4999028742313385, - "kl": 0.02362060546875, + "completion_length": 2125.4166717529297, + "epoch": 0.18971428571428572, + "grad_norm": 0.3350309431552887, + "kl": 0.014866828918457031, "learning_rate": 3.7561798609655373e-07, - "loss": 0.3095, - "num_tokens": 34548552.0, - "reward": 1.3007973432540894, - "reward_std": 0.6751734167337418, - "rewards/cosine_scaled_reward": 0.1816486194729805, - "rewards/format_reward": 0.9375, + "loss": -0.0255, + "num_tokens": 43043629.0, + "reward": -0.17669325042515993, + "reward_std": 0.4804052673280239, + "rewards/cosine_scaled_reward": -0.08834662148728967, "step": 332 }, { "clip_ratio": 0.0, - "completion_length": 1111.0208740234375, - "epoch": 0.38057142857142856, - "grad_norm": 0.4572555422782898, - "kl": 0.0247955322265625, + "completion_length": 2099.1250228881836, + "epoch": 0.19028571428571428, + "grad_norm": 0.5369983315467834, + "kl": 0.010837554931640625, "learning_rate": 3.72726140684072e-07, - "loss": 0.4488, - "num_tokens": 34610029.0, - "reward": 0.5547422207891941, - "reward_std": 0.4824206456542015, - "rewards/cosine_scaled_reward": -0.16012889053672552, - "rewards/format_reward": 0.875, + "loss": 0.2866, + "num_tokens": 43150087.0, + "reward": -0.18136774376034737, + "reward_std": 0.4798622354865074, + "rewards/cosine_scaled_reward": -0.09068387281149626, "step": 333 }, { "clip_ratio": 0.0, - "completion_length": 1466.7709045410156, - "epoch": 0.38171428571428573, - "grad_norm": 0.30377352237701416, - "kl": 0.014862060546875, + "completion_length": 3412.8125610351562, + "epoch": 0.19085714285714286, + "grad_norm": 0.1947650909423828, + "kl": 0.0010356903076171875, "learning_rate": 3.6984293534939737e-07, - "loss": 0.418, - "num_tokens": 34689020.0, - "reward": 0.5577372368425131, - "reward_std": 0.8148107454180717, - "rewards/cosine_scaled_reward": -0.12738139368593693, - "rewards/format_reward": 0.8125, + "loss": -0.0183, + "num_tokens": 43320226.0, + "reward": -0.28891574777662754, + "reward_std": 0.44802433252334595, + "rewards/cosine_scaled_reward": -0.14445787388831377, "step": 334 }, { "clip_ratio": 0.0, - "completion_length": 1797.125015258789, - "epoch": 0.38285714285714284, - "grad_norm": 0.350172758102417, - "kl": 0.0150299072265625, + "completion_length": 1843.6666870117188, + "epoch": 0.19142857142857142, + "grad_norm": 0.4157560467720032, + "kl": 0.0060596466064453125, "learning_rate": 3.6696851061588994e-07, - "loss": 0.1338, - "num_tokens": 34783118.0, - "reward": 0.7335403561592102, - "reward_std": 0.6356885507702827, - "rewards/cosine_scaled_reward": 0.012603512033820152, - "rewards/format_reward": 0.7083333358168602, + "loss": 0.1137, + "num_tokens": 43414110.0, + "reward": -0.3695136718451977, + "reward_std": 0.4990896135568619, + "rewards/cosine_scaled_reward": -0.18475682754069567, "step": 335 }, { "clip_ratio": 0.0, - "completion_length": 1083.3541870117188, - "epoch": 0.384, - "grad_norm": 0.5177885293960571, - "kl": 0.0233154296875, + "completion_length": 2307.500045776367, + "epoch": 0.192, + "grad_norm": 0.29402440786361694, + "kl": 0.0069637298583984375, "learning_rate": 3.641030065789562e-07, - "loss": 0.4695, - "num_tokens": 34842745.0, - "reward": 0.5901815667748451, - "reward_std": 0.6160614341497421, - "rewards/cosine_scaled_reward": -0.14240923523902893, - "rewards/format_reward": 0.875, + "loss": 0.0892, + "num_tokens": 43530186.0, + "reward": -0.007013067603111267, + "reward_std": 0.6947371922433376, + "rewards/cosine_scaled_reward": -0.0035065338015556335, "step": 336 }, { "clip_ratio": 0.0, - "completion_length": 698.9583511352539, - "epoch": 0.3851428571428571, - "grad_norm": 0.905795693397522, - "kl": 0.05902099609375, + "completion_length": 2586.0208740234375, + "epoch": 0.19257142857142856, + "grad_norm": 0.48218098282814026, + "kl": 0.01905059814453125, "learning_rate": 3.612465628992203e-07, - "loss": 0.4006, - "num_tokens": 34884581.0, - "reward": 1.1506546884775162, - "reward_std": 0.6925991177558899, - "rewards/cosine_scaled_reward": 0.0961606577038765, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0188, + "num_tokens": 43660219.0, + "reward": 0.34220655262470245, + "reward_std": 0.47233958914875984, + "rewards/cosine_scaled_reward": 0.17110328003764153, "step": 337 }, { "clip_ratio": 0.0, - "completion_length": 1615.7916717529297, - "epoch": 0.3862857142857143, - "grad_norm": 0.4690731167793274, - "kl": 0.0206756591796875, + "completion_length": 2399.0209350585938, + "epoch": 0.19314285714285714, + "grad_norm": 0.3615526258945465, + "kl": 0.007982254028320312, "learning_rate": 3.5839931879571725e-07, - "loss": 0.1015, - "num_tokens": 34970017.0, - "reward": 0.3961530327796936, - "reward_std": 0.42486437410116196, - "rewards/cosine_scaled_reward": -0.1456734873354435, - "rewards/format_reward": 0.6874999962747097, + "loss": 0.1552, + "num_tokens": 43782692.0, + "reward": 0.5692258452763781, + "reward_std": 1.0823566317558289, + "rewards/cosine_scaled_reward": 0.28461292263818905, "step": 338 }, { "clip_ratio": 0.0, - "completion_length": 1820.6250610351562, - "epoch": 0.38742857142857146, - "grad_norm": 0.26318851113319397, - "kl": 0.01601409912109375, + "completion_length": 3281.166748046875, + "epoch": 0.19371428571428573, + "grad_norm": 0.20542646944522858, + "kl": 0.0014972686767578125, "learning_rate": 3.555614130391079e-07, - "loss": 0.2107, - "num_tokens": 35066017.0, - "reward": 0.5171072706580162, - "reward_std": 0.734325036406517, - "rewards/cosine_scaled_reward": -0.14769636327400804, - "rewards/format_reward": 0.8124999925494194, + "loss": 0.0219, + "num_tokens": 43947064.0, + "reward": -0.37549396604299545, + "reward_std": 0.42432290129363537, + "rewards/cosine_scaled_reward": -0.18774697184562683, "step": 339 }, { "clip_ratio": 0.0, - "completion_length": 1169.3125457763672, - "epoch": 0.38857142857142857, - "grad_norm": 0.8620088696479797, - "kl": 0.051483154296875, + "completion_length": 1948.2083358764648, + "epoch": 0.19428571428571428, + "grad_norm": 0.5580980777740479, + "kl": 0.015346527099609375, "learning_rate": 3.5273298394491515e-07, - "loss": 0.0998, - "num_tokens": 35130214.0, - "reward": 0.9262717366218567, - "reward_std": 0.5215450003743172, - "rewards/cosine_scaled_reward": 0.015219194581732154, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.1066, + "num_tokens": 44046482.0, + "reward": -0.09587634727358818, + "reward_std": 0.37748729810118675, + "rewards/cosine_scaled_reward": -0.04793817549943924, "step": 340 }, { "clip_ratio": 0.0, - "completion_length": 1397.3542175292969, - "epoch": 0.38971428571428574, - "grad_norm": 0.47853705286979675, - "kl": 0.0279083251953125, + "completion_length": 1392.2917022705078, + "epoch": 0.19485714285714287, + "grad_norm": 0.4597311019897461, + "kl": 0.021306991577148438, "learning_rate": 3.4991416936678276e-07, - "loss": 0.2305, - "num_tokens": 35205861.0, - "reward": 1.2229997366666794, - "reward_std": 0.9069466888904572, - "rewards/cosine_scaled_reward": 0.17399985902011395, - "rewards/format_reward": 0.875, + "loss": 0.2272, + "num_tokens": 44118388.0, + "reward": 0.12345625646412373, + "reward_std": 0.5400365628302097, + "rewards/cosine_scaled_reward": 0.06172813195735216, "step": 341 }, { "clip_ratio": 0.0, - "completion_length": 1234.1666870117188, - "epoch": 0.39085714285714285, - "grad_norm": 0.4025494456291199, - "kl": 0.02392578125, + "completion_length": 2098.729232788086, + "epoch": 0.19542857142857142, + "grad_norm": 0.5311233401298523, + "kl": 0.016597747802734375, "learning_rate": 3.471051066897562e-07, - "loss": 0.2341, - "num_tokens": 35272907.0, - "reward": 0.8216968104243279, - "reward_std": 0.5812697075307369, - "rewards/cosine_scaled_reward": 0.004598394094500691, - "rewards/format_reward": 0.8125, + "loss": 0.1392, + "num_tokens": 44224671.0, + "reward": 0.09169729612767696, + "reward_std": 0.7804772108793259, + "rewards/cosine_scaled_reward": 0.04584864107891917, "step": 342 }, { "clip_ratio": 0.0, - "completion_length": 1443.8333740234375, - "epoch": 0.392, - "grad_norm": 0.41579753160476685, - "kl": 0.0198211669921875, + "completion_length": 1821.8333587646484, + "epoch": 0.196, + "grad_norm": 0.37737464904785156, + "kl": 0.00925445556640625, "learning_rate": 3.4430593282358777e-07, - "loss": 0.2448, - "num_tokens": 35350047.0, - "reward": 1.148866169154644, - "reward_std": 0.8502667844295502, - "rewards/cosine_scaled_reward": 0.1369330883026123, - "rewards/format_reward": 0.8750000149011612, + "loss": 0.1748, + "num_tokens": 44317867.0, + "reward": -0.17036119103431702, + "reward_std": 0.8656125217676163, + "rewards/cosine_scaled_reward": -0.08518059551715851, "step": 343 }, { "clip_ratio": 0.0, - "completion_length": 1166.4792022705078, - "epoch": 0.3931428571428571, - "grad_norm": 0.44617852568626404, - "kl": 0.027435302734375, + "completion_length": 2364.9167098999023, + "epoch": 0.19657142857142856, + "grad_norm": 0.420017808675766, + "kl": 0.022002220153808594, "learning_rate": 3.4151678419606233e-07, - "loss": 0.1595, - "num_tokens": 35414708.0, - "reward": 0.8810491487383842, - "reward_std": 0.3746897503733635, - "rewards/cosine_scaled_reward": -0.03864210657775402, - "rewards/format_reward": 0.9583333283662796, + "loss": 0.0984, + "num_tokens": 44436783.0, + "reward": -0.07778553664684296, + "reward_std": 0.6604952029883862, + "rewards/cosine_scaled_reward": -0.03889276832342148, "step": 344 }, { "clip_ratio": 0.0, - "completion_length": 949.5625152587891, - "epoch": 0.3942857142857143, - "grad_norm": 0.3746589124202728, - "kl": 0.019561767578125, + "completion_length": 1328.0, + "epoch": 0.19714285714285715, + "grad_norm": 0.6512780785560608, + "kl": 0.020231246948242188, "learning_rate": 3.387377967463493e-07, - "loss": 0.4207, - "num_tokens": 35467829.0, - "reward": 0.8917692303657532, - "reward_std": 0.3601553849875927, - "rewards/cosine_scaled_reward": -0.022865407168865204, - "rewards/format_reward": 0.9375000149011612, + "loss": 0.0694, + "num_tokens": 44506143.0, + "reward": 0.46496348083019257, + "reward_std": 0.42599966563284397, + "rewards/cosine_scaled_reward": 0.23248173296451569, "step": 345 }, { "clip_ratio": 0.0, - "completion_length": 1269.5208587646484, - "epoch": 0.3954285714285714, - "grad_norm": 0.4141824543476105, - "kl": 0.02264404296875, + "completion_length": 2967.9791870117188, + "epoch": 0.1977142857142857, + "grad_norm": 0.27706509828567505, + "kl": 0.00400543212890625, "learning_rate": 3.359691059183761e-07, - "loss": 0.0, - "num_tokens": 35536926.0, - "reward": 0.8968387022614479, - "reward_std": 0.4571293629705906, - "rewards/cosine_scaled_reward": 0.010919326916337013, - "rewards/format_reward": 0.875, + "loss": 0.2109, + "num_tokens": 44655182.0, + "reward": -0.2751462832093239, + "reward_std": 0.48510361462831497, + "rewards/cosine_scaled_reward": -0.13757313415408134, "step": 346 }, { "clip_ratio": 0.0, - "completion_length": 1213.3333435058594, - "epoch": 0.3965714285714286, - "grad_norm": 0.7501462697982788, - "kl": 0.0380859375, + "completion_length": 2848.125, + "epoch": 0.1982857142857143, + "grad_norm": 0.22808820009231567, + "kl": 0.00789642333984375, "learning_rate": 3.3321084665422803e-07, - "loss": 0.23, - "num_tokens": 35603176.0, - "reward": 1.100012943148613, - "reward_std": 0.9596873223781586, - "rewards/cosine_scaled_reward": 0.13333981484174728, - "rewards/format_reward": 0.8333333432674408, + "loss": -0.0171, + "num_tokens": 44798228.0, + "reward": -0.2193898782134056, + "reward_std": 0.39861927926540375, + "rewards/cosine_scaled_reward": -0.10969493724405766, "step": 347 }, { "clip_ratio": 0.0, - "completion_length": 1136.8750305175781, - "epoch": 0.3977142857142857, - "grad_norm": 0.40254196524620056, - "kl": 0.0218658447265625, + "completion_length": 2773.1666870117188, + "epoch": 0.19885714285714284, + "grad_norm": 0.2631858289241791, + "kl": 0.0020580291748046875, "learning_rate": 3.3046315338757026e-07, - "loss": -0.0113, - "num_tokens": 35665438.0, - "reward": 0.8266020230948925, - "reward_std": 0.5243057832121849, - "rewards/cosine_scaled_reward": -0.03461567126214504, - "rewards/format_reward": 0.8958333283662796, + "loss": -0.0332, + "num_tokens": 44937184.0, + "reward": -0.23417676240205765, + "reward_std": 0.4104016348719597, + "rewards/cosine_scaled_reward": -0.11708838120102882, "step": 348 }, { "clip_ratio": 0.0, - "completion_length": 974.5833892822266, - "epoch": 0.39885714285714285, - "grad_norm": 0.4754847586154938, - "kl": 0.0227203369140625, + "completion_length": 3129.2084350585938, + "epoch": 0.19942857142857143, + "grad_norm": 0.2336844503879547, + "kl": 0.001857757568359375, "learning_rate": 3.2772616003709616e-07, - "loss": 0.175, - "num_tokens": 35720006.0, - "reward": 1.4643159806728363, - "reward_std": 0.525859147310257, - "rewards/cosine_scaled_reward": 0.25299129262566566, - "rewards/format_reward": 0.9583333283662796, + "loss": 0.0764, + "num_tokens": 45093458.0, + "reward": -0.11799243092536926, + "reward_std": 0.512191891670227, + "rewards/cosine_scaled_reward": -0.05899622291326523, "step": 349 }, { "clip_ratio": 0.0, - "completion_length": 811.2291793823242, - "epoch": 0.4, - "grad_norm": 0.4111607074737549, - "kl": 0.02227783203125, + "completion_length": 1597.0625457763672, + "epoch": 0.2, + "grad_norm": 0.3072298467159271, + "kl": 0.0224151611328125, "learning_rate": 3.250000000000001e-07, - "loss": 0.3028, - "num_tokens": 35766733.0, - "reward": 1.3436209559440613, - "reward_std": 0.5788788720965385, - "rewards/cosine_scaled_reward": 0.20306045236065984, - "rewards/format_reward": 0.9375, + "loss": 0.1081, + "num_tokens": 45175661.0, + "reward": 0.5722237005829811, + "reward_std": 0.33796822652220726, + "rewards/cosine_scaled_reward": 0.2861118447035551, "step": 350 }, { "clip_ratio": 0.0, - "completion_length": 1149.437515258789, - "epoch": 0.40114285714285713, - "grad_norm": 0.5062916874885559, - "kl": 0.0225830078125, + "completion_length": 2634.166717529297, + "epoch": 0.20057142857142857, + "grad_norm": 0.2797272205352783, + "kl": 0.003326416015625, "learning_rate": 3.222848061454764e-07, - "loss": 0.199, - "num_tokens": 35830264.0, - "reward": 0.8019091859459877, - "reward_std": 0.5678411647677422, - "rewards/cosine_scaled_reward": -0.03654539864510298, - "rewards/format_reward": 0.875, + "loss": 0.2635, + "num_tokens": 45307765.0, + "reward": -0.06509780511260033, + "reward_std": 0.7609989158809185, + "rewards/cosine_scaled_reward": -0.03254890255630016, "step": 351 }, { "clip_ratio": 0.0, - "completion_length": 1280.0625305175781, - "epoch": 0.4022857142857143, - "grad_norm": 0.3271620571613312, - "kl": 0.0175933837890625, + "completion_length": 1977.9375, + "epoch": 0.20114285714285715, + "grad_norm": 0.4983066916465759, + "kl": 0.020462989807128906, "learning_rate": 3.195807108082429e-07, - "loss": 0.2915, - "num_tokens": 35899897.0, - "reward": 0.9320071637630463, - "reward_std": 0.5362000018358231, - "rewards/cosine_scaled_reward": -0.00274643674492836, - "rewards/format_reward": 0.9375000149011612, + "loss": 0.0084, + "num_tokens": 45408382.0, + "reward": 0.057065196335315704, + "reward_std": 0.47916316613554955, + "rewards/cosine_scaled_reward": 0.028532586991786957, "step": 352 }, { "clip_ratio": 0.0, - "completion_length": 1254.0208435058594, - "epoch": 0.4034285714285714, - "grad_norm": 0.3472294211387634, - "kl": 0.02008056640625, + "completion_length": 2337.979217529297, + "epoch": 0.2017142857142857, + "grad_norm": 0.30659788846969604, + "kl": 0.005828857421875, "learning_rate": 3.168878457820915e-07, - "loss": 0.2191, - "num_tokens": 35968424.0, - "reward": 1.047704242169857, - "reward_std": 0.4904594272375107, - "rewards/cosine_scaled_reward": 0.06551877036690712, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.1821, + "num_tokens": 45527085.0, + "reward": -0.3150169067084789, + "reward_std": 0.7280392572283745, + "rewards/cosine_scaled_reward": -0.15750844962894917, "step": 353 }, { "clip_ratio": 0.0, - "completion_length": 1009.8542175292969, - "epoch": 0.4045714285714286, - "grad_norm": 0.49516573548316956, - "kl": 0.0303955078125, + "completion_length": 2067.145835876465, + "epoch": 0.2022857142857143, + "grad_norm": 0.3004125952720642, + "kl": 0.017545700073242188, "learning_rate": 3.142063423134644e-07, - "loss": 0.2002, - "num_tokens": 36024817.0, - "reward": 0.5630597248673439, - "reward_std": 0.22670871764421463, - "rewards/cosine_scaled_reward": -0.20805347710847855, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0771, + "num_tokens": 45631768.0, + "reward": -0.013154599815607071, + "reward_std": 0.4110375605523586, + "rewards/cosine_scaled_reward": -0.006577307358384132, "step": 354 }, { "clip_ratio": 0.0, - "completion_length": 1270.7291946411133, - "epoch": 0.4057142857142857, - "grad_norm": 0.7044930458068848, - "kl": 0.0262908935546875, + "completion_length": 2189.7917098999023, + "epoch": 0.20285714285714285, + "grad_norm": 0.4463716447353363, + "kl": 0.0159759521484375, "learning_rate": 3.115363310950578e-07, - "loss": 0.206, - "num_tokens": 36092706.0, - "reward": 0.9475711584091187, - "reward_std": 0.36783919855952263, - "rewards/cosine_scaled_reward": 0.06753556802868843, - "rewards/format_reward": 0.8124999925494194, + "loss": 0.0418, + "num_tokens": 45742506.0, + "reward": 0.08110996335744858, + "reward_std": 0.33503045327961445, + "rewards/cosine_scaled_reward": 0.040554989129304886, "step": 355 }, { "clip_ratio": 0.0, - "completion_length": 1286.166732788086, - "epoch": 0.40685714285714286, - "grad_norm": 0.3811585009098053, - "kl": 0.019927978515625, + "completion_length": 2110.8542098999023, + "epoch": 0.20342857142857143, + "grad_norm": 0.37933966517448425, + "kl": 0.02321624755859375, "learning_rate": 3.0887794225945143e-07, - "loss": 0.3389, - "num_tokens": 36163136.0, - "reward": 0.8006436377763748, - "reward_std": 0.4206415191292763, - "rewards/cosine_scaled_reward": -0.026761506218463182, - "rewards/format_reward": 0.8541666716337204, + "loss": -0.0443, + "num_tokens": 45849299.0, + "reward": 0.5204504579305649, + "reward_std": 0.5081758014857769, + "rewards/cosine_scaled_reward": 0.26022522151470184, "step": 356 }, { "clip_ratio": 0.0, - "completion_length": 1452.2709045410156, - "epoch": 0.408, - "grad_norm": 0.48744305968284607, - "kl": 0.0196380615234375, + "completion_length": 3009.8958740234375, + "epoch": 0.204, + "grad_norm": 0.22186511754989624, + "kl": 0.0023040771484375, "learning_rate": 3.062313053727671e-07, - "loss": 0.5284, - "num_tokens": 36241107.0, - "reward": 0.5863317297771573, - "reward_std": 0.4162827581167221, - "rewards/cosine_scaled_reward": -0.11308415234088898, - "rewards/format_reward": 0.8125000074505806, + "loss": 0.0384, + "num_tokens": 46000422.0, + "reward": -0.5638711154460907, + "reward_std": 0.42032088339328766, + "rewards/cosine_scaled_reward": -0.28193555772304535, "step": 357 }, { "clip_ratio": 0.0, - "completion_length": 1344.3541870117188, - "epoch": 0.40914285714285714, - "grad_norm": 0.4239792823791504, - "kl": 0.02117919921875, + "completion_length": 2124.437530517578, + "epoch": 0.20457142857142857, + "grad_norm": 0.48417916893959045, + "kl": 0.012571334838867188, "learning_rate": 3.0359654942835247e-07, - "loss": 0.3413, - "num_tokens": 36314024.0, - "reward": 0.6943970248103142, - "reward_std": 0.7140942215919495, - "rewards/cosine_scaled_reward": -0.07988481689244509, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.2297, + "num_tokens": 46108431.0, + "reward": 0.3528481721878052, + "reward_std": 0.594680666923523, + "rewards/cosine_scaled_reward": 0.17642410099506378, "step": 358 }, { "clip_ratio": 0.0, - "completion_length": 1117.1042404174805, - "epoch": 0.4102857142857143, - "grad_norm": 0.5485315918922424, - "kl": 0.0322418212890625, + "completion_length": 2094.8542098999023, + "epoch": 0.20514285714285715, + "grad_norm": 0.4315508306026459, + "kl": 0.013898849487304688, "learning_rate": 3.0097380284049523e-07, - "loss": 0.1325, - "num_tokens": 36375343.0, - "reward": 1.2047154903411865, - "reward_std": 0.547421857714653, - "rewards/cosine_scaled_reward": 0.13360773120075464, - "rewards/format_reward": 0.9375, + "loss": 0.053, + "num_tokens": 46214660.0, + "reward": -0.27906909096054733, + "reward_std": 0.774463415145874, + "rewards/cosine_scaled_reward": -0.1395345416967757, "step": 359 }, { "clip_ratio": 0.0, - "completion_length": 562.5208511352539, - "epoch": 0.4114285714285714, - "grad_norm": 0.5694870352745056, - "kl": 0.03411865234375, + "completion_length": 2567.1250610351562, + "epoch": 0.2057142857142857, + "grad_norm": 0.285454660654068, + "kl": 0.009855270385742188, "learning_rate": 2.9836319343816397e-07, - "loss": 0.1663, - "num_tokens": 36409910.0, - "reward": 1.1702253967523575, - "reward_std": 0.46344269812107086, - "rewards/cosine_scaled_reward": 0.08511268568690866, - "rewards/format_reward": 1.0, + "loss": 0.0508, + "num_tokens": 46344038.0, + "reward": -0.10896847397089005, + "reward_std": 0.5441469997167587, + "rewards/cosine_scaled_reward": -0.05448423605412245, "step": 360 }, { "clip_ratio": 0.0, - "completion_length": 1849.8958740234375, - "epoch": 0.4125714285714286, - "grad_norm": 0.35474714636802673, - "kl": 0.022979736328125, + "completion_length": 2514.5, + "epoch": 0.2062857142857143, + "grad_norm": 0.26314839720726013, + "kl": 0.0074367523193359375, "learning_rate": 2.9576484845877793e-07, - "loss": 0.118, - "num_tokens": 36506409.0, - "reward": 0.4462018497288227, - "reward_std": 0.2832772321999073, - "rewards/cosine_scaled_reward": -0.08939909003674984, - "rewards/format_reward": 0.625, + "loss": 0.043, + "num_tokens": 46471574.0, + "reward": 0.004594132304191589, + "reward_std": 0.6141269728541374, + "rewards/cosine_scaled_reward": 0.002297069877386093, "step": 361 }, { "clip_ratio": 0.0, - "completion_length": 1358.5833435058594, - "epoch": 0.4137142857142857, - "grad_norm": 0.4604112207889557, - "kl": 0.023681640625, + "completion_length": 2410.2708740234375, + "epoch": 0.20685714285714285, + "grad_norm": 0.2571261525154114, + "kl": 0.0035762786865234375, "learning_rate": 2.931788945420058e-07, - "loss": 0.2728, - "num_tokens": 36580003.0, - "reward": 0.8181298896670341, - "reward_std": 0.7494820207357407, - "rewards/cosine_scaled_reward": 0.0028149131685495377, - "rewards/format_reward": 0.8125000149011612, + "loss": 0.0212, + "num_tokens": 46592595.0, + "reward": 0.23570144176483154, + "reward_std": 0.9038522392511368, + "rewards/cosine_scaled_reward": 0.11785072460770607, "step": 362 }, { "clip_ratio": 0.0, - "completion_length": 1144.3125305175781, - "epoch": 0.41485714285714287, - "grad_norm": 0.44948238134384155, - "kl": 0.0284423828125, + "completion_length": 2346.1250610351562, + "epoch": 0.20742857142857143, + "grad_norm": 0.4718479812145233, + "kl": 0.0143280029296875, "learning_rate": 2.9060545772359305e-07, - "loss": 0.1588, - "num_tokens": 36642832.0, - "reward": 0.7569349557161331, - "reward_std": 0.5557837300002575, - "rewards/cosine_scaled_reward": -0.06944921240210533, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.0335, + "num_tokens": 46710693.0, + "reward": -0.08351481426507235, + "reward_std": 0.6729232966899872, + "rewards/cosine_scaled_reward": -0.041757403407245874, "step": 363 }, { "clip_ratio": 0.0, - "completion_length": 1319.2708435058594, - "epoch": 0.416, - "grad_norm": 0.40363508462905884, - "kl": 0.0239715576171875, + "completion_length": 3191.6458740234375, + "epoch": 0.208, + "grad_norm": 0.25456610321998596, + "kl": 0.0029468536376953125, "learning_rate": 2.8804466342921987e-07, - "loss": 0.3013, - "num_tokens": 36714131.0, - "reward": 0.6689659608528018, - "reward_std": 0.6713744327425957, - "rewards/cosine_scaled_reward": -0.050933680147863925, - "rewards/format_reward": 0.7708333358168602, + "loss": -0.0697, + "num_tokens": 46869184.0, + "reward": -0.24707527458667755, + "reward_std": 0.4364708364009857, + "rewards/cosine_scaled_reward": -0.12353762984275818, "step": 364 }, { "clip_ratio": 0.0, - "completion_length": 1737.854248046875, - "epoch": 0.41714285714285715, - "grad_norm": 0.3201209604740143, - "kl": 0.018829345703125, + "completion_length": 2315.5209045410156, + "epoch": 0.20857142857142857, + "grad_norm": 0.33200034499168396, + "kl": 0.00350189208984375, "learning_rate": 2.854966364683872e-07, - "loss": -0.001, - "num_tokens": 36805390.0, - "reward": 0.7450773566961288, - "reward_std": 0.8447261303663254, - "rewards/cosine_scaled_reward": -0.0024613337591290474, - "rewards/format_reward": 0.75, + "loss": -0.0, + "num_tokens": 46985621.0, + "reward": -0.40703647769987583, + "reward_std": 0.5758531466126442, + "rewards/cosine_scaled_reward": -0.20351823465898633, "step": 365 }, { "clip_ratio": 0.0, - "completion_length": 1310.9583435058594, - "epoch": 0.41828571428571426, - "grad_norm": 0.336849570274353, - "kl": 0.0190582275390625, + "completion_length": 2923.937530517578, + "epoch": 0.20914285714285713, + "grad_norm": 0.23750042915344238, + "kl": 0.003543853759765625, "learning_rate": 2.829615010283344e-07, - "loss": 0.1492, - "num_tokens": 36876134.0, - "reward": 0.7563202679157257, - "reward_std": 0.5958736017346382, - "rewards/cosine_scaled_reward": -0.06975654885172844, - "rewards/format_reward": 0.8958333283662796, + "loss": -0.013, + "num_tokens": 47132150.0, + "reward": -0.4153160899877548, + "reward_std": 0.33962608128786087, + "rewards/cosine_scaled_reward": -0.2076580412685871, "step": 366 }, { "clip_ratio": 0.0, - "completion_length": 946.7500152587891, - "epoch": 0.41942857142857143, - "grad_norm": 0.5472022294998169, - "kl": 0.028717041015625, + "completion_length": 2619.208396911621, + "epoch": 0.20971428571428571, + "grad_norm": 0.4440971612930298, + "kl": 0.019357681274414062, "learning_rate": 2.8043938066798645e-07, - "loss": 0.1855, - "num_tokens": 36929078.0, - "reward": 1.0344794690608978, - "reward_std": 0.44533807784318924, - "rewards/cosine_scaled_reward": 0.06932303309440613, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.0553, + "num_tokens": 47263032.0, + "reward": 0.14148210734128952, + "reward_std": 0.9161304086446762, + "rewards/cosine_scaled_reward": 0.0707410629838705, "step": 367 }, { "clip_ratio": 0.0, - "completion_length": 1162.7083740234375, - "epoch": 0.4205714285714286, - "grad_norm": 0.4045441448688507, - "kl": 0.021575927734375, + "completion_length": 2354.354248046875, + "epoch": 0.2102857142857143, + "grad_norm": 0.31312811374664307, + "kl": 0.00652313232421875, "learning_rate": 2.7793039831193133e-07, - "loss": -0.0114, - "num_tokens": 36993378.0, - "reward": 1.0075117945671082, - "reward_std": 0.6353353708982468, - "rewards/cosine_scaled_reward": 0.08708921447396278, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0197, + "num_tokens": 47382209.0, + "reward": 0.4818605841137469, + "reward_std": 1.0587524473667145, + "rewards/cosine_scaled_reward": 0.24093026970513165, "step": 368 }, { "clip_ratio": 0.0, - "completion_length": 1022.9791870117188, - "epoch": 0.4217142857142857, - "grad_norm": 0.4526968002319336, - "kl": 0.02374267578125, + "completion_length": 2897.875, + "epoch": 0.21085714285714285, + "grad_norm": 0.22899067401885986, + "kl": 0.0037288665771484375, "learning_rate": 2.7543467624442956e-07, - "loss": 0.1353, - "num_tokens": 37050395.0, - "reward": 1.3955522179603577, - "reward_std": 0.35689088329672813, - "rewards/cosine_scaled_reward": 0.2081927489489317, - "rewards/format_reward": 0.9791666716337204, + "loss": -0.0322, + "num_tokens": 47527295.0, + "reward": -0.12967145442962646, + "reward_std": 0.5554239340126514, + "rewards/cosine_scaled_reward": -0.06483572721481323, "step": 369 }, { "clip_ratio": 0.0, - "completion_length": 1004.8958587646484, - "epoch": 0.4228571428571429, - "grad_norm": 0.41456565260887146, - "kl": 0.02435302734375, + "completion_length": 2788.3958740234375, + "epoch": 0.21142857142857144, + "grad_norm": 0.2895820140838623, + "kl": 0.003108978271484375, "learning_rate": 2.729523361034538e-07, - "loss": 0.4306, - "num_tokens": 37107066.0, - "reward": 0.7459383457899094, - "reward_std": 0.5385182648897171, - "rewards/cosine_scaled_reward": -0.09578085504472256, - "rewards/format_reward": 0.9375000149011612, + "loss": 0.0391, + "num_tokens": 47666670.0, + "reward": -0.20442558825016022, + "reward_std": 0.4632416293025017, + "rewards/cosine_scaled_reward": -0.10221279412508011, "step": 370 }, { "clip_ratio": 0.0, - "completion_length": 927.7708892822266, - "epoch": 0.424, - "grad_norm": 0.39877113699913025, - "kl": 0.02532958984375, + "completion_length": 2021.9166793823242, + "epoch": 0.212, + "grad_norm": 0.46538272500038147, + "kl": 0.022820472717285156, "learning_rate": 2.7048349887476037e-07, - "loss": 0.2257, - "num_tokens": 37159825.0, - "reward": 0.9463022500276566, - "reward_std": 0.7569273114204407, - "rewards/cosine_scaled_reward": -0.006015541031956673, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0362, + "num_tokens": 47769830.0, + "reward": 0.32230387814342976, + "reward_std": 0.5243347375653684, + "rewards/cosine_scaled_reward": 0.16115193953737617, "step": 371 }, { "clip_ratio": 0.0, - "completion_length": 860.7291793823242, - "epoch": 0.42514285714285716, - "grad_norm": 1.0322585105895996, - "kl": 0.0416259765625, + "completion_length": 2881.1666717529297, + "epoch": 0.21257142857142858, + "grad_norm": 0.25469550490379333, + "kl": 0.002590179443359375, "learning_rate": 2.6802828488599294e-07, - "loss": 0.2483, - "num_tokens": 37209402.0, - "reward": 1.0459811389446259, - "reward_std": 0.37404677644371986, - "rewards/cosine_scaled_reward": 0.033407218754291534, - "rewards/format_reward": 0.9791666716337204, + "loss": -0.0075, + "num_tokens": 47913934.0, + "reward": -0.4967922382056713, + "reward_std": 0.17979411222040653, + "rewards/cosine_scaled_reward": -0.24839610792696476, "step": 372 }, { "clip_ratio": 0.0, - "completion_length": 1284.541732788086, - "epoch": 0.42628571428571427, - "grad_norm": 0.3862076997756958, - "kl": 0.02374267578125, + "completion_length": 2935.0000610351562, + "epoch": 0.21314285714285713, + "grad_norm": 0.26110002398490906, + "kl": 0.00124359130859375, "learning_rate": 2.655868138008171e-07, - "loss": 0.2574, - "num_tokens": 37279418.0, - "reward": 0.6485820934176445, - "reward_std": 0.3174329958856106, - "rewards/cosine_scaled_reward": -0.11320894956588745, - "rewards/format_reward": 0.875, + "loss": -0.0101, + "num_tokens": 48061966.0, + "reward": 0.28749898076057434, + "reward_std": 0.6488082036376, + "rewards/cosine_scaled_reward": 0.14374948665499687, "step": 373 }, { "clip_ratio": 0.0, - "completion_length": 1156.7708587646484, - "epoch": 0.42742857142857144, - "grad_norm": 0.41904541850090027, - "kl": 0.0294189453125, + "completion_length": 3053.4375610351562, + "epoch": 0.21371428571428572, + "grad_norm": 0.22372771799564362, + "kl": 0.002582550048828125, "learning_rate": 2.631592046130896e-07, - "loss": 0.288, - "num_tokens": 37342689.0, - "reward": 0.7976613417267799, - "reward_std": 0.6100458800792694, - "rewards/cosine_scaled_reward": -0.049086001701653004, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.1397, + "num_tokens": 48215599.0, + "reward": -0.5365136712789536, + "reward_std": 0.4280487932264805, + "rewards/cosine_scaled_reward": -0.2682568356394768, "step": 374 }, { "clip_ratio": 0.0, - "completion_length": 827.9791870117188, - "epoch": 0.42857142857142855, - "grad_norm": 0.483198344707489, - "kl": 0.030426025390625, + "completion_length": 1970.3333358764648, + "epoch": 0.21428571428571427, + "grad_norm": 0.4923068583011627, + "kl": 0.022411346435546875, "learning_rate": 2.6074557564105724e-07, - "loss": 0.1944, - "num_tokens": 37390370.0, - "reward": 0.974768877029419, - "reward_std": 0.5094475597143173, - "rewards/cosine_scaled_reward": 0.008217750757467002, - "rewards/format_reward": 0.9583333283662796, + "loss": -0.1989, + "num_tokens": 48315947.0, + "reward": 0.056493550539016724, + "reward_std": 0.5003200061619282, + "rewards/cosine_scaled_reward": 0.028246776200830936, "step": 375 }, { "clip_ratio": 0.0, - "completion_length": 1670.7916870117188, - "epoch": 0.4297142857142857, - "grad_norm": 0.3357181251049042, - "kl": 0.0157012939453125, + "completion_length": 2141.437515258789, + "epoch": 0.21485714285714286, + "grad_norm": 0.2769676744937897, + "kl": 0.0077648162841796875, "learning_rate": 2.583460445215911e-07, - "loss": 0.2978, - "num_tokens": 37478224.0, - "reward": 0.8598220273852348, - "reward_std": 0.8302092999219894, - "rewards/cosine_scaled_reward": 0.04449433600530028, - "rewards/format_reward": 0.7708333283662796, + "loss": -0.1635, + "num_tokens": 48423980.0, + "reward": 0.3381408303976059, + "reward_std": 0.4275904409587383, + "rewards/cosine_scaled_reward": 0.16907040774822235, "step": 376 }, { "clip_ratio": 0.0, - "completion_length": 884.0000152587891, - "epoch": 0.4308571428571429, - "grad_norm": 0.43593403697013855, - "kl": 0.031005859375, + "completion_length": 2033.104248046875, + "epoch": 0.21542857142857144, + "grad_norm": 0.3097473680973053, + "kl": 0.0059566497802734375, "learning_rate": 2.5596072820445254e-07, - "loss": 0.1621, - "num_tokens": 37528678.0, - "reward": 0.9919871091842651, - "reward_std": 0.5908376425504684, - "rewards/cosine_scaled_reward": 0.006410190369933844, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.2215, + "num_tokens": 48527305.0, + "reward": -0.06256039813160896, + "reward_std": 0.6639396920800209, + "rewards/cosine_scaled_reward": -0.03128020092844963, "step": 377 }, { "clip_ratio": 0.0, - "completion_length": 1222.2708435058594, - "epoch": 0.432, - "grad_norm": 0.41493770480155945, - "kl": 0.025177001953125, + "completion_length": 2935.1875, + "epoch": 0.216, + "grad_norm": 0.26714199781417847, + "kl": 0.0024471282958984375, "learning_rate": 2.5358974294659373e-07, - "loss": 0.3092, - "num_tokens": 37595327.0, - "reward": 0.6667770892381668, - "reward_std": 0.628834679722786, - "rewards/cosine_scaled_reward": -0.11452813632786274, - "rewards/format_reward": 0.8958333432674408, + "loss": -0.0197, + "num_tokens": 48674110.0, + "reward": -0.44828396290540695, + "reward_std": 0.32541000843048096, + "rewards/cosine_scaled_reward": -0.22414197400212288, "step": 378 }, { "clip_ratio": 0.0, - "completion_length": 1521.6459045410156, - "epoch": 0.43314285714285716, - "grad_norm": 0.48460623621940613, - "kl": 0.027587890625, + "completion_length": 2753.9792098999023, + "epoch": 0.21657142857142858, + "grad_norm": 0.46519750356674194, + "kl": 0.012783050537109375, "learning_rate": 2.512332043064913e-07, - "loss": 0.2818, - "num_tokens": 37676838.0, - "reward": 0.8283536843955517, - "reward_std": 0.7019159197807312, - "rewards/cosine_scaled_reward": 0.039176818914711475, - "rewards/format_reward": 0.75, + "loss": -0.0604, + "num_tokens": 48811893.0, + "reward": -0.3965173475444317, + "reward_std": 0.294017824344337, + "rewards/cosine_scaled_reward": -0.19825865840539336, "step": 379 }, { "clip_ratio": 0.0, - "completion_length": 1020.9375305175781, - "epoch": 0.4342857142857143, - "grad_norm": 0.5315077900886536, - "kl": 0.0264892578125, + "completion_length": 2460.7708587646484, + "epoch": 0.21714285714285714, + "grad_norm": 0.38828662037849426, + "kl": 0.015163421630859375, "learning_rate": 2.488912271385139e-07, - "loss": 0.3609, - "num_tokens": 37734285.0, - "reward": 0.8689068183302879, - "reward_std": 0.6152400150895119, - "rewards/cosine_scaled_reward": -0.0342966066673398, - "rewards/format_reward": 0.9375000149011612, + "loss": 0.0109, + "num_tokens": 48935722.0, + "reward": 0.1380084827542305, + "reward_std": 0.838990144431591, + "rewards/cosine_scaled_reward": 0.06900423765182495, "step": 380 }, { "clip_ratio": 0.0, - "completion_length": 1150.5000305175781, - "epoch": 0.43542857142857144, - "grad_norm": 0.4323570728302002, - "kl": 0.01971435546875, + "completion_length": 3124.3333740234375, + "epoch": 0.21771428571428572, + "grad_norm": 0.2345580905675888, + "kl": 0.0048999786376953125, "learning_rate": 2.465639255873246e-07, - "loss": 0.177, - "num_tokens": 37797207.0, - "reward": 1.0002194195985794, - "reward_std": 0.7640446051955223, - "rewards/cosine_scaled_reward": 0.020943036302924156, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0489, + "num_tokens": 49091774.0, + "reward": -0.5142972506582737, + "reward_std": 0.43132054805755615, + "rewards/cosine_scaled_reward": -0.2571486262604594, "step": 381 }, { "clip_ratio": 0.0, - "completion_length": 1450.6041870117188, - "epoch": 0.43657142857142855, - "grad_norm": 0.39461439847946167, - "kl": 0.023468017578125, + "completion_length": 2855.9375610351562, + "epoch": 0.21828571428571428, + "grad_norm": 0.2583065629005432, + "kl": 0.0037899017333984375, "learning_rate": 2.4425141308231765e-07, - "loss": 0.2971, - "num_tokens": 37874744.0, - "reward": 0.7021645717322826, - "reward_std": 0.7685395777225494, - "rewards/cosine_scaled_reward": -0.04475104878656566, - "rewards/format_reward": 0.7916666716337204, + "loss": -0.0191, + "num_tokens": 49234511.0, + "reward": -0.32289815321564674, + "reward_std": 0.5398626290261745, + "rewards/cosine_scaled_reward": -0.16144907008856535, "step": 382 }, { "clip_ratio": 0.0, - "completion_length": 2027.2083740234375, - "epoch": 0.4377142857142857, - "grad_norm": 0.34751343727111816, - "kl": 0.0179443359375, + "completion_length": 2359.1041870117188, + "epoch": 0.21885714285714286, + "grad_norm": 0.2668103873729706, + "kl": 0.003749847412109375, "learning_rate": 2.4195380233209006e-07, - "loss": 0.1176, - "num_tokens": 37979940.0, - "reward": 0.5363237643614411, - "reward_std": 0.3470821715891361, - "rewards/cosine_scaled_reward": -0.03392144478857517, - "rewards/format_reward": 0.6041666567325592, + "loss": -0.0061, + "num_tokens": 49353196.0, + "reward": 0.04249673895537853, + "reward_std": 0.5773097351193428, + "rewards/cosine_scaled_reward": 0.02124837739393115, "step": 383 }, { "clip_ratio": 0.0, - "completion_length": 1126.5208587646484, - "epoch": 0.43885714285714283, - "grad_norm": 0.4206862151622772, - "kl": 0.0215301513671875, + "completion_length": 3301.041748046875, + "epoch": 0.21942857142857142, + "grad_norm": 0.21543462574481964, + "kl": 0.0013751983642578125, "learning_rate": 2.3967120531894857e-07, - "loss": 0.123, - "num_tokens": 38042449.0, - "reward": 0.7954209297895432, - "reward_std": 0.4029863178730011, - "rewards/cosine_scaled_reward": -0.10228953487239778, - "rewards/format_reward": 1.0, + "loss": -0.0099, + "num_tokens": 49517442.0, + "reward": -0.13908865815028548, + "reward_std": 0.3829270862042904, + "rewards/cosine_scaled_reward": -0.06954432907514274, "step": 384 }, { "clip_ratio": 0.0, - "completion_length": 1238.9584045410156, - "epoch": 0.44, - "grad_norm": 0.3564131259918213, - "kl": 0.022125244140625, + "completion_length": 1987.6667098999023, + "epoch": 0.22, + "grad_norm": 0.48938560485839844, + "kl": 0.023218154907226562, "learning_rate": 2.374037332934512e-07, - "loss": 0.0834, - "num_tokens": 38109791.0, - "reward": 1.001874104142189, - "reward_std": 0.6318590641021729, - "rewards/cosine_scaled_reward": 0.042603690177202225, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0919, + "num_tokens": 49618466.0, + "reward": 0.08302738517522812, + "reward_std": 0.382623303681612, + "rewards/cosine_scaled_reward": 0.041513677686452866, "step": 385 }, { "clip_ratio": 0.0, - "completion_length": 1390.8125610351562, - "epoch": 0.44114285714285717, - "grad_norm": 0.3943594992160797, - "kl": 0.0227203369140625, + "completion_length": 2939.6458740234375, + "epoch": 0.22057142857142858, + "grad_norm": 0.28820157051086426, + "kl": 0.0025806427001953125, "learning_rate": 2.3515149676898552e-07, - "loss": 0.2978, - "num_tokens": 38184644.0, - "reward": 0.8800090253353119, - "reward_std": 0.5464823693037033, - "rewards/cosine_scaled_reward": 0.01292116753757, - "rewards/format_reward": 0.8541666716337204, + "loss": -0.034, + "num_tokens": 49765545.0, + "reward": -0.2633480429649353, + "reward_std": 0.5663026869297028, + "rewards/cosine_scaled_reward": -0.13167402520775795, "step": 386 }, { "clip_ratio": 0.0, - "completion_length": 1202.3333740234375, - "epoch": 0.4422857142857143, - "grad_norm": 0.4116460978984833, - "kl": 0.02825927734375, + "completion_length": 1900.6250228881836, + "epoch": 0.22114285714285714, + "grad_norm": 0.6309099197387695, + "kl": 0.03489875793457031, "learning_rate": 2.3291460551638237e-07, - "loss": 0.217, - "num_tokens": 38250456.0, - "reward": 0.7325910478830338, - "reward_std": 0.3488932326436043, - "rewards/cosine_scaled_reward": -0.10245448537170887, - "rewards/format_reward": 0.9375, + "loss": 0.0862, + "num_tokens": 49862631.0, + "reward": 0.8468033410608768, + "reward_std": 0.44973103795200586, + "rewards/cosine_scaled_reward": 0.4234016705304384, "step": 387 }, { "clip_ratio": 0.0, - "completion_length": 1391.6667175292969, - "epoch": 0.44342857142857145, - "grad_norm": 0.4180901050567627, - "kl": 0.0212860107421875, + "completion_length": 1040.0833587646484, + "epoch": 0.22171428571428572, + "grad_norm": 0.49885639548301697, + "kl": 0.030452728271484375, "learning_rate": 2.306931685585657e-07, - "loss": 0.1904, - "num_tokens": 38325338.0, - "reward": 1.0158484429121017, - "reward_std": 0.7112128995358944, - "rewards/cosine_scaled_reward": 0.04959086421877146, - "rewards/format_reward": 0.9166666716337204, + "loss": -0.0405, + "num_tokens": 49917811.0, + "reward": 0.8224863847717643, + "reward_std": 0.9109100252389908, + "rewards/cosine_scaled_reward": 0.41124319238588214, "step": 388 }, { "clip_ratio": 0.0, - "completion_length": 1247.3959045410156, - "epoch": 0.44457142857142856, - "grad_norm": 0.3733654320240021, - "kl": 0.02099609375, + "completion_length": 1138.8541717529297, + "epoch": 0.22228571428571428, + "grad_norm": 0.5563756227493286, + "kl": 0.0333251953125, "learning_rate": 2.2848729416523859e-07, - "loss": 0.1651, - "num_tokens": 38393073.0, - "reward": 1.2150611281394958, - "reward_std": 0.7659250199794769, - "rewards/cosine_scaled_reward": 0.13878049701452255, - "rewards/format_reward": 0.9375, + "loss": -0.022, + "num_tokens": 49977864.0, + "reward": 0.6305245533585548, + "reward_std": 0.6203858032822609, + "rewards/cosine_scaled_reward": 0.3152622692286968, "step": 389 }, { "clip_ratio": 0.0, - "completion_length": 1146.5833435058594, - "epoch": 0.44571428571428573, - "grad_norm": 0.6047143936157227, - "kl": 0.03582763671875, + "completion_length": 2377.1666870117188, + "epoch": 0.22285714285714286, + "grad_norm": 0.2759566605091095, + "kl": 0.0078125, "learning_rate": 2.2629708984760706e-07, - "loss": 0.3609, - "num_tokens": 38456011.0, - "reward": 0.6786292586475611, - "reward_std": 0.6744736880064011, - "rewards/cosine_scaled_reward": -0.06693539395928383, - "rewards/format_reward": 0.8125000074505806, + "loss": 0.1241, + "num_tokens": 50098244.0, + "reward": -0.42858413606882095, + "reward_std": 0.5400056019425392, + "rewards/cosine_scaled_reward": -0.21429206058382988, "step": 390 }, { "clip_ratio": 0.0, - "completion_length": 942.6667022705078, - "epoch": 0.44685714285714284, - "grad_norm": 0.46833914518356323, - "kl": 0.031005859375, + "completion_length": 2341.0833587646484, + "epoch": 0.22342857142857142, + "grad_norm": 0.4552629888057709, + "kl": 0.01029205322265625, "learning_rate": 2.2412266235313973e-07, - "loss": 0.2084, - "num_tokens": 38508975.0, - "reward": 0.8765498697757721, - "reward_std": 0.6001102030277252, - "rewards/cosine_scaled_reward": -0.040891751646995544, - "rewards/format_reward": 0.9583333432674408, + "loss": -0.0161, + "num_tokens": 50215944.0, + "reward": -0.3929029032588005, + "reward_std": 0.24793048202991486, + "rewards/cosine_scaled_reward": -0.19645144790410995, "step": 391 }, { "clip_ratio": 0.0, - "completion_length": 1619.6041870117188, - "epoch": 0.448, - "grad_norm": 0.3091895878314972, - "kl": 0.018768310546875, + "completion_length": 3172.9583435058594, + "epoch": 0.224, + "grad_norm": 0.22934892773628235, + "kl": 0.0015659332275390625, "learning_rate": 2.2196411766036487e-07, - "loss": 0.0724, - "num_tokens": 38594150.0, - "reward": 1.0864002853631973, - "reward_std": 0.7392374388873577, - "rewards/cosine_scaled_reward": 0.14736680313944817, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0007, + "num_tokens": 50374774.0, + "reward": -0.2935205027461052, + "reward_std": 0.3377592619508505, + "rewards/cosine_scaled_reward": -0.146760243922472, "step": 392 }, { "clip_ratio": 0.0, - "completion_length": 1556.4583740234375, - "epoch": 0.4491428571428571, - "grad_norm": 0.35970357060432434, - "kl": 0.021942138671875, + "completion_length": 2505.979217529297, + "epoch": 0.22457142857142856, + "grad_norm": 0.2346705198287964, + "kl": 0.004924774169921875, "learning_rate": 2.1982156097370557e-07, - "loss": 0.2975, - "num_tokens": 38677536.0, - "reward": 0.7836254239082336, - "reward_std": 0.6550324112176895, - "rewards/cosine_scaled_reward": -0.004020635038614273, - "rewards/format_reward": 0.7916666567325592, + "loss": 0.0914, + "num_tokens": 50501181.0, + "reward": -0.4504806846380234, + "reward_std": 0.5376848392188549, + "rewards/cosine_scaled_reward": -0.2252403423190117, "step": 393 }, { "clip_ratio": 0.0, - "completion_length": 1020.1042022705078, - "epoch": 0.4502857142857143, - "grad_norm": 0.41625267267227173, - "kl": 0.03192138671875, + "completion_length": 1801.2708358764648, + "epoch": 0.22514285714285714, + "grad_norm": 0.4237222373485565, + "kl": 0.02034759521484375, "learning_rate": 2.1769509671835223e-07, - "loss": 0.2402, - "num_tokens": 38735135.0, - "reward": 0.8095816224813461, - "reward_std": 0.5363309979438782, - "rewards/cosine_scaled_reward": -0.06395919248461723, - "rewards/format_reward": 0.9375, + "loss": -0.1076, + "num_tokens": 50593294.0, + "reward": 0.2521429820917547, + "reward_std": 0.720090851187706, + "rewards/cosine_scaled_reward": 0.12607147614471614, "step": 394 }, { "clip_ratio": 0.0, - "completion_length": 1274.0625610351562, - "epoch": 0.4514285714285714, - "grad_norm": 0.3793922960758209, - "kl": 0.026214599609375, + "completion_length": 3169.479248046875, + "epoch": 0.2257142857142857, + "grad_norm": 0.20394039154052734, + "kl": 0.00328826904296875, "learning_rate": 2.1558482853517253e-07, - "loss": 0.2779, - "num_tokens": 38805098.0, - "reward": 0.853252187371254, - "reward_std": 0.5334652774035931, - "rewards/cosine_scaled_reward": 0.00995943695306778, - "rewards/format_reward": 0.8333333283662796, + "loss": 0.1086, + "num_tokens": 50751405.0, + "reward": -0.14242761582136154, + "reward_std": 0.6643783301115036, + "rewards/cosine_scaled_reward": -0.07121380046010017, "step": 395 }, { "clip_ratio": 0.0, - "completion_length": 740.1875152587891, - "epoch": 0.45257142857142857, - "grad_norm": 0.45222580432891846, - "kl": 0.034210205078125, + "completion_length": 1952.1875915527344, + "epoch": 0.22628571428571428, + "grad_norm": 0.313944548368454, + "kl": 0.006565093994140625, "learning_rate": 2.134908592756607e-07, - "loss": 0.2259, - "num_tokens": 38848211.0, - "reward": 1.0253641307353973, - "reward_std": 0.33863143250346184, - "rewards/cosine_scaled_reward": 0.03351536951959133, - "rewards/format_reward": 0.9583333283662796, + "loss": 0.185, + "num_tokens": 50851554.0, + "reward": -0.08985854685306549, + "reward_std": 0.3944113999605179, + "rewards/cosine_scaled_reward": -0.04492926225066185, "step": 396 }, { "clip_ratio": 0.0, - "completion_length": 880.2500305175781, - "epoch": 0.45371428571428574, - "grad_norm": 0.4955395758152008, - "kl": 0.03704833984375, + "completion_length": 2726.75, + "epoch": 0.22685714285714287, + "grad_norm": 0.31909239292144775, + "kl": 0.003192901611328125, "learning_rate": 2.1141329099692406e-07, - "loss": 0.3078, - "num_tokens": 38898983.0, - "reward": 1.428503692150116, - "reward_std": 0.5125311873853207, - "rewards/cosine_scaled_reward": 0.2350851409137249, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0178, + "num_tokens": 50988486.0, + "reward": 0.4016297087073326, + "reward_std": 0.4423007359728217, + "rewards/cosine_scaled_reward": 0.20081482455134392, "step": 397 }, { "clip_ratio": 0.0, - "completion_length": 1339.7500305175781, - "epoch": 0.45485714285714285, - "grad_norm": 0.39652198553085327, - "kl": 0.020843505859375, + "completion_length": 2519.9584045410156, + "epoch": 0.22742857142857142, + "grad_norm": 0.3040643334388733, + "kl": 0.0029087066650390625, "learning_rate": 2.0935222495670968e-07, - "loss": 0.257, - "num_tokens": 38971691.0, - "reward": 1.0975730419158936, - "reward_std": 0.5926219597458839, - "rewards/cosine_scaled_reward": 0.13211984746158123, - "rewards/format_reward": 0.8333333283662796, + "loss": 0.0979, + "num_tokens": 51115648.0, + "reward": 0.3028845489025116, + "reward_std": 0.7478218302130699, + "rewards/cosine_scaled_reward": 0.15144225861877203, "step": 398 }, { "clip_ratio": 0.0, - "completion_length": 1455.916748046875, - "epoch": 0.456, - "grad_norm": 0.32825639843940735, - "kl": 0.023895263671875, + "completion_length": 2820.8125, + "epoch": 0.228, + "grad_norm": 0.38658609986305237, + "kl": 0.008646011352539062, "learning_rate": 2.0730776160846853e-07, - "loss": 0.43, - "num_tokens": 39049621.0, - "reward": 0.7255288064479828, - "reward_std": 0.5127102136611938, - "rewards/cosine_scaled_reward": -0.0434856116771698, - "rewards/format_reward": 0.8125, + "loss": 0.0512, + "num_tokens": 51257047.0, + "reward": -0.2828827500343323, + "reward_std": 0.5114560127258301, + "rewards/cosine_scaled_reward": -0.14144137874245644, "step": 399 }, { "clip_ratio": 0.0, - "completion_length": 904.7291717529297, - "epoch": 0.45714285714285713, - "grad_norm": 0.47382229566574097, - "kl": 0.0305938720703125, + "completion_length": 805.1458549499512, + "epoch": 0.22857142857142856, + "grad_norm": 0.6325322985649109, + "kl": 0.02809906005859375, "learning_rate": 2.0528000059645995e-07, - "loss": 0.3168, - "num_tokens": 39101004.0, - "reward": 0.9123225212097168, - "reward_std": 0.324755534529686, - "rewards/cosine_scaled_reward": -0.023005416616797447, - "rewards/format_reward": 0.9583333283662796, + "loss": 0.0474, + "num_tokens": 51301778.0, + "reward": 1.05329729616642, + "reward_std": 0.6592238266021013, + "rewards/cosine_scaled_reward": 0.52664864808321, "step": 400 }, { "clip_ratio": 0.0, - "completion_length": 670.7500152587891, - "epoch": 0.4582857142857143, - "grad_norm": 0.5006996989250183, - "kl": 0.028167724609375, + "completion_length": 2285.5209045410156, + "epoch": 0.22914285714285715, + "grad_norm": 0.2752256691455841, + "kl": 0.0121612548828125, "learning_rate": 2.032690407508949e-07, - "loss": 0.1668, - "num_tokens": 39141300.0, - "reward": 1.1984441727399826, - "reward_std": 0.43713772110641, - "rewards/cosine_scaled_reward": 0.09922206029295921, - "rewards/format_reward": 1.0, + "loss": 0.1225, + "num_tokens": 51417255.0, + "reward": -0.1001143604516983, + "reward_std": 0.5159653499722481, + "rewards/cosine_scaled_reward": -0.05005717650055885, "step": 401 }, { "clip_ratio": 0.0, - "completion_length": 1485.2500610351562, - "epoch": 0.4594285714285714, - "grad_norm": 2.8007476329803467, - "kl": 0.042938232421875, + "completion_length": 1967.4375457763672, + "epoch": 0.2297142857142857, + "grad_norm": 0.5139122009277344, + "kl": 0.01324462890625, "learning_rate": 2.0127498008311922e-07, - "loss": 0.254, - "num_tokens": 39221130.0, - "reward": 0.5768831968307495, - "reward_std": 0.5035651251673698, - "rewards/cosine_scaled_reward": -0.1594750825315714, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.0994, + "num_tokens": 51518244.0, + "reward": -0.23355354368686676, + "reward_std": 0.4070703499019146, + "rewards/cosine_scaled_reward": -0.11677676998078823, "step": 402 }, { "clip_ratio": 0.0, - "completion_length": 1249.3958587646484, - "epoch": 0.4605714285714286, - "grad_norm": 0.3587521016597748, - "kl": 0.02703857421875, + "completion_length": 1201.2916793823242, + "epoch": 0.2302857142857143, + "grad_norm": 0.6655187010765076, + "kl": 0.034503936767578125, "learning_rate": 1.9929791578083655e-07, - "loss": 0.1084, - "num_tokens": 39288733.0, - "reward": 1.0168748944997787, - "reward_std": 0.7613801509141922, - "rewards/cosine_scaled_reward": 0.05010411172406748, - "rewards/format_reward": 0.9166666716337204, + "loss": -0.0482, + "num_tokens": 51581546.0, + "reward": -0.011656701564788818, + "reward_std": 0.6958346888422966, + "rewards/cosine_scaled_reward": -0.005828343331813812, "step": 403 }, { "clip_ratio": 0.0, - "completion_length": 893.0000305175781, - "epoch": 0.4617142857142857, - "grad_norm": 0.40811389684677124, - "kl": 0.0299072265625, + "completion_length": 2547.125030517578, + "epoch": 0.23085714285714284, + "grad_norm": 0.32773658633232117, + "kl": 0.00426483154296875, "learning_rate": 1.9733794420337213e-07, - "loss": 0.2457, - "num_tokens": 39339589.0, - "reward": 0.9994086623191833, - "reward_std": 0.523450993001461, - "rewards/cosine_scaled_reward": 0.030954306945204735, - "rewards/format_reward": 0.9375, + "loss": 0.1827, + "num_tokens": 51710132.0, + "reward": -0.30341653153300285, + "reward_std": 0.6679345816373825, + "rewards/cosine_scaled_reward": -0.15170826390385628, "step": 404 }, { "clip_ratio": 0.0, - "completion_length": 1085.8750457763672, - "epoch": 0.46285714285714286, - "grad_norm": 0.3914470076560974, - "kl": 0.029327392578125, + "completion_length": 2246.2917098999023, + "epoch": 0.23142857142857143, + "grad_norm": 0.4948166608810425, + "kl": 0.020887374877929688, "learning_rate": 1.9539516087697517e-07, - "loss": 0.2453, - "num_tokens": 39400087.0, - "reward": 1.1680223643779755, - "reward_std": 0.6629828922450542, - "rewards/cosine_scaled_reward": 0.10484451148658991, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.2173, + "num_tokens": 51823558.0, + "reward": 0.32773495465517044, + "reward_std": 0.43168997671455145, + "rewards/cosine_scaled_reward": 0.16386747732758522, "step": 405 }, { "clip_ratio": 0.0, - "completion_length": 1413.1458740234375, - "epoch": 0.464, - "grad_norm": 0.4474855363368988, - "kl": 0.0238037109375, + "completion_length": 2673.000030517578, + "epoch": 0.232, + "grad_norm": 0.4358680546283722, + "kl": 0.006679534912109375, "learning_rate": 1.934696604901642e-07, - "loss": 0.3965, - "num_tokens": 39476744.0, - "reward": 0.8232813104987144, - "reward_std": 0.5474796146154404, - "rewards/cosine_scaled_reward": 0.015807300806045532, - "rewards/format_reward": 0.7916666865348816, + "loss": 0.116, + "num_tokens": 51958126.0, + "reward": -0.44332827627658844, + "reward_std": 0.464891217648983, + "rewards/cosine_scaled_reward": -0.22166412882506847, "step": 406 }, { "clip_ratio": 0.0, - "completion_length": 973.9791870117188, - "epoch": 0.46514285714285714, - "grad_norm": 0.5875135660171509, - "kl": 0.03564453125, + "completion_length": 1675.5000228881836, + "epoch": 0.23257142857142857, + "grad_norm": 0.5162947773933411, + "kl": 0.015869140625, "learning_rate": 1.915615368891117e-07, - "loss": 0.464, - "num_tokens": 39531097.0, - "reward": 1.0968922302126884, - "reward_std": 0.5210925191640854, - "rewards/cosine_scaled_reward": 0.0901127781253308, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.1665, + "num_tokens": 52043758.0, + "reward": 0.42337319999933243, + "reward_std": 0.5919698104262352, + "rewards/cosine_scaled_reward": 0.21168660186231136, "step": 407 }, { "clip_ratio": 0.0, - "completion_length": 1042.020851135254, - "epoch": 0.4662857142857143, - "grad_norm": 0.5569504499435425, - "kl": 0.036956787109375, + "completion_length": 1417.083366394043, + "epoch": 0.23314285714285715, + "grad_norm": 0.585673451423645, + "kl": 0.019334793090820312, "learning_rate": 1.8967088307307e-07, - "loss": 0.1739, - "num_tokens": 39588962.0, - "reward": 1.0446859672665596, - "reward_std": 0.477496899664402, - "rewards/cosine_scaled_reward": 0.06400964129716158, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.1973, + "num_tokens": 52117886.0, + "reward": 0.12191886268556118, + "reward_std": 0.744186770170927, + "rewards/cosine_scaled_reward": 0.06095943506807089, "step": 408 }, { "clip_ratio": 0.0, - "completion_length": 853.7083740234375, - "epoch": 0.4674285714285714, - "grad_norm": 0.43134912848472595, - "kl": 0.0301513671875, + "completion_length": 2180.645866394043, + "epoch": 0.2337142857142857, + "grad_norm": 0.3679039776325226, + "kl": 0.011453628540039062, "learning_rate": 1.8779779118983867e-07, - "loss": 0.0768, - "num_tokens": 39638154.0, - "reward": 1.1461460888385773, - "reward_std": 0.7252542972564697, - "rewards/cosine_scaled_reward": 0.07307301368564367, - "rewards/format_reward": 1.0, + "loss": 0.0241, + "num_tokens": 52228245.0, + "reward": 0.07945476472377777, + "reward_std": 0.7460962496697903, + "rewards/cosine_scaled_reward": 0.039727382361888885, "step": 409 }, { "clip_ratio": 0.0, - "completion_length": 1219.4583740234375, - "epoch": 0.4685714285714286, - "grad_norm": 0.42697203159332275, - "kl": 0.0308074951171875, + "completion_length": 3364.2291870117188, + "epoch": 0.2342857142857143, + "grad_norm": 0.1959182769060135, + "kl": 0.0014972686767578125, "learning_rate": 1.8594235253127372e-07, - "loss": 0.2314, - "num_tokens": 39704914.0, - "reward": 0.8525972217321396, - "reward_std": 0.7250062227249146, - "rewards/cosine_scaled_reward": -0.011201405432075262, - "rewards/format_reward": 0.875, + "loss": 0.0497, + "num_tokens": 52396376.0, + "reward": -0.09703963249921799, + "reward_std": 0.6702800244092941, + "rewards/cosine_scaled_reward": -0.048519810661673546, "step": 410 }, { "clip_ratio": 0.0, - "completion_length": 1561.7083587646484, - "epoch": 0.4697142857142857, - "grad_norm": 0.35902073979377747, - "kl": 0.027740478515625, + "completion_length": 2733.8333740234375, + "epoch": 0.23485714285714285, + "grad_norm": 0.4071482717990875, + "kl": 0.0033740997314453125, "learning_rate": 1.8410465752883758e-07, - "loss": 0.3147, - "num_tokens": 39788684.0, - "reward": 0.42419329285621643, - "reward_std": 0.4189872369170189, - "rewards/cosine_scaled_reward": -0.2045700242742896, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0787, + "num_tokens": 52533924.0, + "reward": -0.032442666590213776, + "reward_std": 0.6601308509707451, + "rewards/cosine_scaled_reward": -0.01622132584452629, "step": 411 }, { "clip_ratio": 0.0, - "completion_length": 869.9583435058594, - "epoch": 0.47085714285714286, - "grad_norm": 0.5099910497665405, - "kl": 0.03778076171875, + "completion_length": 3497.3958740234375, + "epoch": 0.23542857142857143, + "grad_norm": 0.20792892575263977, + "kl": 0.0020923614501953125, "learning_rate": 1.822847957491922e-07, - "loss": 0.2973, - "num_tokens": 39838452.0, - "reward": 0.9078437611460686, - "reward_std": 0.4090404435992241, - "rewards/cosine_scaled_reward": -0.025244787335395813, - "rewards/format_reward": 0.9583333283662796, + "loss": 0.0532, + "num_tokens": 52708939.0, + "reward": -0.6876078844070435, + "reward_std": 0.24106604978442192, + "rewards/cosine_scaled_reward": -0.34380391985177994, "step": 412 }, { "clip_ratio": 0.0, - "completion_length": 1232.2708740234375, - "epoch": 0.472, - "grad_norm": 0.4471232295036316, - "kl": 0.02978515625, + "completion_length": 2646.354232788086, + "epoch": 0.236, + "grad_norm": 0.25247225165367126, + "kl": 0.010334014892578125, "learning_rate": 1.804828558898332e-07, - "loss": 0.3114, - "num_tokens": 39905653.0, - "reward": 0.5217236494645476, - "reward_std": 0.31653984263539314, - "rewards/cosine_scaled_reward": -0.1558048389852047, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.0482, + "num_tokens": 52842144.0, + "reward": 0.3487217575311661, + "reward_std": 0.8396928831934929, + "rewards/cosine_scaled_reward": 0.17436087876558304, "step": 413 }, { "clip_ratio": 0.0, - "completion_length": 1533.6666717529297, - "epoch": 0.47314285714285714, - "grad_norm": 0.4558218717575073, - "kl": 0.0302886962890625, + "completion_length": 2190.041732788086, + "epoch": 0.23657142857142857, + "grad_norm": 0.37403154373168945, + "kl": 0.017669677734375, "learning_rate": 1.7869892577476722e-07, - "loss": 0.2128, - "num_tokens": 39988185.0, - "reward": 1.2229383140802383, - "reward_std": 0.6020685248076916, - "rewards/cosine_scaled_reward": 0.19480247469618917, - "rewards/format_reward": 0.8333333283662796, + "loss": 0.0377, + "num_tokens": 52953050.0, + "reward": 0.02339554950594902, + "reward_std": 0.5981272980570793, + "rewards/cosine_scaled_reward": 0.011697770096361637, "step": 414 }, { "clip_ratio": 0.0, - "completion_length": 1351.0000305175781, - "epoch": 0.4742857142857143, - "grad_norm": 0.3420197665691376, - "kl": 0.025360107421875, + "completion_length": 2704.625045776367, + "epoch": 0.23714285714285716, + "grad_norm": 0.5488267540931702, + "kl": 0.01036834716796875, "learning_rate": 1.7693309235023127e-07, - "loss": 0.3434, - "num_tokens": 40061211.0, - "reward": 1.1249284744262695, - "reward_std": 0.8133822381496429, - "rewards/cosine_scaled_reward": 0.12496422789990902, - "rewards/format_reward": 0.875, + "loss": 0.0983, + "num_tokens": 53088536.0, + "reward": -0.4232526607811451, + "reward_std": 0.331368088722229, + "rewards/cosine_scaled_reward": -0.21162631921470165, "step": 415 }, { "clip_ratio": 0.0, - "completion_length": 1263.6458740234375, - "epoch": 0.4754285714285714, - "grad_norm": 0.4289003014564514, - "kl": 0.031707763671875, + "completion_length": 3500.1458740234375, + "epoch": 0.2377142857142857, + "grad_norm": 0.20574693381786346, + "kl": 0.0016450881958007812, "learning_rate": 1.7518544168045524e-07, - "loss": 0.09, - "num_tokens": 40130410.0, - "reward": 0.9559039436280727, - "reward_std": 0.6498378068208694, - "rewards/cosine_scaled_reward": 0.05086860992014408, - "rewards/format_reward": 0.8541666716337204, + "loss": -0.0222, + "num_tokens": 53262651.0, + "reward": -0.3312137499451637, + "reward_std": 0.22738385573029518, + "rewards/cosine_scaled_reward": -0.16560687310993671, "step": 416 }, { "clip_ratio": 0.0, - "completion_length": 1705.5833740234375, - "epoch": 0.4765714285714286, - "grad_norm": 0.4851248562335968, - "kl": 0.0277099609375, + "completion_length": 1754.87504196167, + "epoch": 0.2382857142857143, + "grad_norm": 0.785120964050293, + "kl": 0.019487380981445312, "learning_rate": 1.7345605894346726e-07, - "loss": 0.2165, - "num_tokens": 40220522.0, - "reward": 0.5403048545122147, - "reward_std": 0.5704892203211784, - "rewards/cosine_scaled_reward": -0.08401426486670971, - "rewards/format_reward": 0.7083333358168602, + "loss": 0.1738, + "num_tokens": 53353089.0, + "reward": 0.30004075169563293, + "reward_std": 0.3757967611309141, + "rewards/cosine_scaled_reward": 0.15002036094665527, "step": 417 }, { "clip_ratio": 0.0, - "completion_length": 1089.7292175292969, - "epoch": 0.4777142857142857, - "grad_norm": 0.5024583339691162, - "kl": 0.02630615234375, + "completion_length": 2226.2916870117188, + "epoch": 0.23885714285714285, + "grad_norm": 0.5053194165229797, + "kl": 0.01377105712890625, "learning_rate": 1.7174502842694212e-07, - "loss": 0.35, - "num_tokens": 40281043.0, - "reward": 0.5926680490374565, - "reward_std": 0.48843371123075485, - "rewards/cosine_scaled_reward": -0.1724159736186266, - "rewards/format_reward": 0.9375000149011612, + "loss": 0.1283, + "num_tokens": 53465675.0, + "reward": -0.448532085865736, + "reward_std": 0.38258010521531105, + "rewards/cosine_scaled_reward": -0.22426604200154543, "step": 418 }, { "clip_ratio": 0.0, - "completion_length": 1123.7083587646484, - "epoch": 0.47885714285714287, - "grad_norm": 0.4442269504070282, - "kl": 0.032562255859375, + "completion_length": 2314.875015258789, + "epoch": 0.23942857142857144, + "grad_norm": 0.3890438973903656, + "kl": 0.005889892578125, "learning_rate": 1.7005243352409333e-07, - "loss": 0.0806, - "num_tokens": 40343321.0, - "reward": 0.8577143996953964, - "reward_std": 0.5276221707463264, - "rewards/cosine_scaled_reward": -0.05030947690829635, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.018, + "num_tokens": 53582981.0, + "reward": -0.3278130046091974, + "reward_std": 0.30072507075965405, + "rewards/cosine_scaled_reward": -0.1639064911287278, "step": 419 }, { "clip_ratio": 0.0, - "completion_length": 1060.1042022705078, - "epoch": 0.48, - "grad_norm": 0.410487562417984, - "kl": 0.026458740234375, + "completion_length": 1690.1250228881836, + "epoch": 0.24, + "grad_norm": 0.5274333357810974, + "kl": 0.02419281005859375, "learning_rate": 1.6837835672960831e-07, - "loss": 0.161, - "num_tokens": 40401910.0, - "reward": 1.127629041671753, - "reward_std": 0.7603463605046272, - "rewards/cosine_scaled_reward": 0.09506450593471527, - "rewards/format_reward": 0.9375, + "loss": 0.0839, + "num_tokens": 53669915.0, + "reward": 0.08054505288600922, + "reward_std": 0.3769830437377095, + "rewards/cosine_scaled_reward": 0.040272533893585205, "step": 420 }, { "clip_ratio": 0.0, - "completion_length": 927.5625457763672, - "epoch": 0.48114285714285715, - "grad_norm": 0.4083124101161957, - "kl": 0.028717041015625, + "completion_length": 3035.104217529297, + "epoch": 0.24057142857142857, + "grad_norm": 0.25986582040786743, + "kl": 0.002590179443359375, "learning_rate": 1.6672287963562852e-07, - "loss": 0.4235, - "num_tokens": 40454173.0, - "reward": 1.2100372835993767, - "reward_std": 0.7276662588119507, - "rewards/cosine_scaled_reward": 0.13626863807439804, - "rewards/format_reward": 0.9375, + "loss": 0.0036, + "num_tokens": 53821504.0, + "reward": -0.3971610963344574, + "reward_std": 0.42149341851472855, + "rewards/cosine_scaled_reward": -0.1985805444419384, "step": 421 }, { "clip_ratio": 0.0, - "completion_length": 1145.6667175292969, - "epoch": 0.48228571428571426, - "grad_norm": 0.36734601855278015, - "kl": 0.03082275390625, + "completion_length": 3569.7916870117188, + "epoch": 0.24114285714285713, + "grad_norm": 0.18861441314220428, + "kl": 0.0012664794921875, "learning_rate": 1.6508608292777203e-07, - "loss": 0.3146, - "num_tokens": 40517349.0, - "reward": 0.745939314365387, - "reward_std": 0.4512132965028286, - "rewards/cosine_scaled_reward": -0.10619700700044632, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0098, + "num_tokens": 53999874.0, + "reward": -0.4841906800866127, + "reward_std": 0.3270874619483948, + "rewards/cosine_scaled_reward": -0.24209534004330635, "step": 422 }, { "clip_ratio": 0.0, - "completion_length": 1289.041732788086, - "epoch": 0.48342857142857143, - "grad_norm": 0.39704540371894836, - "kl": 0.02734375, + "completion_length": 2044.2083587646484, + "epoch": 0.24171428571428571, + "grad_norm": 0.3931296765804291, + "kl": 0.015514373779296875, "learning_rate": 1.6346804638120098e-07, - "loss": 0.2491, - "num_tokens": 40587065.0, - "reward": 0.7884100899100304, - "reward_std": 0.5758587270975113, - "rewards/cosine_scaled_reward": -0.03287830762565136, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.0304, + "num_tokens": 54103636.0, + "reward": 0.31378229707479477, + "reward_std": 0.7438326478004456, + "rewards/cosine_scaled_reward": 0.15689115412533283, "step": 423 }, { "clip_ratio": 0.0, - "completion_length": 1040.0625610351562, - "epoch": 0.4845714285714286, - "grad_norm": 0.39009180665016174, - "kl": 0.0362548828125, + "completion_length": 3155.916748046875, + "epoch": 0.2422857142857143, + "grad_norm": 0.23850037157535553, + "kl": 0.0032806396484375, "learning_rate": 1.6186884885673413e-07, - "loss": 0.3062, - "num_tokens": 40645184.0, - "reward": 0.9872114062309265, - "reward_std": 0.43921393156051636, - "rewards/cosine_scaled_reward": 0.014439025893807411, - "rewards/format_reward": 0.9583333283662796, + "loss": 0.0152, + "num_tokens": 54260976.0, + "reward": -0.1346125192940235, + "reward_std": 0.6563430987298489, + "rewards/cosine_scaled_reward": -0.06730626057833433, "step": 424 }, { "clip_ratio": 0.0, - "completion_length": 1162.2083740234375, - "epoch": 0.4857142857142857, - "grad_norm": 0.41968798637390137, - "kl": 0.031097412109375, + "completion_length": 3406.041748046875, + "epoch": 0.24285714285714285, + "grad_norm": 0.18366895616054535, + "kl": 0.001506805419921875, "learning_rate": 1.6028856829700258e-07, - "loss": 0.1913, - "num_tokens": 40709070.0, - "reward": 1.1210691630840302, - "reward_std": 0.6839531622827053, - "rewards/cosine_scaled_reward": 0.09178455546498299, - "rewards/format_reward": 0.9375, + "loss": 0.0637, + "num_tokens": 54430154.0, + "reward": -0.7047783136367798, + "reward_std": 0.22173772007226944, + "rewards/cosine_scaled_reward": -0.3523891530930996, "step": 425 }, { "clip_ratio": 0.0, - "completion_length": 842.9583740234375, - "epoch": 0.4868571428571429, - "grad_norm": 0.4921518862247467, - "kl": 0.032623291015625, + "completion_length": 2864.2083435058594, + "epoch": 0.24342857142857144, + "grad_norm": 0.23228085041046143, + "kl": 0.0048847198486328125, "learning_rate": 1.5872728172265146e-07, - "loss": 0.1351, - "num_tokens": 40757704.0, - "reward": 1.1493963152170181, - "reward_std": 0.5684140212833881, - "rewards/cosine_scaled_reward": 0.08511480502784252, - "rewards/format_reward": 0.9791666716337204, + "loss": -0.0251, + "num_tokens": 54574260.0, + "reward": -0.05515944957733154, + "reward_std": 0.19446351379156113, + "rewards/cosine_scaled_reward": -0.027579709887504578, "step": 426 }, { "clip_ratio": 0.0, - "completion_length": 1863.0417175292969, - "epoch": 0.488, - "grad_norm": 0.34825828671455383, - "kl": 0.016571044921875, + "completion_length": 3079.7709350585938, + "epoch": 0.244, + "grad_norm": 0.21749088168144226, + "kl": 0.0016574859619140625, "learning_rate": 1.5718506522858572e-07, - "loss": 0.262, - "num_tokens": 40854708.0, - "reward": 0.6934209913015366, - "reward_std": 0.6876000687479973, - "rewards/cosine_scaled_reward": -0.007456184830516577, - "rewards/format_reward": 0.708333320915699, + "loss": -0.035, + "num_tokens": 54728161.0, + "reward": -0.31441882718354464, + "reward_std": 0.4984753504395485, + "rewards/cosine_scaled_reward": -0.15720941359177232, "step": 427 }, { "clip_ratio": 0.0, - "completion_length": 1125.562515258789, - "epoch": 0.48914285714285716, - "grad_norm": 0.431072473526001, - "kl": 0.02935791015625, + "completion_length": 2344.812530517578, + "epoch": 0.24457142857142858, + "grad_norm": 0.26461076736450195, + "kl": 0.00640869140625, "learning_rate": 1.5566199398026147e-07, - "loss": 0.1989, - "num_tokens": 40917639.0, - "reward": 0.79083002358675, - "reward_std": 0.49118974804878235, - "rewards/cosine_scaled_reward": -0.06291831657290459, - "rewards/format_reward": 0.9166666716337204, + "loss": -0.0287, + "num_tokens": 54847000.0, + "reward": 0.19997850060462952, + "reward_std": 0.6211978904902935, + "rewards/cosine_scaled_reward": 0.09998924285173416, "step": 428 }, { "clip_ratio": 0.0, - "completion_length": 1233.250015258789, - "epoch": 0.49028571428571427, - "grad_norm": 0.5056308507919312, - "kl": 0.03472900390625, + "completion_length": 2461.812530517578, + "epoch": 0.24514285714285713, + "grad_norm": 0.3109036982059479, + "kl": 0.00499725341796875, "learning_rate": 1.5415814221002265e-07, - "loss": 0.0979, - "num_tokens": 40985193.0, - "reward": 1.0755756497383118, - "reward_std": 0.6792989065870643, - "rewards/cosine_scaled_reward": 0.12112115137279034, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.13, + "num_tokens": 54971407.0, + "reward": -0.4674905724823475, + "reward_std": 0.33745603263378143, + "rewards/cosine_scaled_reward": -0.2337452843785286, "step": 429 }, { "clip_ratio": 0.0, - "completion_length": 1888.4792175292969, - "epoch": 0.49142857142857144, - "grad_norm": 0.3414265513420105, - "kl": 0.018707275390625, + "completion_length": 3317.6459350585938, + "epoch": 0.24571428571428572, + "grad_norm": 0.21356281638145447, + "kl": 0.001575469970703125, "learning_rate": 1.5267358321348285e-07, - "loss": 0.2482, - "num_tokens": 41084198.0, - "reward": 0.5194306820631027, - "reward_std": 0.7147988900542259, - "rewards/cosine_scaled_reward": -0.11528466921299696, - "rewards/format_reward": 0.75, + "loss": 0.0489, + "num_tokens": 55136462.0, + "reward": 0.11995480954647064, + "reward_std": 0.8040067255496979, + "rewards/cosine_scaled_reward": 0.05997740477323532, "step": 430 }, { "clip_ratio": 0.0, - "completion_length": 1227.1042175292969, - "epoch": 0.49257142857142855, - "grad_norm": 0.45565667748451233, - "kl": 0.027862548828125, + "completion_length": 3584.0, + "epoch": 0.24628571428571427, + "grad_norm": 0.17928774654865265, + "kl": 0.0009527206420898438, "learning_rate": 1.5120838934595337e-07, - "loss": 0.2267, - "num_tokens": 41150899.0, - "reward": 1.093374602496624, - "reward_std": 0.5845904052257538, - "rewards/cosine_scaled_reward": 0.11960397660732269, - "rewards/format_reward": 0.8541666567325592, + "loss": 0.0, + "num_tokens": 55315274.0, + "reward": -0.5093374960124493, + "reward_std": 0.20916565880179405, + "rewards/cosine_scaled_reward": -0.25466873310506344, "step": 431 }, { "clip_ratio": 0.0, - "completion_length": 1604.812515258789, - "epoch": 0.4937142857142857, - "grad_norm": 0.35553795099258423, - "kl": 0.026458740234375, + "completion_length": 2002.8542098999023, + "epoch": 0.24685714285714286, + "grad_norm": 0.45603108406066895, + "kl": 0.013976097106933594, "learning_rate": 1.4976263201891613e-07, - "loss": 0.1478, - "num_tokens": 41235508.0, - "reward": 0.7511163279414177, - "reward_std": 0.7302351742982864, - "rewards/cosine_scaled_reward": -0.030691856518387794, - "rewards/format_reward": 0.8125, + "loss": -0.0071, + "num_tokens": 55416811.0, + "reward": 0.010444401763379574, + "reward_std": 0.6464076600968838, + "rewards/cosine_scaled_reward": 0.005222200881689787, "step": 432 }, { "clip_ratio": 0.0, - "completion_length": 1305.6041717529297, - "epoch": 0.4948571428571429, - "grad_norm": 0.36688801646232605, - "kl": 0.029998779296875, + "completion_length": 1662.4166870117188, + "epoch": 0.24742857142857144, + "grad_norm": 0.3496110737323761, + "kl": 0.015367507934570312, "learning_rate": 1.483363816965435e-07, - "loss": 0.3838, - "num_tokens": 41306295.0, - "reward": 0.8434520326554775, - "reward_std": 0.5114891212433577, - "rewards/cosine_scaled_reward": 0.01547599770128727, - "rewards/format_reward": 0.8125, + "loss": -0.0874, + "num_tokens": 55502091.0, + "reward": 0.2004665769636631, + "reward_std": 0.4371606968343258, + "rewards/cosine_scaled_reward": 0.10023329593241215, "step": 433 }, { "clip_ratio": 0.0, - "completion_length": 1180.0833740234375, - "epoch": 0.496, - "grad_norm": 0.585704505443573, - "kl": 0.0245361328125, + "completion_length": 3065.2500610351562, + "epoch": 0.248, + "grad_norm": 0.2298591136932373, + "kl": 0.0030975341796875, "learning_rate": 1.469297078922642e-07, - "loss": 0.2038, - "num_tokens": 41370631.0, - "reward": 1.3207928240299225, - "reward_std": 0.9274136871099472, - "rewards/cosine_scaled_reward": 0.2124797385185957, - "rewards/format_reward": 0.8958333432674408, + "loss": -0.0334, + "num_tokens": 55655283.0, + "reward": 0.04710858315229416, + "reward_std": 0.42717816680669785, + "rewards/cosine_scaled_reward": 0.02355429343879223, "step": 434 }, { "clip_ratio": 0.0, - "completion_length": 1271.5417175292969, - "epoch": 0.49714285714285716, - "grad_norm": 0.4473916292190552, - "kl": 0.030487060546875, + "completion_length": 2757.7291717529297, + "epoch": 0.24857142857142858, + "grad_norm": 0.4120491147041321, + "kl": 0.012348175048828125, "learning_rate": 1.4554267916537495e-07, - "loss": 0.3724, - "num_tokens": 41439423.0, - "reward": 0.6336492225527763, - "reward_std": 0.540140762925148, - "rewards/cosine_scaled_reward": -0.11025873199105263, - "rewards/format_reward": 0.8541666865348816, + "loss": -0.0036, + "num_tokens": 55794170.0, + "reward": 0.13828672468662262, + "reward_std": 0.4802464433014393, + "rewards/cosine_scaled_reward": 0.06914335861802101, "step": 435 }, { "clip_ratio": 0.0, - "completion_length": 1551.9792175292969, - "epoch": 0.4982857142857143, - "grad_norm": 0.3577360510826111, - "kl": 0.0240631103515625, + "completion_length": 2647.8125228881836, + "epoch": 0.24914285714285714, + "grad_norm": 0.40533170104026794, + "kl": 0.017702102661132812, "learning_rate": 1.4417536311769885e-07, - "loss": 0.3034, - "num_tokens": 41521970.0, - "reward": 0.7670895345509052, - "reward_std": 0.7583608031272888, - "rewards/cosine_scaled_reward": -0.012288582162000239, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.1134, + "num_tokens": 55927697.0, + "reward": 0.27192703634500504, + "reward_std": 0.8968725055456161, + "rewards/cosine_scaled_reward": 0.13596352562308311, "step": 436 }, { "clip_ratio": 0.0, - "completion_length": 1145.5833587646484, - "epoch": 0.49942857142857144, - "grad_norm": 0.41388896107673645, - "kl": 0.02911376953125, + "completion_length": 1862.0208778381348, + "epoch": 0.24971428571428572, + "grad_norm": 0.9058906435966492, + "kl": 0.04045867919921875, "learning_rate": 1.4282782639029128e-07, - "loss": 0.2609, - "num_tokens": 41584824.0, - "reward": 1.317168042063713, - "reward_std": 0.6604329124093056, - "rewards/cosine_scaled_reward": 0.21066732332110405, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.0546, + "num_tokens": 56022282.0, + "reward": 0.15178360044956207, + "reward_std": 0.6174656040966511, + "rewards/cosine_scaled_reward": 0.07589179277420044, "step": 437 }, { "clip_ratio": 0.0, - "completion_length": 888.7917175292969, - "epoch": 0.5005714285714286, - "grad_norm": 1.7249231338500977, - "kl": 0.05987548828125, + "completion_length": 2569.291717529297, + "epoch": 0.2502857142857143, + "grad_norm": 0.25029805302619934, + "kl": 0.007534027099609375, "learning_rate": 1.4150013466019114e-07, - "loss": 0.3252, - "num_tokens": 41635646.0, - "reward": 1.056243896484375, - "reward_std": 0.5137106534093618, - "rewards/cosine_scaled_reward": 0.04895526496693492, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0397, + "num_tokens": 56151440.0, + "reward": -0.22741758823394775, + "reward_std": 0.537294939160347, + "rewards/cosine_scaled_reward": -0.11370879039168358, "step": 438 }, { "clip_ratio": 0.0, - "completion_length": 1017.1458740234375, - "epoch": 0.5017142857142857, - "grad_norm": 0.5317440629005432, - "kl": 0.0291748046875, + "completion_length": 1111.958381652832, + "epoch": 0.25085714285714283, + "grad_norm": 0.45296990871429443, + "kl": 0.02793121337890625, "learning_rate": 1.4019235263722034e-07, - "loss": 0.2548, - "num_tokens": 41691807.0, - "reward": 0.9232819229364395, - "reward_std": 0.62980717420578, - "rewards/cosine_scaled_reward": 0.013724284246563911, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.0986, + "num_tokens": 56210862.0, + "reward": -0.31671690940856934, + "reward_std": 0.350432638078928, + "rewards/cosine_scaled_reward": -0.15835845470428467, "step": 439 }, { "clip_ratio": 0.0, - "completion_length": 1725.8542175292969, - "epoch": 0.5028571428571429, - "grad_norm": 0.3497353196144104, - "kl": 0.0239105224609375, + "completion_length": 1919.1458587646484, + "epoch": 0.25142857142857145, + "grad_norm": 0.4494737386703491, + "kl": 0.011789321899414062, "learning_rate": 1.3890454406082956e-07, - "loss": 0.1759, - "num_tokens": 41783018.0, - "reward": 0.6551067333202809, - "reward_std": 0.9818321466445923, - "rewards/cosine_scaled_reward": -0.06827997602522373, - "rewards/format_reward": 0.7916666716337204, + "loss": 0.0479, + "num_tokens": 56309089.0, + "reward": 0.27276327461004257, + "reward_std": 0.5138226337730885, + "rewards/cosine_scaled_reward": 0.1363816224038601, "step": 440 }, { "clip_ratio": 0.0, - "completion_length": 1217.1875457763672, - "epoch": 0.504, - "grad_norm": 0.35091596841812134, - "kl": 0.02471923828125, + "completion_length": 3518.625, + "epoch": 0.252, + "grad_norm": 0.19750487804412842, + "kl": 0.0019445419311523438, "learning_rate": 1.3763677169699217e-07, - "loss": 0.2462, - "num_tokens": 41849957.0, - "reward": 0.8108302969485521, - "reward_std": 0.4449946694076061, - "rewards/cosine_scaled_reward": -0.021668191999197006, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.0253, + "num_tokens": 56485063.0, + "reward": -0.358419805765152, + "reward_std": 0.32525915279984474, + "rewards/cosine_scaled_reward": -0.1792098954319954, "step": 441 }, { "clip_ratio": 0.0, - "completion_length": 1427.3333740234375, - "epoch": 0.5051428571428571, - "grad_norm": 0.424146831035614, - "kl": 0.02996826171875, + "completion_length": 2622.8541870117188, + "epoch": 0.25257142857142856, + "grad_norm": 0.331227570772171, + "kl": 0.011693954467773438, "learning_rate": 1.3638909733514452e-07, - "loss": 0.1381, - "num_tokens": 41927001.0, - "reward": 0.6767252758145332, - "reward_std": 0.43887148424983025, - "rewards/cosine_scaled_reward": -0.06788737326860428, - "rewards/format_reward": 0.8125, + "loss": 0.0307, + "num_tokens": 56617584.0, + "reward": 0.23914362490177155, + "reward_std": 0.5278170146048069, + "rewards/cosine_scaled_reward": 0.11957181245088577, "step": 442 }, { "clip_ratio": 0.0, - "completion_length": 1055.1042022705078, - "epoch": 0.5062857142857143, - "grad_norm": 0.3556446135044098, - "kl": 0.02960205078125, + "completion_length": 3383.729248046875, + "epoch": 0.25314285714285717, + "grad_norm": 0.1942642778158188, + "kl": 0.00176239013671875, "learning_rate": 1.351615817851748e-07, - "loss": 0.3331, - "num_tokens": 41985806.0, - "reward": 0.8476575687527657, - "reward_std": 0.3729449659585953, - "rewards/cosine_scaled_reward": -0.03450455144047737, - "rewards/format_reward": 0.9166666567325592, + "loss": 0.049, + "num_tokens": 56786303.0, + "reward": -0.6218864843249321, + "reward_std": 0.2640235126018524, + "rewards/cosine_scaled_reward": -0.31094324216246605, "step": 443 }, { "clip_ratio": 0.0, - "completion_length": 1387.104232788086, - "epoch": 0.5074285714285715, - "grad_norm": 0.35173964500427246, - "kl": 0.0280914306640625, + "completion_length": 3413.5833740234375, + "epoch": 0.2537142857142857, + "grad_norm": 0.1867051124572754, + "kl": 0.0013294219970703125, "learning_rate": 1.3395428487445914e-07, - "loss": 0.2315, - "num_tokens": 42061015.0, - "reward": 1.0118909031152725, - "reward_std": 0.5946892201900482, - "rewards/cosine_scaled_reward": 0.05802877992391586, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.0401, + "num_tokens": 56956251.0, + "reward": -0.6013847589492798, + "reward_std": 0.32602211087942123, + "rewards/cosine_scaled_reward": -0.3006923794746399, "step": 444 }, { "clip_ratio": 0.0, - "completion_length": 1481.416732788086, - "epoch": 0.5085714285714286, - "grad_norm": 0.4023115038871765, - "kl": 0.02587890625, + "completion_length": 2585.5833740234375, + "epoch": 0.2542857142857143, + "grad_norm": 0.3707902133464813, + "kl": 0.01132965087890625, "learning_rate": 1.3276726544494571e-07, - "loss": 0.1811, - "num_tokens": 42140733.0, - "reward": 0.6685292148031294, - "reward_std": 0.4582846313714981, - "rewards/cosine_scaled_reward": -0.040735377464443445, - "rewards/format_reward": 0.75, + "loss": 0.0547, + "num_tokens": 57086335.0, + "reward": 0.8475865125656128, + "reward_std": 0.9629704058170319, + "rewards/cosine_scaled_reward": 0.4237932413816452, "step": 445 }, { "clip_ratio": 0.0, - "completion_length": 1054.937515258789, - "epoch": 0.5097142857142857, - "grad_norm": 0.4524555504322052, - "kl": 0.0357666015625, + "completion_length": 1880.5209045410156, + "epoch": 0.25485714285714284, + "grad_norm": 0.5581598877906799, + "kl": 0.020392417907714844, "learning_rate": 1.316005813502869e-07, - "loss": 0.175, - "num_tokens": 42199530.0, - "reward": 1.0812687873840332, - "reward_std": 0.7833655476570129, - "rewards/cosine_scaled_reward": 0.08230104623362422, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.19, + "num_tokens": 57182540.0, + "reward": -0.18871257454156876, + "reward_std": 0.3908994784578681, + "rewards/cosine_scaled_reward": -0.09435627982020378, "step": 446 }, { "clip_ratio": 0.0, - "completion_length": 1313.1875305175781, - "epoch": 0.5108571428571429, - "grad_norm": 0.44180259108543396, - "kl": 0.030792236328125, + "completion_length": 2767.1041717529297, + "epoch": 0.25542857142857145, + "grad_norm": 0.4027818739414215, + "kl": 0.0086669921875, "learning_rate": 1.3045428945301953e-07, - "loss": 0.1406, - "num_tokens": 42270315.0, - "reward": 0.6780117899179459, - "reward_std": 0.3482256345450878, - "rewards/cosine_scaled_reward": -0.07766077481210232, - "rewards/format_reward": 0.8333333358168602, + "loss": 0.051, + "num_tokens": 57320965.0, + "reward": -0.5574091803282499, + "reward_std": 0.29374449513852596, + "rewards/cosine_scaled_reward": -0.27870458643883467, "step": 447 }, { "clip_ratio": 0.0, - "completion_length": 976.1250152587891, - "epoch": 0.512, - "grad_norm": 0.3419904410839081, - "kl": 0.02667236328125, + "completion_length": 2482.3125610351562, + "epoch": 0.256, + "grad_norm": 0.1986325979232788, + "kl": 0.0056400299072265625, "learning_rate": 1.2932844562179352e-07, - "loss": 0.2378, - "num_tokens": 42324861.0, - "reward": 1.2964210510253906, - "reward_std": 0.7425139099359512, - "rewards/cosine_scaled_reward": 0.15862719155848026, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0354, + "num_tokens": 57446116.0, + "reward": 0.4690280854701996, + "reward_std": 0.40538226813077927, + "rewards/cosine_scaled_reward": 0.2345140352845192, "step": 448 }, { "clip_ratio": 0.0, - "completion_length": 1269.7916870117188, - "epoch": 0.5131428571428571, - "grad_norm": 0.4110361635684967, - "kl": 0.02386474609375, + "completion_length": 2598.9375610351562, + "epoch": 0.25657142857142856, + "grad_norm": 0.2474825382232666, + "kl": 0.003582000732421875, "learning_rate": 1.2822310472864885e-07, - "loss": 0.3271, - "num_tokens": 42393947.0, - "reward": 0.8870633244514465, - "reward_std": 0.6967166736721992, - "rewards/cosine_scaled_reward": -0.004385008476674557, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.1014, + "num_tokens": 57578473.0, + "reward": 0.4089887887239456, + "reward_std": 0.8240047469735146, + "rewards/cosine_scaled_reward": 0.20449439622461796, "step": 449 }, { "clip_ratio": 0.0, - "completion_length": 702.6250305175781, - "epoch": 0.5142857142857142, - "grad_norm": 0.5267137289047241, - "kl": 0.03900146484375, + "completion_length": 1180.7708740234375, + "epoch": 0.2571428571428571, + "grad_norm": 0.5087565183639526, + "kl": 0.02840423583984375, "learning_rate": 1.2713832064634125e-07, - "loss": 0.1521, - "num_tokens": 42435545.0, - "reward": 1.109239935874939, - "reward_std": 0.573906421661377, - "rewards/cosine_scaled_reward": 0.05461995178484358, - "rewards/format_reward": 1.0, + "loss": 0.2658, + "num_tokens": 57640442.0, + "reward": 0.25451022386550903, + "reward_std": 0.42495069094002247, + "rewards/cosine_scaled_reward": 0.1272551193833351, "step": 450 }, { "clip_ratio": 0.0, - "completion_length": 1432.2083740234375, - "epoch": 0.5154285714285715, - "grad_norm": 0.40656542778015137, - "kl": 0.030517578125, + "completion_length": 2582.8541870117188, + "epoch": 0.25771428571428573, + "grad_norm": 0.47060224413871765, + "kl": 0.0055389404296875, "learning_rate": 1.260741462457165e-07, - "loss": 0.2169, - "num_tokens": 42512091.0, - "reward": 0.9379907790571451, - "reward_std": 0.6935139521956444, - "rewards/cosine_scaled_reward": 0.03149538184516132, - "rewards/format_reward": 0.875, + "loss": 0.0476, + "num_tokens": 57771091.0, + "reward": -0.6611975803971291, + "reward_std": 0.29068057239055634, + "rewards/cosine_scaled_reward": -0.3305987734347582, "step": 451 }, { "clip_ratio": 0.0, - "completion_length": 1116.3542022705078, - "epoch": 0.5165714285714286, - "grad_norm": 0.5056872367858887, - "kl": 0.027923583984375, + "completion_length": 1834.93754196167, + "epoch": 0.2582857142857143, + "grad_norm": 0.6933012008666992, + "kl": 0.04656410217285156, "learning_rate": 1.2503063339313356e-07, - "loss": 0.3907, - "num_tokens": 42573776.0, - "reward": 0.6782356053590775, - "reward_std": 0.5416813492774963, - "rewards/cosine_scaled_reward": -0.09838221129029989, - "rewards/format_reward": 0.875, + "loss": 0.1749, + "num_tokens": 57864196.0, + "reward": -0.06097408011555672, + "reward_std": 0.3460182901471853, + "rewards/cosine_scaled_reward": -0.03048703959211707, "step": 452 }, { "clip_ratio": 0.0, - "completion_length": 1324.8334045410156, - "epoch": 0.5177142857142857, - "grad_norm": 0.4419814646244049, - "kl": 0.031219482421875, + "completion_length": 2453.6875, + "epoch": 0.25885714285714284, + "grad_norm": 0.5309619307518005, + "kl": 0.019407272338867188, "learning_rate": 1.2400783294793668e-07, - "loss": 0.2348, - "num_tokens": 42645294.0, - "reward": 1.0193464905023575, - "reward_std": 0.6786343604326248, - "rewards/cosine_scaled_reward": 0.06175656849518418, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.1398, + "num_tokens": 57987841.0, + "reward": -0.14627531357109547, + "reward_std": 0.40048458334058523, + "rewards/cosine_scaled_reward": -0.07313766423612833, "step": 453 }, { "clip_ratio": 0.0, - "completion_length": 933.8541717529297, - "epoch": 0.5188571428571429, - "grad_norm": 0.44726788997650146, - "kl": 0.0369873046875, + "completion_length": 2865.8958740234375, + "epoch": 0.25942857142857145, + "grad_norm": 0.2787575423717499, + "kl": 0.0030364990234375, "learning_rate": 1.2300579475997657e-07, - "loss": 0.36, - "num_tokens": 42698207.0, - "reward": 1.0875123143196106, - "reward_std": 0.31689387187361717, - "rewards/cosine_scaled_reward": 0.05417281948029995, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0809, + "num_tokens": 58131152.0, + "reward": -0.03645841544494033, + "reward_std": 0.538176491856575, + "rewards/cosine_scaled_reward": -0.01822919282130897, "step": 454 }, { "clip_ratio": 0.0, - "completion_length": 1275.4791870117188, - "epoch": 0.52, - "grad_norm": 0.5583041310310364, - "kl": 0.02591705322265625, + "completion_length": 2737.3125610351562, + "epoch": 0.26, + "grad_norm": 0.2434437870979309, + "kl": 0.0023136138916015625, "learning_rate": 1.220245676671809e-07, - "loss": 0.306, - "num_tokens": 42767536.0, - "reward": 1.0069229509681463, - "reward_std": 0.4878829885274172, - "rewards/cosine_scaled_reward": 0.08679477497935295, - "rewards/format_reward": 0.8333333358168602, + "loss": -0.09, + "num_tokens": 58269443.0, + "reward": -0.0827246904373169, + "reward_std": 0.6058220788836479, + "rewards/cosine_scaled_reward": -0.041362347081303596, "step": 455 }, { "clip_ratio": 0.0, - "completion_length": 1401.6458740234375, - "epoch": 0.5211428571428571, - "grad_norm": 0.40996938943862915, - "kl": 0.02996826171875, + "completion_length": 3515.9166870117188, + "epoch": 0.26057142857142856, + "grad_norm": 0.22416526079177856, + "kl": 0.0019941329956054688, "learning_rate": 1.2106419949317388e-07, - "loss": 0.3287, - "num_tokens": 42842915.0, - "reward": 0.8078549057245255, - "reward_std": 0.7320556342601776, - "rewards/cosine_scaled_reward": 0.018510787514969707, - "rewards/format_reward": 0.7708333283662796, + "loss": 0.0015, + "num_tokens": 58444267.0, + "reward": -0.3426622897386551, + "reward_std": 0.2587408199906349, + "rewards/cosine_scaled_reward": -0.17133113741874695, "step": 456 }, { "clip_ratio": 0.0, - "completion_length": 806.083366394043, - "epoch": 0.5222857142857142, - "grad_norm": 0.48604169487953186, - "kl": 0.03582763671875, + "completion_length": 2765.5416717529297, + "epoch": 0.2611428571428571, + "grad_norm": 0.3764567971229553, + "kl": 0.018613815307617188, "learning_rate": 1.2012473704494537e-07, - "loss": 0.2165, - "num_tokens": 42889905.0, - "reward": 1.1789227575063705, - "reward_std": 0.38530726544559, - "rewards/cosine_scaled_reward": 0.09987803548574448, - "rewards/format_reward": 0.9791666716337204, + "loss": -0.0034, + "num_tokens": 58582509.0, + "reward": -0.07094722986221313, + "reward_std": 0.6748832985758781, + "rewards/cosine_scaled_reward": -0.035473618656396866, "step": 457 }, { "clip_ratio": 0.0, - "completion_length": 1071.4375, - "epoch": 0.5234285714285715, - "grad_norm": 0.4074256122112274, - "kl": 0.02691650390625, + "completion_length": 1633.041732788086, + "epoch": 0.26171428571428573, + "grad_norm": 0.49085304141044617, + "kl": 0.01363372802734375, "learning_rate": 1.1920622611056974e-07, - "loss": 0.2512, - "num_tokens": 42949794.0, - "reward": 1.0278047621250153, - "reward_std": 0.7519606053829193, - "rewards/cosine_scaled_reward": 0.04515235684812069, - "rewards/format_reward": 0.9375000149011612, + "loss": 0.4777, + "num_tokens": 58666343.0, + "reward": -0.2368110716342926, + "reward_std": 0.48651882261037827, + "rewards/cosine_scaled_reward": -0.118405532091856, "step": 458 }, { "clip_ratio": 0.0, - "completion_length": 1211.7500610351562, - "epoch": 0.5245714285714286, - "grad_norm": 0.4030369222164154, - "kl": 0.029937744140625, + "completion_length": 2421.395896911621, + "epoch": 0.2622857142857143, + "grad_norm": 0.40670979022979736, + "kl": 0.016485214233398438, "learning_rate": 1.1830871145697412e-07, - "loss": 0.3212, - "num_tokens": 43017036.0, - "reward": 0.8210784047842026, - "reward_std": 0.6628520265221596, - "rewards/cosine_scaled_reward": -0.037377479020506144, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.1308, + "num_tokens": 58789170.0, + "reward": -0.010643558576703072, + "reward_std": 0.8577997833490372, + "rewards/cosine_scaled_reward": -0.005321786738932133, "step": 459 }, { "clip_ratio": 0.0, - "completion_length": 1466.2916870117188, - "epoch": 0.5257142857142857, - "grad_norm": 0.4611579477787018, - "kl": 0.024932861328125, + "completion_length": 2716.1875, + "epoch": 0.26285714285714284, + "grad_norm": 0.40470758080482483, + "kl": 0.01605224609375, "learning_rate": 1.1743223682775649e-07, - "loss": 0.2586, - "num_tokens": 43095800.0, - "reward": 1.166172817349434, - "reward_std": 1.0088868141174316, - "rewards/cosine_scaled_reward": 0.13516972260549664, - "rewards/format_reward": 0.8958333283662796, + "loss": 0.0628, + "num_tokens": 58925859.0, + "reward": 0.18111078813672066, + "reward_std": 0.2746178447268903, + "rewards/cosine_scaled_reward": 0.09055539406836033, "step": 460 }, { "clip_ratio": 0.0, - "completion_length": 1250.2500305175781, - "epoch": 0.5268571428571428, - "grad_norm": 0.35799574851989746, - "kl": 0.02569580078125, + "completion_length": 2102.7916870117188, + "epoch": 0.2634285714285714, + "grad_norm": 0.6008480191230774, + "kl": 0.015697479248046875, "learning_rate": 1.1657684494105386e-07, - "loss": 0.2041, - "num_tokens": 43163744.0, - "reward": 0.8057873249053955, - "reward_std": 0.647061862051487, - "rewards/cosine_scaled_reward": -0.024189693154767156, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.2683, + "num_tokens": 59032085.0, + "reward": -0.23030911991372705, + "reward_std": 0.6799526810646057, + "rewards/cosine_scaled_reward": -0.11515455157496035, "step": 461 }, { "clip_ratio": 0.0, - "completion_length": 1080.1458740234375, - "epoch": 0.528, - "grad_norm": 0.4635910093784332, - "kl": 0.03936767578125, + "completion_length": 3317.604248046875, + "epoch": 0.264, + "grad_norm": 0.18618419766426086, + "kl": 0.0010318756103515625, "learning_rate": 1.1574257748745986e-07, - "loss": 0.3997, - "num_tokens": 43223661.0, - "reward": 0.6495188176631927, - "reward_std": 0.4584726169705391, - "rewards/cosine_scaled_reward": -0.14399060979485512, - "rewards/format_reward": 0.9375, + "loss": 0.0053, + "num_tokens": 59198026.0, + "reward": 0.1968272104859352, + "reward_std": 0.4609448295086622, + "rewards/cosine_scaled_reward": 0.09841361455619335, "step": 462 }, { "clip_ratio": 0.0, - "completion_length": 1144.8750534057617, - "epoch": 0.5291428571428571, - "grad_norm": 0.4683811068534851, - "kl": 0.03570556640625, + "completion_length": 3123.4168090820312, + "epoch": 0.26457142857142857, + "grad_norm": 0.19308748841285706, + "kl": 0.0020313262939453125, "learning_rate": 1.1492947512799328e-07, - "loss": 0.3718, - "num_tokens": 43286439.0, - "reward": 1.1641941219568253, - "reward_std": 0.6467169672250748, - "rewards/cosine_scaled_reward": 0.15501370280981064, - "rewards/format_reward": 0.8541666567325592, + "loss": 0.0844, + "num_tokens": 59354562.0, + "reward": -0.48374156653881073, + "reward_std": 0.6329436413943768, + "rewards/cosine_scaled_reward": -0.24187077954411507, "step": 463 }, { "clip_ratio": 0.0, - "completion_length": 997.0416870117188, - "epoch": 0.5302857142857142, - "grad_norm": 4.38219690322876, - "kl": 0.05279541015625, + "completion_length": 2666.5625, + "epoch": 0.2651428571428571, + "grad_norm": 0.29937809705734253, + "kl": 0.010850906372070312, "learning_rate": 1.1413757749211602e-07, - "loss": 0.2404, - "num_tokens": 43342619.0, - "reward": 0.8943421989679337, - "reward_std": 0.589733824133873, - "rewards/cosine_scaled_reward": -0.04241223679855466, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0164, + "num_tokens": 59489061.0, + "reward": 0.12404083646833897, + "reward_std": 0.621801532804966, + "rewards/cosine_scaled_reward": 0.06202041823416948, "step": 464 }, { "clip_ratio": 0.0, - "completion_length": 1403.979248046875, - "epoch": 0.5314285714285715, - "grad_norm": 0.488127201795578, - "kl": 0.029754638671875, + "completion_length": 3420.354248046875, + "epoch": 0.26571428571428574, + "grad_norm": 0.20473025739192963, + "kl": 0.0012063980102539062, "learning_rate": 1.1336692317580158e-07, - "loss": 0.3311, - "num_tokens": 43417882.0, - "reward": 0.8888783566653728, - "reward_std": 0.5945712774991989, - "rewards/cosine_scaled_reward": 0.03818914666771889, - "rewards/format_reward": 0.8125, + "loss": 0.0185, + "num_tokens": 59658794.0, + "reward": -0.1353940162807703, + "reward_std": 0.5671922750771046, + "rewards/cosine_scaled_reward": -0.06769700860604644, "step": 465 }, { "clip_ratio": 0.0, - "completion_length": 1697.6250610351562, - "epoch": 0.5325714285714286, - "grad_norm": 0.26276934146881104, - "kl": 0.01947784423828125, + "completion_length": 2224.750045776367, + "epoch": 0.2662857142857143, + "grad_norm": 0.28893810510635376, + "kl": 0.014387130737304688, "learning_rate": 1.1261754973965422e-07, - "loss": 0.2593, - "num_tokens": 43507456.0, - "reward": 0.6314441710710526, - "reward_std": 0.7540897130966187, - "rewards/cosine_scaled_reward": -0.09052791446447372, - "rewards/format_reward": 0.8124999850988388, + "loss": 0.1965, + "num_tokens": 59772038.0, + "reward": -0.47738330624997616, + "reward_std": 0.32058568112552166, + "rewards/cosine_scaled_reward": -0.23869163822382689, "step": 466 }, { "clip_ratio": 0.0, - "completion_length": 1293.041732788086, - "epoch": 0.5337142857142857, - "grad_norm": 8.432247161865234, - "kl": 0.218963623046875, + "completion_length": 3208.9791870117188, + "epoch": 0.26685714285714285, + "grad_norm": 0.24921633303165436, + "kl": 0.0014972686767578125, "learning_rate": 1.1188949370707787e-07, - "loss": 0.143, - "num_tokens": 43577394.0, - "reward": 0.8170301653444767, - "reward_std": 0.5284570157527924, - "rewards/cosine_scaled_reward": -0.018568256869912148, - "rewards/format_reward": 0.8541666641831398, + "loss": 0.0734, + "num_tokens": 59932489.0, + "reward": -0.10125970467925072, + "reward_std": 0.4448701348155737, + "rewards/cosine_scaled_reward": -0.050629859790205956, "step": 467 }, { "clip_ratio": 0.0, - "completion_length": 1465.8750457763672, - "epoch": 0.5348571428571428, - "grad_norm": 0.49368521571159363, - "kl": 0.0333099365234375, + "completion_length": 2674.4375228881836, + "epoch": 0.2674285714285714, + "grad_norm": 0.4345049262046814, + "kl": 0.012060165405273438, "learning_rate": 1.1118279056249653e-07, - "loss": 0.1917, - "num_tokens": 43655832.0, - "reward": 1.2481490820646286, - "reward_std": 0.7606203258037567, - "rewards/cosine_scaled_reward": 0.18657452706247568, - "rewards/format_reward": 0.8750000149011612, + "loss": -0.0264, + "num_tokens": 60067174.0, + "reward": 0.15898653864860535, + "reward_std": 0.5897807292640209, + "rewards/cosine_scaled_reward": 0.07949326187372208, "step": 468 }, { "clip_ratio": 0.0, - "completion_length": 1239.0000610351562, - "epoch": 0.536, - "grad_norm": 0.4890661835670471, - "kl": 0.0341796875, + "completion_length": 2993.8958740234375, + "epoch": 0.268, + "grad_norm": 0.19168232381343842, + "kl": 0.0040569305419921875, "learning_rate": 1.1049747474962444e-07, - "loss": 0.2292, - "num_tokens": 43723566.0, - "reward": 0.7020466178655624, - "reward_std": 0.5384815186262131, - "rewards/cosine_scaled_reward": -0.08647668547928333, - "rewards/format_reward": 0.875, + "loss": 0.1375, + "num_tokens": 60216449.0, + "reward": -0.49420715123414993, + "reward_std": 0.4377583935856819, + "rewards/cosine_scaled_reward": -0.24710355699062347, "step": 469 }, { "clip_ratio": 0.0, - "completion_length": 1251.4791870117188, - "epoch": 0.5371428571428571, - "grad_norm": 0.4875927269458771, - "kl": 0.029296875, + "completion_length": 1846.8958587646484, + "epoch": 0.26857142857142857, + "grad_norm": 0.6746844053268433, + "kl": 0.0324859619140625, "learning_rate": 1.0983357966978745e-07, - "loss": 0.1285, - "num_tokens": 43791641.0, - "reward": 1.112037941813469, - "reward_std": 0.33521461114287376, - "rewards/cosine_scaled_reward": 0.1185189438983798, - "rewards/format_reward": 0.875, + "loss": 0.1055, + "num_tokens": 60309816.0, + "reward": 0.983911968767643, + "reward_std": 0.7739077722653747, + "rewards/cosine_scaled_reward": 0.49195596762001514, "step": 470 }, { "clip_ratio": 0.0, - "completion_length": 1172.4792022705078, - "epoch": 0.5382857142857143, - "grad_norm": 0.42823565006256104, - "kl": 0.030059814453125, + "completion_length": 2508.1666870117188, + "epoch": 0.26914285714285713, + "grad_norm": 0.2162114381790161, + "kl": 0.004150390625, "learning_rate": 1.0919113768029517e-07, - "loss": 0.2834, - "num_tokens": 43855648.0, - "reward": 0.8745324984192848, - "reward_std": 0.6210919320583344, - "rewards/cosine_scaled_reward": -0.00023377127945423126, - "rewards/format_reward": 0.875, + "loss": -0.049, + "num_tokens": 60436376.0, + "reward": -0.14057038724422455, + "reward_std": 0.5592290014028549, + "rewards/cosine_scaled_reward": -0.07028519362211227, "step": 471 }, { "clip_ratio": 0.0, - "completion_length": 705.2500152587891, - "epoch": 0.5394285714285715, - "grad_norm": 0.46066632866859436, - "kl": 0.03387451171875, + "completion_length": 3250.7916870117188, + "epoch": 0.26971428571428574, + "grad_norm": 0.24621860682964325, + "kl": 0.0020599365234375, "learning_rate": 1.0857018009286381e-07, - "loss": 0.1864, - "num_tokens": 43897018.0, - "reward": 1.1916231364011765, - "reward_std": 0.5989440307021141, - "rewards/cosine_scaled_reward": 0.10622821375727654, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0899, + "num_tokens": 60598222.0, + "reward": -0.35493073239922523, + "reward_std": 0.4200459411367774, + "rewards/cosine_scaled_reward": -0.17746536619961262, "step": 472 }, { "clip_ratio": 0.0, - "completion_length": 974.3125152587891, - "epoch": 0.5405714285714286, - "grad_norm": 0.5403279662132263, - "kl": 0.035491943359375, + "completion_length": 2814.0208587646484, + "epoch": 0.2702857142857143, + "grad_norm": 0.3132895529270172, + "kl": 0.00676727294921875, "learning_rate": 1.0797073717209013e-07, - "loss": 0.2905, - "num_tokens": 43952113.0, - "reward": 0.7929527014493942, - "reward_std": 0.28072798252105713, - "rewards/cosine_scaled_reward": -0.09310700930655003, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.1305, + "num_tokens": 60740639.0, + "reward": -0.14024419710040092, + "reward_std": 0.6155532822012901, + "rewards/cosine_scaled_reward": -0.07012210600078106, "step": 473 }, { "clip_ratio": 0.0, - "completion_length": 840.8542022705078, - "epoch": 0.5417142857142857, - "grad_norm": 0.4049588143825531, - "kl": 0.030242919921875, + "completion_length": 1528.2917022705078, + "epoch": 0.27085714285714285, + "grad_norm": 0.2247830480337143, + "kl": 0.00786590576171875, "learning_rate": 1.0739283813397639e-07, - "loss": 0.2409, - "num_tokens": 44000700.0, - "reward": 0.9743078052997589, - "reward_std": 0.5338472779840231, - "rewards/cosine_scaled_reward": -0.0024294480681419373, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0938, + "num_tokens": 60819421.0, + "reward": 0.2218828909099102, + "reward_std": 0.6827406734228134, + "rewards/cosine_scaled_reward": 0.11094144824892282, "step": 474 }, { "clip_ratio": 0.0, - "completion_length": 1120.6667022705078, - "epoch": 0.5428571428571428, - "grad_norm": 0.5333501696586609, - "kl": 0.03533935546875, + "completion_length": 2696.4584197998047, + "epoch": 0.2714285714285714, + "grad_norm": 0.2896968722343445, + "kl": 0.00408172607421875, "learning_rate": 1.068365111445064e-07, - "loss": 0.2386, - "num_tokens": 44062388.0, - "reward": 0.8754686489701271, - "reward_std": 0.7480637580156326, - "rewards/cosine_scaled_reward": 0.00023431982845067978, - "rewards/format_reward": 0.875, + "loss": 0.1801, + "num_tokens": 60955163.0, + "reward": 0.13359621167182922, + "reward_std": 0.7966333255171776, + "rewards/cosine_scaled_reward": 0.06679810583591461, "step": 475 }, { "clip_ratio": 0.0, - "completion_length": 1014.0417175292969, - "epoch": 0.544, - "grad_norm": 0.42668408155441284, - "kl": 0.032989501953125, + "completion_length": 3057.5, + "epoch": 0.272, + "grad_norm": 0.21167173981666565, + "kl": 0.0032901763916015625, "learning_rate": 1.063017833182728e-07, - "loss": 0.2809, - "num_tokens": 44120026.0, - "reward": 1.238448478281498, - "reward_std": 0.5114860832691193, - "rewards/cosine_scaled_reward": 0.14005756378173828, - "rewards/format_reward": 0.9583333283662796, + "loss": 0.0928, + "num_tokens": 61108295.0, + "reward": -0.6038347482681274, + "reward_std": 0.3766014724969864, + "rewards/cosine_scaled_reward": -0.30191735550761223, "step": 476 }, { "clip_ratio": 0.0, - "completion_length": 760.7708587646484, - "epoch": 0.5451428571428572, - "grad_norm": 0.4274328052997589, - "kl": 0.03369140625, + "completion_length": 2832.604217529297, + "epoch": 0.2725714285714286, + "grad_norm": 0.28865718841552734, + "kl": 0.007213592529296875, "learning_rate": 1.0578868071715544e-07, - "loss": 0.1905, - "num_tokens": 44163635.0, - "reward": 1.3580638319253922, - "reward_std": 0.6236323565244675, - "rewards/cosine_scaled_reward": 0.18944857362657785, - "rewards/format_reward": 0.9791666716337204, + "loss": -0.0879, + "num_tokens": 61251244.0, + "reward": -0.4621117692440748, + "reward_std": 0.3360144942998886, + "rewards/cosine_scaled_reward": -0.23105588043108582, "step": 477 }, { "clip_ratio": 0.0, - "completion_length": 1135.5417022705078, - "epoch": 0.5462857142857143, - "grad_norm": 0.3638131022453308, - "kl": 0.028350830078125, + "completion_length": 3392.6458740234375, + "epoch": 0.27314285714285713, + "grad_norm": 0.18357343971729279, + "kl": 0.001720428466796875, "learning_rate": 1.0529722834905125e-07, - "loss": 0.2326, - "num_tokens": 44226181.0, - "reward": 0.9669682309031487, - "reward_std": 0.5843757539987564, - "rewards/cosine_scaled_reward": 0.014734117314219475, - "rewards/format_reward": 0.9375, + "loss": -0.032, + "num_tokens": 61420799.0, + "reward": -0.3251003101468086, + "reward_std": 0.37241343408823013, + "rewards/cosine_scaled_reward": -0.16255014389753342, "step": 478 }, { "clip_ratio": 0.0, - "completion_length": 1052.4792175292969, - "epoch": 0.5474285714285714, - "grad_norm": 0.4436745047569275, - "kl": 0.03021240234375, + "completion_length": 2110.25004196167, + "epoch": 0.2737142857142857, + "grad_norm": 0.452695369720459, + "kl": 0.0191802978515625, "learning_rate": 1.0482745016665526e-07, - "loss": 0.2299, - "num_tokens": 44284902.0, - "reward": 1.0697922855615616, - "reward_std": 0.6758029907941818, - "rewards/cosine_scaled_reward": 0.08697945438325405, - "rewards/format_reward": 0.8958333432674408, + "loss": 0.1311, + "num_tokens": 61527719.0, + "reward": 0.4394497722387314, + "reward_std": 0.30791839864104986, + "rewards/cosine_scaled_reward": 0.2197248861193657, "step": 479 }, { "clip_ratio": 0.0, - "completion_length": 777.0833587646484, - "epoch": 0.5485714285714286, - "grad_norm": 0.3994079530239105, - "kl": 0.03125, + "completion_length": 3178.5208740234375, + "epoch": 0.2742857142857143, + "grad_norm": 0.2389601320028305, + "kl": 0.002948760986328125, "learning_rate": 1.0437936906629334e-07, - "loss": 0.128, - "num_tokens": 44330476.0, - "reward": 1.0248275697231293, - "reward_std": 0.6758236438035965, - "rewards/cosine_scaled_reward": 0.012413740856572986, - "rewards/format_reward": 1.0, + "loss": 0.0376, + "num_tokens": 61687032.0, + "reward": 0.2565951645374298, + "reward_std": 0.6847127676010132, + "rewards/cosine_scaled_reward": 0.1282975897192955, "step": 480 }, { "clip_ratio": 0.0, - "completion_length": 1162.0000610351562, - "epoch": 0.5497142857142857, - "grad_norm": 0.4814600646495819, - "kl": 0.03363037109375, + "completion_length": 2711.5625228881836, + "epoch": 0.27485714285714286, + "grad_norm": 0.6793060302734375, + "kl": 0.02097320556640625, "learning_rate": 1.0395300688680625e-07, - "loss": 0.2608, - "num_tokens": 44393998.0, - "reward": 0.7157177105545998, - "reward_std": 0.44291423074901104, - "rewards/cosine_scaled_reward": -0.07964113913476467, - "rewards/format_reward": 0.875, + "loss": -0.0007, + "num_tokens": 61823763.0, + "reward": -0.17994186095893383, + "reward_std": 0.5131379179656506, + "rewards/cosine_scaled_reward": -0.08997092954814434, "step": 481 }, { "clip_ratio": 0.0, - "completion_length": 600.6458587646484, - "epoch": 0.5508571428571428, - "grad_norm": 0.5359194874763489, - "kl": 0.041259765625, + "completion_length": 2249.0833435058594, + "epoch": 0.2754285714285714, + "grad_norm": 0.33344191312789917, + "kl": 0.0082855224609375, "learning_rate": 1.0354838440848501e-07, - "loss": 0.1478, - "num_tokens": 44430977.0, - "reward": 1.1498132944107056, - "reward_std": 0.3962139468640089, - "rewards/cosine_scaled_reward": 0.07490663533098996, - "rewards/format_reward": 1.0, + "loss": 0.1291, + "num_tokens": 61938103.0, + "reward": 0.008612923324108124, + "reward_std": 0.5992633532732725, + "rewards/cosine_scaled_reward": 0.004306461662054062, "step": 482 }, { "clip_ratio": 0.0, - "completion_length": 1325.2291717529297, - "epoch": 0.552, - "grad_norm": 0.39830389618873596, - "kl": 0.02691650390625, + "completion_length": 2914.4791870117188, + "epoch": 0.276, + "grad_norm": 0.26747846603393555, + "kl": 0.0048503875732421875, "learning_rate": 1.0316552135205837e-07, - "loss": 0.1685, - "num_tokens": 44502010.0, - "reward": 0.9287998229265213, - "reward_std": 0.8306932374835014, - "rewards/cosine_scaled_reward": 0.0477332123555243, - "rewards/format_reward": 0.8333333432674408, + "loss": 0.061, + "num_tokens": 62083878.0, + "reward": -0.18974144756793976, + "reward_std": 0.6735084727406502, + "rewards/cosine_scaled_reward": -0.09487072005867958, "step": 483 }, { "clip_ratio": 0.0, - "completion_length": 902.9792175292969, - "epoch": 0.5531428571428572, - "grad_norm": 0.47691088914871216, - "kl": 0.03839111328125, + "completion_length": 1817.7292289733887, + "epoch": 0.2765714285714286, + "grad_norm": 0.5921136736869812, + "kl": 0.02447509765625, "learning_rate": 1.0280443637773163e-07, - "loss": 0.3136, - "num_tokens": 44553171.0, - "reward": 1.4516296237707138, - "reward_std": 0.355313777923584, - "rewards/cosine_scaled_reward": 0.26748147048056126, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.0267, + "num_tokens": 62176649.0, + "reward": -0.4410746842622757, + "reward_std": 0.301199066452682, + "rewards/cosine_scaled_reward": -0.22053734213113785, "step": 484 }, { "clip_ratio": 0.0, - "completion_length": 1307.2500610351562, - "epoch": 0.5542857142857143, - "grad_norm": 0.4514337182044983, - "kl": 0.0279541015625, + "completion_length": 1706.4791870117188, + "epoch": 0.27714285714285714, + "grad_norm": 0.36696869134902954, + "kl": 0.01856231689453125, "learning_rate": 1.0246514708427701e-07, - "loss": 0.3272, - "num_tokens": 44623797.0, - "reward": 0.9350142329931259, - "reward_std": 0.3977475240826607, - "rewards/cosine_scaled_reward": 0.030007120221853256, - "rewards/format_reward": 0.875, + "loss": 0.2368, + "num_tokens": 62263576.0, + "reward": 0.08185825496912003, + "reward_std": 0.23910308256745338, + "rewards/cosine_scaled_reward": 0.04092914238572121, "step": 485 }, { "clip_ratio": 0.0, - "completion_length": 1277.8333892822266, - "epoch": 0.5554285714285714, - "grad_norm": 0.3418792486190796, - "kl": 0.027435302734375, + "completion_length": 1779.6250534057617, + "epoch": 0.2777142857142857, + "grad_norm": 0.44206833839416504, + "kl": 0.017059326171875, "learning_rate": 1.0214767000817596e-07, - "loss": 0.347, - "num_tokens": 44693005.0, - "reward": 1.2393681406974792, - "reward_std": 0.827592596411705, - "rewards/cosine_scaled_reward": 0.19260072708129883, - "rewards/format_reward": 0.8541666716337204, + "loss": 0.2164, + "num_tokens": 62354842.0, + "reward": -0.1533157378435135, + "reward_std": 0.4944304316304624, + "rewards/cosine_scaled_reward": -0.07665786519646645, "step": 486 }, { "clip_ratio": 0.0, - "completion_length": 1803.3542175292969, - "epoch": 0.5565714285714286, - "grad_norm": 0.390234112739563, - "kl": 0.0197601318359375, + "completion_length": 1240.7708358764648, + "epoch": 0.2782857142857143, + "grad_norm": 0.6005072593688965, + "kl": 0.02964019775390625, "learning_rate": 1.0185202062281336e-07, - "loss": 0.2833, - "num_tokens": 44787534.0, - "reward": 0.41170351952314377, - "reward_std": 0.6201038360595703, - "rewards/cosine_scaled_reward": -0.1587315769866109, - "rewards/format_reward": 0.7291666716337204, + "loss": 0.2567, + "num_tokens": 62420027.0, + "reward": 0.3417629040777683, + "reward_std": 0.5913028866052628, + "rewards/cosine_scaled_reward": 0.17088143806904554, "step": 487 }, { "clip_ratio": 0.0, - "completion_length": 1535.7709197998047, - "epoch": 0.5577142857142857, - "grad_norm": 0.3768185079097748, - "kl": 0.025360107421875, + "completion_length": 3259.6041870117188, + "epoch": 0.27885714285714286, + "grad_norm": 0.2043127417564392, + "kl": 0.0017719268798828125, "learning_rate": 1.0157821333772304e-07, - "loss": 0.2207, - "num_tokens": 44868931.0, - "reward": 0.5452676527202129, - "reward_std": 0.5340722799301147, - "rewards/cosine_scaled_reward": -0.15444950759410858, - "rewards/format_reward": 0.8541666567325592, + "loss": 0.0477, + "num_tokens": 62583244.0, + "reward": 0.000857822597026825, + "reward_std": 0.6294433549046516, + "rewards/cosine_scaled_reward": 0.0004289112985134125, "step": 488 }, { "clip_ratio": 0.0, - "completion_length": 628.0208358764648, - "epoch": 0.5588571428571428, - "grad_norm": 0.6455451250076294, - "kl": 0.05511474609375, + "completion_length": 2507.875045776367, + "epoch": 0.2794285714285714, + "grad_norm": 0.41613826155662537, + "kl": 0.0082244873046875, "learning_rate": 1.013262614978859e-07, - "loss": 0.1924, - "num_tokens": 44906402.0, - "reward": 1.6253060102462769, - "reward_std": 0.5070550180971622, - "rewards/cosine_scaled_reward": 0.3230696848477237, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.171, + "num_tokens": 62708254.0, + "reward": -0.3011748939752579, + "reward_std": 0.6403844729065895, + "rewards/cosine_scaled_reward": -0.15058743953704834, "step": 489 }, { "clip_ratio": 0.0, - "completion_length": 1089.416732788086, - "epoch": 0.56, - "grad_norm": 0.5013791918754578, - "kl": 0.03314208984375, + "completion_length": 2246.9166870117188, + "epoch": 0.28, + "grad_norm": 0.3775407671928406, + "kl": 0.00557708740234375, "learning_rate": 1.0109617738307911e-07, - "loss": 0.1106, - "num_tokens": 44966572.0, - "reward": 1.0448856204748154, - "reward_std": 0.5141267701983452, - "rewards/cosine_scaled_reward": 0.04327613674104214, - "rewards/format_reward": 0.9583333283662796, + "loss": 0.2481, + "num_tokens": 62821314.0, + "reward": 0.13066441006958485, + "reward_std": 0.39571982994675636, + "rewards/cosine_scaled_reward": 0.06533220689743757, "step": 490 }, { "clip_ratio": 0.0, - "completion_length": 1648.9583740234375, - "epoch": 0.5611428571428572, - "grad_norm": 0.4632658064365387, - "kl": 0.0217742919921875, + "completion_length": 2470.5417098999023, + "epoch": 0.2805714285714286, + "grad_norm": 0.34662094712257385, + "kl": 0.010288238525390625, "learning_rate": 1.0088797220727779e-07, - "loss": 0.4465, - "num_tokens": 45053672.0, - "reward": 0.9366854764521122, - "reward_std": 0.3563379105180502, - "rewards/cosine_scaled_reward": 0.09334271214902401, - "rewards/format_reward": 0.75, + "loss": 0.1097, + "num_tokens": 62945936.0, + "reward": -0.09428609162569046, + "reward_std": 0.7784368544816971, + "rewards/cosine_scaled_reward": -0.04714304953813553, "step": 491 }, { "clip_ratio": 0.0, - "completion_length": 934.3958587646484, - "epoch": 0.5622857142857143, - "grad_norm": 0.5427266955375671, - "kl": 0.0307159423828125, + "completion_length": 3139.291748046875, + "epoch": 0.28114285714285714, + "grad_norm": 0.23511607944965363, + "kl": 0.0025539398193359375, "learning_rate": 1.0070165611810855e-07, - "loss": 0.4325, - "num_tokens": 45106761.0, - "reward": 0.9387294054031372, - "reward_std": 0.27895698696374893, - "rewards/cosine_scaled_reward": -0.009801974520087242, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.0161, + "num_tokens": 63102874.0, + "reward": -0.3465424180030823, + "reward_std": 0.5582562312483788, + "rewards/cosine_scaled_reward": -0.17327120155096054, "step": 492 }, { "clip_ratio": 0.0, - "completion_length": 869.5208740234375, - "epoch": 0.5634285714285714, - "grad_norm": 0.4428861141204834, - "kl": 0.035308837890625, + "completion_length": 2491.479232788086, + "epoch": 0.2817142857142857, + "grad_norm": 0.3599470555782318, + "kl": 0.013715744018554688, "learning_rate": 1.005372381963547e-07, - "loss": 0.0663, - "num_tokens": 45156058.0, - "reward": 0.8442247211933136, - "reward_std": 0.5361873507499695, - "rewards/cosine_scaled_reward": -0.06747098336927593, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0382, + "num_tokens": 63228357.0, + "reward": 0.5691114738583565, + "reward_std": 1.0396928787231445, + "rewards/cosine_scaled_reward": 0.2845557164400816, "step": 493 }, { "clip_ratio": 0.0, - "completion_length": 1189.5416870117188, - "epoch": 0.5645714285714286, - "grad_norm": 0.5454748868942261, - "kl": 0.03359222412109375, + "completion_length": 2203.6667098999023, + "epoch": 0.2822857142857143, + "grad_norm": 0.40486791729927063, + "kl": 0.01262664794921875, "learning_rate": 1.0039472645551372e-07, - "loss": 0.1418, - "num_tokens": 45220542.0, - "reward": 0.7198501382954419, - "reward_std": 0.4353892542421818, - "rewards/cosine_scaled_reward": -0.05674161948263645, - "rewards/format_reward": 0.8333333358168602, + "loss": -0.0022, + "num_tokens": 63339497.0, + "reward": 0.44267672300338745, + "reward_std": 0.6116999462246895, + "rewards/cosine_scaled_reward": 0.22133836150169373, "step": 494 }, { "clip_ratio": 0.0, - "completion_length": 911.8541870117188, - "epoch": 0.5657142857142857, - "grad_norm": 0.5451707243919373, - "kl": 0.037506103515625, + "completion_length": 3396.854248046875, + "epoch": 0.28285714285714286, + "grad_norm": 0.18696698546409607, + "kl": 0.001537322998046875, "learning_rate": 1.002741278414069e-07, - "loss": 0.4167, - "num_tokens": 45272705.0, - "reward": 1.2369663715362549, - "reward_std": 0.6268434636294842, - "rewards/cosine_scaled_reward": 0.14973314851522446, - "rewards/format_reward": 0.9375, + "loss": -0.024, + "num_tokens": 63508690.0, + "reward": -0.15206171572208405, + "reward_std": 0.6857472285628319, + "rewards/cosine_scaled_reward": -0.07603085786104202, "step": 495 }, { "clip_ratio": 0.0, - "completion_length": 874.5417022705078, - "epoch": 0.5668571428571428, - "grad_norm": 0.4017329216003418, - "kl": 0.03363037109375, + "completion_length": 1400.7708587646484, + "epoch": 0.2834285714285714, + "grad_norm": 0.508706271648407, + "kl": 0.018621444702148438, "learning_rate": 1.0017544823184055e-07, - "loss": 0.3653, - "num_tokens": 45322435.0, - "reward": 1.1579137444496155, - "reward_std": 0.6667543575167656, - "rewards/cosine_scaled_reward": 0.09979016706347466, - "rewards/format_reward": 0.9583333432674408, + "loss": 0.2565, + "num_tokens": 63581303.0, + "reward": -0.016182963736355305, + "reward_std": 0.6838839948177338, + "rewards/cosine_scaled_reward": -0.008091474417597055, "step": 496 }, { "clip_ratio": 0.0, - "completion_length": 957.6042022705078, - "epoch": 0.568, - "grad_norm": 0.481536865234375, - "kl": 0.0282135009765625, + "completion_length": 3447.9583740234375, + "epoch": 0.284, + "grad_norm": 0.20476852357387543, + "kl": 0.0011720657348632812, "learning_rate": 1.0009869243631952e-07, - "loss": 0.2427, - "num_tokens": 45376566.0, - "reward": 1.3649472296237946, - "reward_std": 0.6812234669923782, - "rewards/cosine_scaled_reward": 0.1928902603685856, - "rewards/format_reward": 0.9791666716337204, + "loss": 0.0013, + "num_tokens": 63753261.0, + "reward": -0.3728942945599556, + "reward_std": 0.4060557000339031, + "rewards/cosine_scaled_reward": -0.18644713005051017, "step": 497 }, { "clip_ratio": 0.0, - "completion_length": 1804.0000610351562, - "epoch": 0.5691428571428572, - "grad_norm": 0.40501669049263, - "kl": 0.0171966552734375, + "completion_length": 2509.0833587646484, + "epoch": 0.2845714285714286, + "grad_norm": 0.3915875554084778, + "kl": 0.0099945068359375, "learning_rate": 1.000438641958131e-07, - "loss": 0.4378, - "num_tokens": 45471132.0, - "reward": 0.38411422073841095, - "reward_std": 0.7113517224788666, - "rewards/cosine_scaled_reward": -0.15169288171455264, - "rewards/format_reward": 0.6875, + "loss": 0.1323, + "num_tokens": 63881005.0, + "reward": -0.10937303304672241, + "reward_std": 0.6931461840867996, + "rewards/cosine_scaled_reward": -0.054686516523361206, "step": 498 }, { "clip_ratio": 0.0, - "completion_length": 1580.2500305175781, - "epoch": 0.5702857142857143, - "grad_norm": 0.6002418398857117, - "kl": 0.034637451171875, + "completion_length": 2609.8959197998047, + "epoch": 0.28514285714285714, + "grad_norm": 0.31072404980659485, + "kl": 0.005950927734375, "learning_rate": 1.0001096618257236e-07, - "loss": 0.2261, - "num_tokens": 45554832.0, - "reward": 0.4394306093454361, - "reward_std": 0.6632324308156967, - "rewards/cosine_scaled_reward": -0.16570135951042175, - "rewards/format_reward": 0.7708333283662796, + "loss": 0.0247, + "num_tokens": 64012160.0, + "reward": 0.3292629097122699, + "reward_std": 0.9715245068073273, + "rewards/cosine_scaled_reward": 0.16463144560111687, "step": 499 }, { "clip_ratio": 0.0, - "completion_length": 1098.8542175292969, - "epoch": 0.5714285714285714, - "grad_norm": 0.4748859703540802, - "kl": 0.027740478515625, + "completion_length": 1381.4583435058594, + "epoch": 0.2857142857142857, + "grad_norm": 0.41521623730659485, + "kl": 0.023681640625, "learning_rate": 1e-07, - "loss": 0.3975, - "num_tokens": 45615545.0, - "reward": 0.7702142149209976, - "reward_std": 0.559198834002018, - "rewards/cosine_scaled_reward": -0.07322624698281288, - "rewards/format_reward": 0.9166666716337204, + "loss": 0.032, + "num_tokens": 64084134.0, + "reward": -0.11208531260490417, + "reward_std": 0.5163028538227081, + "rewards/cosine_scaled_reward": -0.05604265257716179, "step": 500 }, { - "epoch": 0.5714285714285714, + "epoch": 0.2857142857142857, "step": 500, "total_flos": 0.0, - "train_loss": 0.16526810049655613, - "train_runtime": 49656.4521, - "train_samples_per_second": 0.483, + "train_loss": 0.04327104251924902, + "train_runtime": 47844.8241, + "train_samples_per_second": 0.502, "train_steps_per_second": 0.01 } ],