{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.19727857410907745, "kl": 0.0, "learning_rate": 2e-08, "loss": -0.0, "reward": 0.1723687592893839, "reward_std": 0.7976016625761986, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.18166053295135498, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0, "reward": -0.018269629566930234, "reward_std": 0.44402940198779106, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "completion_length": 3291.9583587646484, "epoch": 0.0034285714285714284, "grad_norm": 0.20012830197811127, "kl": 4.538148641586304e-05, "learning_rate": 6e-08, "loss": 0.0, "reward": -0.4293696880340576, "reward_std": 0.42283543944358826, "rewards/cosine_scaled_reward": -0.20520474947988987, "rewards/format_reward": 0.1250000037252903, "step": 3 }, { "completion_length": 2115.8542098999023, "epoch": 0.004571428571428572, "grad_norm": 0.25242456793785095, "kl": 3.884732723236084e-05, "learning_rate": 8e-08, "loss": 0.0, "reward": 0.2737832348793745, "reward_std": 0.9233334362506866, "rewards/cosine_scaled_reward": -0.041339562041684985, "rewards/format_reward": 0.6875000055879354, "step": 4 }, { "completion_length": 3488.5416870117188, "epoch": 0.005714285714285714, "grad_norm": 0.2051168829202652, "kl": 4.386343061923981e-05, "learning_rate": 1e-07, "loss": 0.0, "reward": -0.37347570061683655, "reward_std": 0.6771278530359268, "rewards/cosine_scaled_reward": -0.20584097504615784, "rewards/format_reward": 0.1666666716337204, "step": 5 }, { "completion_length": 3050.5417404174805, "epoch": 0.006857142857142857, "grad_norm": 0.2063540816307068, "kl": 4.404783248901367e-05, "learning_rate": 1.2e-07, "loss": 0.0, "reward": -0.21201085951179266, "reward_std": 0.8398218862712383, "rewards/cosine_scaled_reward": -0.17123706359416246, "rewards/format_reward": 0.29166666977107525, "step": 6 }, { "completion_length": 3078.5000610351562, "epoch": 0.008, "grad_norm": 0.1487358957529068, "kl": 2.9146671295166016e-05, "learning_rate": 1.4e-07, "loss": 0.0, "reward": -0.06183330807834864, "reward_std": 0.7480270601809025, "rewards/cosine_scaled_reward": -0.12833543797023594, "rewards/format_reward": 0.4375000149011612, "step": 7 }, { "completion_length": 2689.9375381469727, "epoch": 0.009142857142857144, "grad_norm": 0.18276464939117432, "kl": 1.888629049062729e-05, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 0.2490266114473343, "reward_std": 0.8049478307366371, "rewards/cosine_scaled_reward": 0.06792760454118252, "rewards/format_reward": 0.4583333395421505, "step": 8 }, { "completion_length": 3336.5000610351562, "epoch": 0.010285714285714285, "grad_norm": 0.2246587723493576, "kl": 4.10228967666626e-05, "learning_rate": 1.8e-07, "loss": 0.0, "reward": -0.2021113825030625, "reward_std": 0.6383066512644291, "rewards/cosine_scaled_reward": -0.1338323038071394, "rewards/format_reward": 0.27083333395421505, "step": 9 }, { "completion_length": 2547.7916831970215, "epoch": 0.011428571428571429, "grad_norm": 0.2385585457086563, "kl": 3.205146640539169e-05, "learning_rate": 2e-07, "loss": 0.0, "reward": -0.010044756345450878, "reward_std": 0.6000867374241352, "rewards/cosine_scaled_reward": -0.08963373815640807, "rewards/format_reward": 0.43750000186264515, "step": 10 }, { "completion_length": 3325.1041870117188, "epoch": 0.012571428571428572, "grad_norm": 0.22885829210281372, "kl": 3.9190053939819336e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": -0.3854191079735756, "reward_std": 0.6930013746023178, "rewards/cosine_scaled_reward": -0.21312777569983155, "rewards/format_reward": 0.1666666716337204, "step": 11 }, { "completion_length": 2436.750045776367, "epoch": 0.013714285714285714, "grad_norm": 0.2692394554615021, "kl": 4.0858983993530273e-05, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 0.05584145151078701, "reward_std": 0.7531977295875549, "rewards/cosine_scaled_reward": -0.14725533686578274, "rewards/format_reward": 0.6250000018626451, "step": 12 }, { "completion_length": 2905.625030517578, "epoch": 0.014857142857142857, "grad_norm": 0.18586185574531555, "kl": 3.486126661300659e-05, "learning_rate": 2.6e-07, "loss": 0.0, "reward": -0.06436450174078345, "reward_std": 0.5915969423949718, "rewards/cosine_scaled_reward": -0.10991157777607441, "rewards/format_reward": 0.416666679084301, "step": 13 }, { "completion_length": 2978.0208892822266, "epoch": 0.016, "grad_norm": 0.21407712996006012, "kl": 2.9122806154191494e-05, "learning_rate": 2.8e-07, "loss": 0.0, "reward": 0.008545992895960808, "reward_std": 0.8296682890504599, "rewards/cosine_scaled_reward": -0.05597527138888836, "rewards/format_reward": 0.3750000074505806, "step": 14 }, { "completion_length": 2821.291679382324, "epoch": 0.017142857142857144, "grad_norm": 0.18756185472011566, "kl": 2.925284206867218e-05, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.09492451697587967, "reward_std": 0.5804186388850212, "rewards/cosine_scaled_reward": -0.005186626687645912, "rewards/format_reward": 0.41666667349636555, "step": 15 }, { "completion_length": 3463.1458435058594, "epoch": 0.018285714285714287, "grad_norm": 0.19236689805984497, "kl": 3.2588839530944824e-05, "learning_rate": 3.2e-07, "loss": 0.0, "reward": -0.41888961754739285, "reward_std": 0.5801856927573681, "rewards/cosine_scaled_reward": -0.19731347833294421, "rewards/format_reward": 0.10416666977107525, "step": 16 }, { "completion_length": 2245.083366394043, "epoch": 0.019428571428571427, "grad_norm": 0.24209940433502197, "kl": 4.054419696331024e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": 0.233390836045146, "reward_std": 0.9472852721810341, "rewards/cosine_scaled_reward": -0.0257677553454414, "rewards/format_reward": 0.6041666716337204, "step": 17 }, { "completion_length": 2929.062530517578, "epoch": 0.02057142857142857, "grad_norm": 0.14878323674201965, "kl": 2.1327286958694458e-05, "learning_rate": 3.6e-07, "loss": 0.0, "reward": -0.019776458386331797, "reward_std": 0.5773040689527988, "rewards/cosine_scaled_reward": -0.08265479793772101, "rewards/format_reward": 0.4166666716337204, "step": 18 }, { "completion_length": 2807.437545776367, "epoch": 0.021714285714285714, "grad_norm": 0.16009841859340668, "kl": 2.41696834564209e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 0.3507382436655462, "reward_std": 0.921161625534296, "rewards/cosine_scaled_reward": 0.12314888671971858, "rewards/format_reward": 0.4583333395421505, "step": 19 }, { "completion_length": 2367.8958892822266, "epoch": 0.022857142857142857, "grad_norm": 0.18880178034305573, "kl": 1.8640421330928802e-05, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.5233698049560189, "reward_std": 0.9742525108158588, "rewards/cosine_scaled_reward": 0.14227081835269928, "rewards/format_reward": 0.666666679084301, "step": 20 }, { "completion_length": 2770.4583740234375, "epoch": 0.024, "grad_norm": 0.24240563809871674, "kl": 3.499537706375122e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": -0.16247276589274406, "reward_std": 0.5624231658875942, "rewards/cosine_scaled_reward": -0.1772752869874239, "rewards/format_reward": 0.4166666716337204, "step": 21 }, { "completion_length": 1995.1875305175781, "epoch": 0.025142857142857144, "grad_norm": 0.43609920144081116, "kl": 3.091990947723389e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.36330048087984324, "reward_std": 0.6434031687676907, "rewards/cosine_scaled_reward": 0.004692776128649712, "rewards/format_reward": 0.750000013038516, "step": 22 }, { "completion_length": 2544.666717529297, "epoch": 0.026285714285714287, "grad_norm": 0.23585595190525055, "kl": 3.0001625418663025e-05, "learning_rate": 4.6e-07, "loss": 0.0, "reward": -0.11523125227540731, "reward_std": 0.6521053463220596, "rewards/cosine_scaled_reward": -0.16176861617714167, "rewards/format_reward": 0.4375000111758709, "step": 23 }, { "completion_length": 2566.333335876465, "epoch": 0.027428571428571427, "grad_norm": 0.18629349768161774, "kl": 2.0964653231203556e-05, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.23491991590708494, "reward_std": 0.7349754758179188, "rewards/cosine_scaled_reward": -0.0008563240990042686, "rewards/format_reward": 0.5833333469927311, "step": 24 }, { "completion_length": 2641.145881652832, "epoch": 0.02857142857142857, "grad_norm": 0.1711462438106537, "kl": 3.1113624572753906e-05, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.03685340657830238, "reward_std": 0.7854471541941166, "rewards/cosine_scaled_reward": -0.06554372794926167, "rewards/format_reward": 0.43750001303851604, "step": 25 }, { "completion_length": 3136.604217529297, "epoch": 0.029714285714285714, "grad_norm": 0.19311213493347168, "kl": 4.175305366516113e-05, "learning_rate": 5.2e-07, "loss": 0.0, "reward": -0.036069474183022976, "reward_std": 0.7284176610410213, "rewards/cosine_scaled_reward": -0.06703814025968313, "rewards/format_reward": 0.3541666716337204, "step": 26 }, { "completion_length": 2979.4583740234375, "epoch": 0.030857142857142857, "grad_norm": 0.2027575522661209, "kl": 3.305543214082718e-05, "learning_rate": 5.4e-07, "loss": 0.0, "reward": 0.02418377436697483, "reward_std": 0.8362919054925442, "rewards/cosine_scaled_reward": -0.07142194919288158, "rewards/format_reward": 0.4375000111758709, "step": 27 }, { "completion_length": 2897.979202270508, "epoch": 0.032, "grad_norm": 0.19330666959285736, "kl": 3.600865602493286e-05, "learning_rate": 5.6e-07, "loss": 0.0, "reward": 0.09552552457898855, "reward_std": 0.6331899762153625, "rewards/cosine_scaled_reward": -0.015744205564260483, "rewards/format_reward": 0.43750001303851604, "step": 28 }, { "completion_length": 3302.8958740234375, "epoch": 0.03314285714285714, "grad_norm": 0.16206476092338562, "kl": 2.98917293548584e-05, "learning_rate": 5.8e-07, "loss": 0.0, "reward": -0.2337268814444542, "reward_std": 0.5517648197710514, "rewards/cosine_scaled_reward": -0.12987937778234482, "rewards/format_reward": 0.2291666753590107, "step": 29 }, { "completion_length": 2989.604232788086, "epoch": 0.03428571428571429, "grad_norm": 0.15634141862392426, "kl": 2.584606409072876e-05, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.2867018459364772, "reward_std": 0.8898419290781021, "rewards/cosine_scaled_reward": 0.0461630261852406, "rewards/format_reward": 0.541666679084301, "step": 30 }, { "completion_length": 2971.3750228881836, "epoch": 0.03542857142857143, "grad_norm": 0.17451131343841553, "kl": 2.5779008865356445e-05, "learning_rate": 6.2e-07, "loss": 0.0, "reward": -0.1252674162387848, "reward_std": 0.6488835532218218, "rewards/cosine_scaled_reward": -0.11415221774950624, "rewards/format_reward": 0.3333333395421505, "step": 31 }, { "completion_length": 3259.375030517578, "epoch": 0.036571428571428574, "grad_norm": 0.1704079955816269, "kl": 2.6818830519914627e-05, "learning_rate": 6.4e-07, "loss": 0.0, "reward": 0.015448510646820068, "reward_std": 0.6844744682312012, "rewards/cosine_scaled_reward": -0.022644946351647377, "rewards/format_reward": 0.3333333395421505, "step": 32 }, { "completion_length": 3343.5208435058594, "epoch": 0.037714285714285714, "grad_norm": 0.1502685844898224, "kl": 2.950429916381836e-05, "learning_rate": 6.6e-07, "loss": 0.0, "reward": -0.3123129680752754, "reward_std": 0.4822962246835232, "rewards/cosine_scaled_reward": -0.17962717823684216, "rewards/format_reward": 0.22916666977107525, "step": 33 }, { "completion_length": 2390.2083435058594, "epoch": 0.038857142857142854, "grad_norm": 0.32976019382476807, "kl": 3.445148468017578e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": 0.5364574566483498, "reward_std": 0.7043890058994293, "rewards/cosine_scaled_reward": 0.2165452465415001, "rewards/format_reward": 0.5625, "step": 34 }, { "completion_length": 3077.145866394043, "epoch": 0.04, "grad_norm": 0.21368089318275452, "kl": 4.202127456665039e-05, "learning_rate": 7e-07, "loss": 0.0, "reward": -0.110074105206877, "reward_std": 0.9502048678696156, "rewards/cosine_scaled_reward": -0.08587458729743958, "rewards/format_reward": 0.2500000037252903, "step": 35 }, { "completion_length": 3360.687530517578, "epoch": 0.04114285714285714, "grad_norm": 0.1669422686100006, "kl": 3.0137598514556885e-05, "learning_rate": 7.2e-07, "loss": 0.0, "reward": -0.4919657548889518, "reward_std": 0.4837723895907402, "rewards/cosine_scaled_reward": -0.28421421349048615, "rewards/format_reward": 0.18750000558793545, "step": 36 }, { "completion_length": 3358.7291870117188, "epoch": 0.04228571428571429, "grad_norm": 0.1673896312713623, "kl": 1.73947773873806e-05, "learning_rate": 7.4e-07, "loss": 0.0, "reward": -0.35696901264600456, "reward_std": 0.4194028750061989, "rewards/cosine_scaled_reward": -0.20523495320230722, "rewards/format_reward": 0.22916666977107525, "step": 37 }, { "completion_length": 3273.2708435058594, "epoch": 0.04342857142857143, "grad_norm": 0.17379561066627502, "kl": 3.8963742554187775e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": -0.2825834956020117, "reward_std": 0.6573631279170513, "rewards/cosine_scaled_reward": -0.14204833284020424, "rewards/format_reward": 0.1666666679084301, "step": 38 }, { "completion_length": 2792.2500228881836, "epoch": 0.044571428571428574, "grad_norm": 0.2008347511291504, "kl": 1.8621794879436493e-05, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": 0.01606392953544855, "reward_std": 0.6000814624130726, "rewards/cosine_scaled_reward": -0.07735766470432281, "rewards/format_reward": 0.4583333469927311, "step": 39 }, { "completion_length": 2641.7084045410156, "epoch": 0.045714285714285714, "grad_norm": 0.21129944920539856, "kl": 6.28940761089325e-05, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.01600947417318821, "reward_std": 0.6365399248898029, "rewards/cosine_scaled_reward": -0.11234160404001159, "rewards/format_reward": 0.5208333469927311, "step": 40 }, { "completion_length": 2874.9791870117188, "epoch": 0.046857142857142854, "grad_norm": 0.17788062989711761, "kl": 3.086775541305542e-05, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": -0.050673360005021095, "reward_std": 0.9452972374856472, "rewards/cosine_scaled_reward": -0.14972026087343693, "rewards/format_reward": 0.4583333469927311, "step": 41 }, { "completion_length": 2774.2916984558105, "epoch": 0.048, "grad_norm": 0.3309200704097748, "kl": 6.945431232452393e-05, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": -0.2789352214895189, "reward_std": 0.5039437636733055, "rewards/cosine_scaled_reward": -0.24113959958776832, "rewards/format_reward": 0.3958333358168602, "step": 42 }, { "completion_length": 2986.1667098999023, "epoch": 0.04914285714285714, "grad_norm": 0.19303752481937408, "kl": 3.374367952346802e-05, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": -0.041154950857162476, "reward_std": 0.6381138861179352, "rewards/cosine_scaled_reward": -0.029581679904367775, "rewards/format_reward": 0.27083333395421505, "step": 43 }, { "completion_length": 2853.937530517578, "epoch": 0.05028571428571429, "grad_norm": 0.24794019758701324, "kl": 0.00017474591732025146, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": 0.08132272958755493, "reward_std": 0.7235868386924267, "rewards/cosine_scaled_reward": -0.0631332267075777, "rewards/format_reward": 0.5000000111758709, "step": 44 }, { "completion_length": 3266.812530517578, "epoch": 0.05142857142857143, "grad_norm": 0.15977022051811218, "kl": 5.7324767112731934e-05, "learning_rate": 9e-07, "loss": 0.0, "reward": -0.18004001304507256, "reward_std": 0.7050215303897858, "rewards/cosine_scaled_reward": -0.12542735040187836, "rewards/format_reward": 0.2708333358168602, "step": 45 }, { "completion_length": 3250.2083435058594, "epoch": 0.052571428571428575, "grad_norm": 0.19012750685214996, "kl": 8.171796798706055e-05, "learning_rate": 9.2e-07, "loss": 0.0, "reward": -0.32018135441467166, "reward_std": 0.6548310741782188, "rewards/cosine_scaled_reward": -0.17957252322230488, "rewards/format_reward": 0.18750000186264515, "step": 46 }, { "completion_length": 2832.333351135254, "epoch": 0.053714285714285714, "grad_norm": 0.2044163942337036, "kl": 3.51807102560997e-05, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": 0.25673263892531395, "reward_std": 0.8521627373993397, "rewards/cosine_scaled_reward": 0.061620082706213, "rewards/format_reward": 0.47916667349636555, "step": 47 }, { "completion_length": 2832.937530517578, "epoch": 0.054857142857142854, "grad_norm": 0.20488843321800232, "kl": 0.00012751109898090363, "learning_rate": 9.6e-07, "loss": 0.0, "reward": -0.01403064839541912, "reward_std": 0.847560465335846, "rewards/cosine_scaled_reward": -0.08664842648431659, "rewards/format_reward": 0.3958333358168602, "step": 48 }, { "completion_length": 2368.645896911621, "epoch": 0.056, "grad_norm": 0.21413730084896088, "kl": 6.522238254547119e-05, "learning_rate": 9.8e-07, "loss": 0.0, "reward": 0.28362084983382374, "reward_std": 0.9133072569966316, "rewards/cosine_scaled_reward": 0.0036178361624479294, "rewards/format_reward": 0.6250000037252903, "step": 49 }, { "completion_length": 2862.520835876465, "epoch": 0.05714285714285714, "grad_norm": 0.184512659907341, "kl": 0.00011056661605834961, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.04502496123313904, "reward_std": 0.570039439946413, "rewards/cosine_scaled_reward": -0.0024489806964993477, "rewards/format_reward": 0.3541666679084301, "step": 50 }, { "completion_length": 2245.750015258789, "epoch": 0.05828571428571429, "grad_norm": 0.21354830265045166, "kl": 0.00020595639944076538, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": 0.049862007377669215, "reward_std": 0.5920056514441967, "rewards/cosine_scaled_reward": -0.11850284365937114, "rewards/format_reward": 0.5833333358168602, "step": 51 }, { "completion_length": 2897.93754196167, "epoch": 0.05942857142857143, "grad_norm": 0.222417950630188, "kl": 9.300559759140015e-05, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": 0.30677789729088545, "reward_std": 0.9850648231804371, "rewards/cosine_scaled_reward": 0.08488897839561105, "rewards/format_reward": 0.47916667722165585, "step": 52 }, { "completion_length": 2835.979202270508, "epoch": 0.060571428571428575, "grad_norm": 0.2359960377216339, "kl": 0.00013843923807144165, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": 0.12393231969326735, "reward_std": 0.8365762289613485, "rewards/cosine_scaled_reward": -0.017082044621929526, "rewards/format_reward": 0.45833334513008595, "step": 53 }, { "completion_length": 2773.250030517578, "epoch": 0.061714285714285715, "grad_norm": 0.16520988941192627, "kl": 6.0267746448516846e-05, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.4487614845857024, "reward_std": 0.9010818786919117, "rewards/cosine_scaled_reward": 0.14949666894972324, "rewards/format_reward": 0.5625000186264515, "step": 54 }, { "completion_length": 2909.0000762939453, "epoch": 0.06285714285714286, "grad_norm": 0.17167004942893982, "kl": 8.597038686275482e-05, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.11020261491648853, "reward_std": 0.8988244608044624, "rewards/cosine_scaled_reward": 0.003133175428956747, "rewards/format_reward": 0.3750000074505806, "step": 55 }, { "completion_length": 2952.333366394043, "epoch": 0.064, "grad_norm": 0.18516214191913605, "kl": 4.446879029273987e-05, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": -0.027613874524831772, "reward_std": 0.6530672702938318, "rewards/cosine_scaled_reward": -0.09866334870457649, "rewards/format_reward": 0.4375000111758709, "step": 56 }, { "completion_length": 3412.1041870117188, "epoch": 0.06514285714285714, "grad_norm": 0.12485022842884064, "kl": 1.195073127746582e-05, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": -0.1474010832607746, "reward_std": 0.7124437093734741, "rewards/cosine_scaled_reward": -0.10700338426977396, "rewards/format_reward": 0.2708333358168602, "step": 57 }, { "completion_length": 2150.3750534057617, "epoch": 0.06628571428571428, "grad_norm": 0.1975310742855072, "kl": 0.0003490075469017029, "learning_rate": 9.992983438818915e-07, "loss": 0.0, "reward": 0.4446102287620306, "reward_std": 0.8766555078327656, "rewards/cosine_scaled_reward": 0.0772969089448452, "rewards/format_reward": 0.6875000186264515, "step": 58 }, { "completion_length": 2856.041717529297, "epoch": 0.06742857142857143, "grad_norm": 0.1683138906955719, "kl": 3.864988684654236e-05, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": -0.09963385201990604, "reward_std": 0.6718280278146267, "rewards/cosine_scaled_reward": -0.10808059107512236, "rewards/format_reward": 0.35416668094694614, "step": 59 }, { "completion_length": 3047.6041870117188, "epoch": 0.06857142857142857, "grad_norm": 0.16972136497497559, "kl": 4.49158251285553e-05, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": -0.3548573371954262, "reward_std": 0.5078517571091652, "rewards/cosine_scaled_reward": -0.25396787002682686, "rewards/format_reward": 0.31250000186264515, "step": 60 }, { "completion_length": 3066.0000610351562, "epoch": 0.06971428571428571, "grad_norm": 0.16273915767669678, "kl": 0.00016046315431594849, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": 0.08701966446824372, "reward_std": 0.6688249669969082, "rewards/cosine_scaled_reward": -0.06528997980058193, "rewards/format_reward": 0.5208333395421505, "step": 61 }, { "completion_length": 2751.4583740234375, "epoch": 0.07085714285714285, "grad_norm": 0.28360143303871155, "kl": 0.0005411431193351746, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": 0.1395623767748475, "reward_std": 0.8662982396781445, "rewards/cosine_scaled_reward": -0.021802921197377145, "rewards/format_reward": 0.479166679084301, "step": 62 }, { "completion_length": 2314.854217529297, "epoch": 0.072, "grad_norm": 0.21262036263942719, "kl": 0.0005500763654708862, "learning_rate": 9.981479793771866e-07, "loss": 0.0, "reward": 0.5263404976576567, "reward_std": 0.880022443830967, "rewards/cosine_scaled_reward": 0.14735051710158587, "rewards/format_reward": 0.6666666716337204, "step": 63 }, { "completion_length": 2922.4583892822266, "epoch": 0.07314285714285715, "grad_norm": 0.17176398634910583, "kl": 0.00019650161266326904, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": 0.010981407947838306, "reward_std": 0.8544860817492008, "rewards/cosine_scaled_reward": -0.04802468419075012, "rewards/format_reward": 0.35416667349636555, "step": 64 }, { "completion_length": 2751.0625381469727, "epoch": 0.07428571428571429, "grad_norm": 0.19784730672836304, "kl": 0.00014169886708259583, "learning_rate": 9.975348529157229e-07, "loss": 0.0, "reward": -0.14385659247636795, "reward_std": 0.6372686810791492, "rewards/cosine_scaled_reward": -0.16209950670599937, "rewards/format_reward": 0.3958333358168602, "step": 65 }, { "completion_length": 2123.6666717529297, "epoch": 0.07542857142857143, "grad_norm": 0.2654637396335602, "kl": 0.00044383853673934937, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "reward": 0.21715208888053894, "reward_std": 0.5327535588294268, "rewards/cosine_scaled_reward": 0.03916000574827194, "rewards/format_reward": 0.5, "step": 66 }, { "completion_length": 3416.0625, "epoch": 0.07657142857142857, "grad_norm": 0.14119970798492432, "kl": 0.0003875941038131714, "learning_rate": 9.968344786479415e-07, "loss": 0.0, "reward": -0.544161144644022, "reward_std": 0.4329412467777729, "rewards/cosine_scaled_reward": -0.28361437405692413, "rewards/format_reward": 0.1250000037252903, "step": 67 }, { "completion_length": 2162.645881652832, "epoch": 0.07771428571428571, "grad_norm": 0.25515133142471313, "kl": 0.0016131103038787842, "learning_rate": 9.964516155915151e-07, "loss": 0.0001, "reward": 0.18700658343732357, "reward_std": 0.9016527011990547, "rewards/cosine_scaled_reward": -0.053536335937678814, "rewards/format_reward": 0.6041666716337204, "step": 68 }, { "completion_length": 2560.0625534057617, "epoch": 0.07885714285714286, "grad_norm": 0.2858283817768097, "kl": 0.0012285411357879639, "learning_rate": 9.960469931131936e-07, "loss": 0.0, "reward": -0.2131683579646051, "reward_std": 0.6651364080607891, "rewards/cosine_scaled_reward": -0.23473481088876724, "rewards/format_reward": 0.43750000558793545, "step": 69 }, { "completion_length": 3074.875015258789, "epoch": 0.08, "grad_norm": 0.18202482163906097, "kl": 0.0006042998284101486, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": -0.12620488926768303, "reward_std": 0.6856267843395472, "rewards/cosine_scaled_reward": -0.13051698391791433, "rewards/format_reward": 0.3541666679084301, "step": 70 }, { "completion_length": 2711.4375228881836, "epoch": 0.08114285714285714, "grad_norm": 0.37329548597335815, "kl": 0.0009671822190284729, "learning_rate": 9.951725498333448e-07, "loss": 0.0, "reward": 0.09207316488027573, "reward_std": 0.7490551918745041, "rewards/cosine_scaled_reward": 0.00596673134714365, "rewards/format_reward": 0.37500000558793545, "step": 71 }, { "completion_length": 2840.895866394043, "epoch": 0.08228571428571428, "grad_norm": 0.2226312905550003, "kl": 0.0008899271488189697, "learning_rate": 9.947027716509488e-07, "loss": 0.0, "reward": -0.16130333952605724, "reward_std": 0.48493045195937157, "rewards/cosine_scaled_reward": -0.13195209205150604, "rewards/format_reward": 0.33333334140479565, "step": 72 }, { "completion_length": 3428.4583740234375, "epoch": 0.08342857142857144, "grad_norm": 0.18076418340206146, "kl": 0.00024478137493133545, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "reward": -0.3564223051071167, "reward_std": 0.7073610313236713, "rewards/cosine_scaled_reward": -0.18848324241116643, "rewards/format_reward": 0.14583333767950535, "step": 73 }, { "completion_length": 3174.479248046875, "epoch": 0.08457142857142858, "grad_norm": 0.19092188775539398, "kl": 0.0007887572282925248, "learning_rate": 9.93698216681727e-07, "loss": 0.0, "reward": -0.04798390786163509, "reward_std": 0.9048562906682491, "rewards/cosine_scaled_reward": -0.06373507156968117, "rewards/format_reward": 0.2916666716337204, "step": 74 }, { "completion_length": 3045.8958587646484, "epoch": 0.08571428571428572, "grad_norm": 0.1473468691110611, "kl": 0.0011369436979293823, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "reward": 0.051474731182679534, "reward_std": 0.6175347343087196, "rewards/cosine_scaled_reward": 0.00460137240588665, "rewards/format_reward": 0.3333333358168602, "step": 75 }, { "completion_length": 2876.5208740234375, "epoch": 0.08685714285714285, "grad_norm": 0.22771689295768738, "kl": 0.00022584199905395508, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": -0.2855505235493183, "reward_std": 0.3848421312868595, "rewards/cosine_scaled_reward": -0.23991595953702927, "rewards/format_reward": 0.39583333767950535, "step": 76 }, { "completion_length": 2967.3958435058594, "epoch": 0.088, "grad_norm": 0.15564849972724915, "kl": 0.00028374046087265015, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": -0.11336795706301928, "reward_std": 0.42253825441002846, "rewards/cosine_scaled_reward": -0.09275355748832226, "rewards/format_reward": 0.33333334140479565, "step": 77 }, { "completion_length": 3223.9583892822266, "epoch": 0.08914285714285715, "grad_norm": 0.1456020325422287, "kl": 0.00031157582998275757, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "reward": -0.039307162165641785, "reward_std": 0.6996998488903046, "rewards/cosine_scaled_reward": -0.04336683638393879, "rewards/format_reward": 0.3125000074505806, "step": 78 }, { "completion_length": 2270.0417098999023, "epoch": 0.09028571428571429, "grad_norm": 0.20563355088233948, "kl": 0.0013409852981567383, "learning_rate": 9.908088623197048e-07, "loss": 0.0001, "reward": 0.09923344664275646, "reward_std": 0.7259157933294773, "rewards/cosine_scaled_reward": -0.10380276478827, "rewards/format_reward": 0.6041666679084301, "step": 79 }, { "completion_length": 3190.3333587646484, "epoch": 0.09142857142857143, "grad_norm": 0.21478085219860077, "kl": 0.0009518936276435852, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": -0.2797470228979364, "reward_std": 0.6308383457362652, "rewards/cosine_scaled_reward": -0.18668191879987717, "rewards/format_reward": 0.27083333767950535, "step": 80 }, { "completion_length": 2895.583354949951, "epoch": 0.09257142857142857, "grad_norm": 0.2533528804779053, "kl": 0.002989828586578369, "learning_rate": 9.895025252503755e-07, "loss": 0.0001, "reward": -0.1824733428657055, "reward_std": 0.5710629485547543, "rewards/cosine_scaled_reward": -0.15205951500684023, "rewards/format_reward": 0.33333333395421505, "step": 81 }, { "completion_length": 2906.6458740234375, "epoch": 0.09371428571428571, "grad_norm": 0.19861973822116852, "kl": 0.0019384026527404785, "learning_rate": 9.888172094375033e-07, "loss": 0.0001, "reward": 0.002590768039226532, "reward_std": 0.6845040954649448, "rewards/cosine_scaled_reward": -0.050318428009632044, "rewards/format_reward": 0.37500000931322575, "step": 82 }, { "completion_length": 2862.9583587646484, "epoch": 0.09485714285714286, "grad_norm": 0.5050163865089417, "kl": 0.0011357255280017853, "learning_rate": 9.881105062929221e-07, "loss": 0.0, "reward": -0.2405467852950096, "reward_std": 0.5951391458511353, "rewards/cosine_scaled_reward": -0.16155940247699618, "rewards/format_reward": 0.27083333395421505, "step": 83 }, { "completion_length": 3058.375045776367, "epoch": 0.096, "grad_norm": 0.1727391928434372, "kl": 0.0007171034812927246, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": 0.07485563680529594, "reward_std": 0.8334123902022839, "rewards/cosine_scaled_reward": -0.0007621082477271557, "rewards/format_reward": 0.35416667349636555, "step": 84 }, { "completion_length": 2977.7709045410156, "epoch": 0.09714285714285714, "grad_norm": 0.14627371728420258, "kl": 0.0006437301635742188, "learning_rate": 9.866330768241983e-07, "loss": 0.0, "reward": -0.01777968415990472, "reward_std": 0.8515256457030773, "rewards/cosine_scaled_reward": -0.09596842993050814, "rewards/format_reward": 0.4166666753590107, "step": 85 }, { "completion_length": 2917.0625228881836, "epoch": 0.09828571428571428, "grad_norm": 0.20686082541942596, "kl": 0.001413583755493164, "learning_rate": 9.85862422507884e-07, "loss": 0.0001, "reward": 0.06965514738112688, "reward_std": 0.6772139333188534, "rewards/cosine_scaled_reward": -0.03397463448345661, "rewards/format_reward": 0.4375000074505806, "step": 86 }, { "completion_length": 2726.0833740234375, "epoch": 0.09942857142857142, "grad_norm": 0.22425347566604614, "kl": 0.001748785376548767, "learning_rate": 9.850705248720068e-07, "loss": 0.0001, "reward": 0.15189366973936558, "reward_std": 0.8081401586532593, "rewards/cosine_scaled_reward": -0.0596234705299139, "rewards/format_reward": 0.5833333469927311, "step": 87 }, { "completion_length": 2768.354202270508, "epoch": 0.10057142857142858, "grad_norm": 0.1945296823978424, "kl": 0.002580702304840088, "learning_rate": 9.8425742251254e-07, "loss": 0.0001, "reward": -0.015582697466015816, "reward_std": 0.7642906121909618, "rewards/cosine_scaled_reward": -0.13294149283319712, "rewards/format_reward": 0.5, "step": 88 }, { "completion_length": 3245.0833740234375, "epoch": 0.10171428571428572, "grad_norm": 0.1559896320104599, "kl": 0.0016347765922546387, "learning_rate": 9.83423155058946e-07, "loss": 0.0001, "reward": 0.022897440940141678, "reward_std": 0.6935332976281643, "rewards/cosine_scaled_reward": -0.016921459697186947, "rewards/format_reward": 0.3333333395421505, "step": 89 }, { "completion_length": 2285.729179382324, "epoch": 0.10285714285714286, "grad_norm": 0.3084622621536255, "kl": 0.0027695298194885254, "learning_rate": 9.825677631722435e-07, "loss": 0.0001, "reward": -0.18626173213124275, "reward_std": 0.48771001771092415, "rewards/cosine_scaled_reward": -0.2529527278384194, "rewards/format_reward": 0.5416666679084301, "step": 90 }, { "completion_length": 2982.937530517578, "epoch": 0.104, "grad_norm": 0.2009773999452591, "kl": 0.0014831870794296265, "learning_rate": 9.816912885430258e-07, "loss": 0.0001, "reward": -0.15441028028726578, "reward_std": 0.5955647341907024, "rewards/cosine_scaled_reward": -0.11808557622134686, "rewards/format_reward": 0.3125000074505806, "step": 91 }, { "completion_length": 2740.3125228881836, "epoch": 0.10514285714285715, "grad_norm": 0.22146174311637878, "kl": 0.003424704074859619, "learning_rate": 9.807937738894303e-07, "loss": 0.0001, "reward": -0.21460794005542994, "reward_std": 0.6763530150055885, "rewards/cosine_scaled_reward": -0.24455153848975897, "rewards/format_reward": 0.4583333469927311, "step": 92 }, { "completion_length": 3584.0, "epoch": 0.10628571428571429, "grad_norm": 0.15897655487060547, "kl": 0.0015739202499389648, "learning_rate": 9.798752629550546e-07, "loss": 0.0001, "reward": -0.5496739558875561, "reward_std": 0.33528773859143257, "rewards/cosine_scaled_reward": -0.22698437981307507, "rewards/format_reward": 0.02083333395421505, "step": 93 }, { "completion_length": 3051.958366394043, "epoch": 0.10742857142857143, "grad_norm": 0.17778827250003815, "kl": 0.0024556964635849, "learning_rate": 9.78935800506826e-07, "loss": 0.0001, "reward": -0.12991986190900207, "reward_std": 0.6625968441367149, "rewards/cosine_scaled_reward": -0.10003662994131446, "rewards/format_reward": 0.2916666716337204, "step": 94 }, { "completion_length": 3445.2500610351562, "epoch": 0.10857142857142857, "grad_norm": 0.1542736142873764, "kl": 0.0006103254854679108, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "reward": -0.22923543583601713, "reward_std": 0.6721294671297073, "rewards/cosine_scaled_reward": -0.14534958777949214, "rewards/format_reward": 0.25000000931322575, "step": 95 }, { "completion_length": 2762.812545776367, "epoch": 0.10971428571428571, "grad_norm": 0.1754182130098343, "kl": 0.0021101534366607666, "learning_rate": 9.769942052400235e-07, "loss": 0.0001, "reward": -0.0632266215980053, "reward_std": 0.49075169675052166, "rewards/cosine_scaled_reward": -0.09756997041404247, "rewards/format_reward": 0.39583333395421505, "step": 96 }, { "completion_length": 3206.5208740234375, "epoch": 0.11085714285714286, "grad_norm": 0.175271138548851, "kl": 0.0009508877992630005, "learning_rate": 9.759921670520634e-07, "loss": 0.0, "reward": 0.05997042031958699, "reward_std": 0.8464921619743109, "rewards/cosine_scaled_reward": -0.04258055402897298, "rewards/format_reward": 0.41666668094694614, "step": 97 }, { "completion_length": 2683.291702270508, "epoch": 0.112, "grad_norm": 0.1968272626399994, "kl": 0.00044539570808410645, "learning_rate": 9.749693666068663e-07, "loss": 0.0, "reward": 0.1016645822674036, "reward_std": 0.7722124308347702, "rewards/cosine_scaled_reward": -0.06791386939585209, "rewards/format_reward": 0.520833345130086, "step": 98 }, { "completion_length": 2768.7500076293945, "epoch": 0.11314285714285714, "grad_norm": 0.22673000395298004, "kl": 0.0013333559036254883, "learning_rate": 9.739258537542835e-07, "loss": 0.0001, "reward": -0.15783914551138878, "reward_std": 0.5159456543624401, "rewards/cosine_scaled_reward": -0.11131853237748146, "rewards/format_reward": 0.2916666679084301, "step": 99 }, { "completion_length": 2545.4792098999023, "epoch": 0.11428571428571428, "grad_norm": 0.18069936335086823, "kl": 0.002035856246948242, "learning_rate": 9.728616793536587e-07, "loss": 0.0001, "reward": 0.4198254104703665, "reward_std": 0.8630654141306877, "rewards/cosine_scaled_reward": 0.12847260013222694, "rewards/format_reward": 0.5625000111758709, "step": 100 }, { "completion_length": 2497.000030517578, "epoch": 0.11542857142857142, "grad_norm": 0.22976230084896088, "kl": 0.0024021267890930176, "learning_rate": 9.717768952713511e-07, "loss": 0.0001, "reward": 0.06796598061919212, "reward_std": 0.587594460695982, "rewards/cosine_scaled_reward": -0.033663152426015586, "rewards/format_reward": 0.4375, "step": 101 }, { "completion_length": 2184.916717529297, "epoch": 0.11657142857142858, "grad_norm": 0.24666009843349457, "kl": 0.003571033477783203, "learning_rate": 9.706715543782064e-07, "loss": 0.0001, "reward": 0.22202930855564773, "reward_std": 0.7155356109142303, "rewards/cosine_scaled_reward": -0.06345116719603539, "rewards/format_reward": 0.6875000186264515, "step": 102 }, { "completion_length": 2856.0000610351562, "epoch": 0.11771428571428572, "grad_norm": 0.2828216850757599, "kl": 0.0014602243900299072, "learning_rate": 9.695457105469804e-07, "loss": 0.0001, "reward": 0.047393606044352055, "reward_std": 0.709520248696208, "rewards/cosine_scaled_reward": -0.06169603951275349, "rewards/format_reward": 0.45833334513008595, "step": 103 }, { "completion_length": 2754.2083587646484, "epoch": 0.11885714285714286, "grad_norm": 0.6580407023429871, "kl": 0.02184271812438965, "learning_rate": 9.683994186497132e-07, "loss": 0.0009, "reward": -0.11339985858649015, "reward_std": 0.5312070623040199, "rewards/cosine_scaled_reward": -0.11011355556547642, "rewards/format_reward": 0.35416666977107525, "step": 104 }, { "completion_length": 2917.104232788086, "epoch": 0.12, "grad_norm": 0.2027282565832138, "kl": 0.0013439655303955078, "learning_rate": 9.672327345550543e-07, "loss": 0.0001, "reward": -0.16319485567510128, "reward_std": 0.8842182755470276, "rewards/cosine_scaled_reward": -0.14117630943655968, "rewards/format_reward": 0.291666679084301, "step": 105 }, { "completion_length": 2187.4166870117188, "epoch": 0.12114285714285715, "grad_norm": 0.23987863957881927, "kl": 0.0010932087898254395, "learning_rate": 9.66045715125541e-07, "loss": 0.0, "reward": 0.5684326654300094, "reward_std": 0.8030262924730778, "rewards/cosine_scaled_reward": 0.16187836416065693, "rewards/format_reward": 0.7083333414047956, "step": 106 }, { "completion_length": 2847.750030517578, "epoch": 0.12228571428571429, "grad_norm": 0.2214292734861374, "kl": 0.001035928726196289, "learning_rate": 9.648384182148252e-07, "loss": 0.0, "reward": 0.11794419679790735, "reward_std": 0.49015025421977043, "rewards/cosine_scaled_reward": -0.0253047663718462, "rewards/format_reward": 0.5000000037252903, "step": 107 }, { "completion_length": 2861.7916870117188, "epoch": 0.12342857142857143, "grad_norm": 0.21802686154842377, "kl": 0.001120924949645996, "learning_rate": 9.636109026648554e-07, "loss": 0.0, "reward": 0.12055499106645584, "reward_std": 0.8332936242222786, "rewards/cosine_scaled_reward": -0.0109476950019598, "rewards/format_reward": 0.43750000558793545, "step": 108 }, { "completion_length": 3038.4583587646484, "epoch": 0.12457142857142857, "grad_norm": 0.17514924705028534, "kl": 0.0005573034286499023, "learning_rate": 9.623632283030077e-07, "loss": 0.0, "reward": -0.07062571635469794, "reward_std": 0.6698406562209129, "rewards/cosine_scaled_reward": -0.08270325418561697, "rewards/format_reward": 0.3333333395421505, "step": 109 }, { "completion_length": 2691.1250228881836, "epoch": 0.12571428571428572, "grad_norm": 0.2149229645729065, "kl": 0.000943649560213089, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "reward": -0.06578723713755608, "reward_std": 0.8007752932608128, "rewards/cosine_scaled_reward": -0.16101466468535364, "rewards/format_reward": 0.4791666716337204, "step": 110 }, { "completion_length": 3023.562545776367, "epoch": 0.12685714285714286, "grad_norm": 0.1930156946182251, "kl": 0.0017292499542236328, "learning_rate": 9.598076473627796e-07, "loss": 0.0001, "reward": 0.05401752423495054, "reward_std": 0.6248798258602619, "rewards/cosine_scaled_reward": -0.012754572555422783, "rewards/format_reward": 0.3750000149011612, "step": 111 }, { "completion_length": 3057.2083740234375, "epoch": 0.128, "grad_norm": 0.17147748172283173, "kl": 0.000888526439666748, "learning_rate": 9.58499865339809e-07, "loss": 0.0, "reward": 0.12023577280342579, "reward_std": 1.0201664790511131, "rewards/cosine_scaled_reward": -0.015141034498810768, "rewards/format_reward": 0.41666668094694614, "step": 112 }, { "completion_length": 2712.000030517578, "epoch": 0.12914285714285714, "grad_norm": 0.2709903419017792, "kl": 0.0015894174575805664, "learning_rate": 9.571721736097088e-07, "loss": 0.0001, "reward": 0.06785004865378141, "reward_std": 0.7393594160676003, "rewards/cosine_scaled_reward": -0.029724635183811188, "rewards/format_reward": 0.41666667349636555, "step": 113 }, { "completion_length": 2551.7500228881836, "epoch": 0.13028571428571428, "grad_norm": 0.23112726211547852, "kl": 0.002204298973083496, "learning_rate": 9.55824636882301e-07, "loss": 0.0001, "reward": -0.08507447713054717, "reward_std": 0.6186108030378819, "rewards/cosine_scaled_reward": -0.19081172411097214, "rewards/format_reward": 0.5416666846722364, "step": 114 }, { "completion_length": 2909.4583587646484, "epoch": 0.13142857142857142, "grad_norm": 0.20251956582069397, "kl": 0.002556443214416504, "learning_rate": 9.54457320834625e-07, "loss": 0.0001, "reward": 0.06335902085993439, "reward_std": 0.6296372078359127, "rewards/cosine_scaled_reward": -0.025603776797652245, "rewards/format_reward": 0.41666666977107525, "step": 115 }, { "completion_length": 3241.979179382324, "epoch": 0.13257142857142856, "grad_norm": 0.19871990382671356, "kl": 0.0013921260833740234, "learning_rate": 9.530702921077358e-07, "loss": 0.0001, "reward": -0.24349494744092226, "reward_std": 0.7026082035154104, "rewards/cosine_scaled_reward": -0.13611777569167316, "rewards/format_reward": 0.2083333358168602, "step": 116 }, { "completion_length": 2976.6041870117188, "epoch": 0.1337142857142857, "grad_norm": 0.20164498686790466, "kl": 0.0022377967834472656, "learning_rate": 9.516636183034564e-07, "loss": 0.0001, "reward": -0.2154933400452137, "reward_std": 0.7364092990756035, "rewards/cosine_scaled_reward": -0.16451909206807613, "rewards/format_reward": 0.2916666679084301, "step": 117 }, { "completion_length": 2854.416732788086, "epoch": 0.13485714285714287, "grad_norm": 0.18143188953399658, "kl": 0.001271367073059082, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": 0.48517339397221804, "reward_std": 1.1588403210043907, "rewards/cosine_scaled_reward": 0.1642640804639086, "rewards/format_reward": 0.5416666753590107, "step": 118 }, { "completion_length": 2527.312530517578, "epoch": 0.136, "grad_norm": 0.2451496571302414, "kl": 0.004798531532287598, "learning_rate": 9.487916106540465e-07, "loss": 0.0002, "reward": 0.31419147294946015, "reward_std": 0.5915131606161594, "rewards/cosine_scaled_reward": 0.07133831456303596, "rewards/format_reward": 0.5625000037252903, "step": 119 }, { "completion_length": 2332.937526702881, "epoch": 0.13714285714285715, "grad_norm": 0.23065590858459473, "kl": 0.0033817291259765625, "learning_rate": 9.473264167865171e-07, "loss": 0.0001, "reward": 0.3343121665529907, "reward_std": 0.7079905085265636, "rewards/cosine_scaled_reward": 0.032319008372724056, "rewards/format_reward": 0.6458333395421505, "step": 120 }, { "completion_length": 1775.125015258789, "epoch": 0.1382857142857143, "grad_norm": 0.2634807825088501, "kl": 0.004510760307312012, "learning_rate": 9.458418577899774e-07, "loss": 0.0002, "reward": 0.29028519266285, "reward_std": 0.6599521674215794, "rewards/cosine_scaled_reward": -0.04384784400463104, "rewards/format_reward": 0.7500000149011612, "step": 121 }, { "completion_length": 2823.0208740234375, "epoch": 0.13942857142857143, "grad_norm": 0.18922610580921173, "kl": 0.0015087127685546875, "learning_rate": 9.443380060197385e-07, "loss": 0.0001, "reward": 0.09716521622613072, "reward_std": 0.784220265224576, "rewards/cosine_scaled_reward": -0.04663149267435074, "rewards/format_reward": 0.4791666716337204, "step": 122 }, { "completion_length": 2748.3125610351562, "epoch": 0.14057142857142857, "grad_norm": 0.18109041452407837, "kl": 0.001852273941040039, "learning_rate": 9.428149347714143e-07, "loss": 0.0001, "reward": 0.029360684799030423, "reward_std": 0.6506396494805813, "rewards/cosine_scaled_reward": -0.08873580838553607, "rewards/format_reward": 0.47916666977107525, "step": 123 }, { "completion_length": 2168.9583740234375, "epoch": 0.1417142857142857, "grad_norm": 0.21630355715751648, "kl": 0.004555702209472656, "learning_rate": 9.412727182773486e-07, "loss": 0.0002, "reward": 0.06176229752600193, "reward_std": 0.7377404235303402, "rewards/cosine_scaled_reward": -0.12269194982945919, "rewards/format_reward": 0.5833333395421505, "step": 124 }, { "completion_length": 2883.666702270508, "epoch": 0.14285714285714285, "grad_norm": 0.21693550050258636, "kl": 0.001661539077758789, "learning_rate": 9.397114317029974e-07, "loss": 0.0001, "reward": -0.012577313929796219, "reward_std": 0.5609125196933746, "rewards/cosine_scaled_reward": -0.0033534724498167634, "rewards/format_reward": 0.27083333395421505, "step": 125 }, { "completion_length": 2796.4584045410156, "epoch": 0.144, "grad_norm": 0.16647285223007202, "kl": 0.0015259981155395508, "learning_rate": 9.381311511432658e-07, "loss": 0.0001, "reward": 0.02252695895731449, "reward_std": 0.5781615749001503, "rewards/cosine_scaled_reward": -0.0809040479362011, "rewards/format_reward": 0.479166679084301, "step": 126 }, { "completion_length": 2886.479202270508, "epoch": 0.14514285714285713, "grad_norm": 0.2508350610733032, "kl": 0.0021495819091796875, "learning_rate": 9.36531953618799e-07, "loss": 0.0001, "reward": -0.1614384746644646, "reward_std": 0.5212756060063839, "rewards/cosine_scaled_reward": -0.16197758680209517, "rewards/format_reward": 0.3958333432674408, "step": 127 }, { "completion_length": 2887.708366394043, "epoch": 0.1462857142857143, "grad_norm": 0.20055890083312988, "kl": 0.0028142929077148438, "learning_rate": 9.34913917072228e-07, "loss": 0.0001, "reward": 0.24019910395145416, "reward_std": 0.7546846177428961, "rewards/cosine_scaled_reward": 0.09899781085550785, "rewards/format_reward": 0.3958333358168602, "step": 128 }, { "completion_length": 3410.8333740234375, "epoch": 0.14742857142857144, "grad_norm": 0.18904834985733032, "kl": 0.002665996551513672, "learning_rate": 9.332771203643714e-07, "loss": 0.0001, "reward": -0.25656406581401825, "reward_std": 0.6125860698521137, "rewards/cosine_scaled_reward": -0.12984570860862732, "rewards/format_reward": 0.18750000186264515, "step": 129 }, { "completion_length": 2878.7500228881836, "epoch": 0.14857142857142858, "grad_norm": 0.18152809143066406, "kl": 0.0019643306732177734, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": -0.2177959904074669, "reward_std": 0.5922619923949242, "rewards/cosine_scaled_reward": -0.16636610962450504, "rewards/format_reward": 0.31250000186264515, "step": 130 }, { "completion_length": 2894.3125228881836, "epoch": 0.14971428571428572, "grad_norm": 0.19060392677783966, "kl": 0.0039920806884765625, "learning_rate": 9.299475664759068e-07, "loss": 0.0002, "reward": 0.15484683774411678, "reward_std": 0.8720951899886131, "rewards/cosine_scaled_reward": 0.04138324782252312, "rewards/format_reward": 0.3750000037252903, "step": 131 }, { "completion_length": 2647.1250762939453, "epoch": 0.15085714285714286, "grad_norm": 0.2062901258468628, "kl": 0.002198457717895508, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, "reward": 0.21524347737431526, "reward_std": 0.9688759073615074, "rewards/cosine_scaled_reward": 0.02236761897802353, "rewards/format_reward": 0.4791666753590107, "step": 132 }, { "completion_length": 3305.375030517578, "epoch": 0.152, "grad_norm": 0.19573596119880676, "kl": 0.002875089645385742, "learning_rate": 9.265439410565328e-07, "loss": 0.0001, "reward": -0.3930224981158972, "reward_std": 0.40591937862336636, "rewards/cosine_scaled_reward": -0.22959117405116558, "rewards/format_reward": 0.2291666716337204, "step": 133 }, { "completion_length": 2513.2291870117188, "epoch": 0.15314285714285714, "grad_norm": 0.22160597145557404, "kl": 0.0038213729858398438, "learning_rate": 9.248145583195447e-07, "loss": 0.0002, "reward": 0.14458692615153268, "reward_std": 0.6939279437065125, "rewards/cosine_scaled_reward": -0.046293994411826134, "rewards/format_reward": 0.562500013038516, "step": 134 }, { "completion_length": 2181.979217529297, "epoch": 0.15428571428571428, "grad_norm": 0.2493942677974701, "kl": 0.0032486915588378906, "learning_rate": 9.230669076497687e-07, "loss": 0.0001, "reward": 0.49393612146377563, "reward_std": 0.9125401228666306, "rewards/cosine_scaled_reward": 0.16448672115802765, "rewards/format_reward": 0.5833333395421505, "step": 135 }, { "completion_length": 2919.604248046875, "epoch": 0.15542857142857142, "grad_norm": 0.16584192216396332, "kl": 0.0039215087890625, "learning_rate": 9.213010742252327e-07, "loss": 0.0002, "reward": 0.26563066989183426, "reward_std": 0.9960127659142017, "rewards/cosine_scaled_reward": 0.06982969990349375, "rewards/format_reward": 0.4583333432674408, "step": 136 }, { "completion_length": 2992.1458892822266, "epoch": 0.15657142857142858, "grad_norm": 0.16871950030326843, "kl": 0.0024118423461914062, "learning_rate": 9.195171441101668e-07, "loss": 0.0001, "reward": -0.12757205590605736, "reward_std": 0.6657307185232639, "rewards/cosine_scaled_reward": -0.10885505130863748, "rewards/format_reward": 0.31250000186264515, "step": 137 }, { "completion_length": 2352.166748046875, "epoch": 0.15771428571428572, "grad_norm": 0.23502451181411743, "kl": 0.0022039413452148438, "learning_rate": 9.177152042508077e-07, "loss": 0.0001, "reward": 0.05814027041196823, "reward_std": 0.5509532168507576, "rewards/cosine_scaled_reward": -0.12178020738065243, "rewards/format_reward": 0.6041666828095913, "step": 138 }, { "completion_length": 3276.6041870117188, "epoch": 0.15885714285714286, "grad_norm": 0.18198812007904053, "kl": 0.004057884216308594, "learning_rate": 9.158953424711624e-07, "loss": 0.0002, "reward": -0.3338587903417647, "reward_std": 0.5466504357755184, "rewards/cosine_scaled_reward": -0.21947086788713932, "rewards/format_reward": 0.27083334513008595, "step": 139 }, { "completion_length": 2934.854202270508, "epoch": 0.16, "grad_norm": 0.230448380112648, "kl": 0.004750251770019531, "learning_rate": 9.140576474687263e-07, "loss": 0.0002, "reward": -0.08668341906741261, "reward_std": 0.6167776882648468, "rewards/cosine_scaled_reward": -0.07037415914237499, "rewards/format_reward": 0.29166667722165585, "step": 140 }, { "completion_length": 2843.229248046875, "epoch": 0.16114285714285714, "grad_norm": 0.22269631922245026, "kl": 0.004426002502441406, "learning_rate": 9.122022088101613e-07, "loss": 0.0002, "reward": 0.011526300571858883, "reward_std": 0.9608168490231037, "rewards/cosine_scaled_reward": -0.12784388910222333, "rewards/format_reward": 0.500000013038516, "step": 141 }, { "completion_length": 2842.2084197998047, "epoch": 0.16228571428571428, "grad_norm": 0.201444610953331, "kl": 0.003329753875732422, "learning_rate": 9.103291169269299e-07, "loss": 0.0001, "reward": 0.07312965765595436, "reward_std": 0.7833281680941582, "rewards/cosine_scaled_reward": -0.09690700098872185, "rewards/format_reward": 0.5625000111758709, "step": 142 }, { "completion_length": 2778.687530517578, "epoch": 0.16342857142857142, "grad_norm": 0.2938580811023712, "kl": 0.0045318603515625, "learning_rate": 9.084384631108882e-07, "loss": 0.0002, "reward": -0.14028875157237053, "reward_std": 0.5204294696450233, "rewards/cosine_scaled_reward": -0.14672685600817204, "rewards/format_reward": 0.3958333395421505, "step": 143 }, { "completion_length": 2723.229217529297, "epoch": 0.16457142857142856, "grad_norm": 0.2279835343360901, "kl": 0.0032444000244140625, "learning_rate": 9.065303395098358e-07, "loss": 0.0001, "reward": 0.12287670839577913, "reward_std": 0.7171412445604801, "rewards/cosine_scaled_reward": 0.005602353252470493, "rewards/format_reward": 0.4166666753590107, "step": 144 }, { "completion_length": 2038.9584007263184, "epoch": 0.1657142857142857, "grad_norm": 0.3204021155834198, "kl": 0.004198551177978516, "learning_rate": 9.046048391230247e-07, "loss": 0.0002, "reward": 0.3942429169546813, "reward_std": 0.6445030942559242, "rewards/cosine_scaled_reward": 0.07814209163188934, "rewards/format_reward": 0.6458333395421505, "step": 145 }, { "completion_length": 2138.0209045410156, "epoch": 0.16685714285714287, "grad_norm": 0.20871621370315552, "kl": 0.0025124549865722656, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": 0.16610028222203255, "reward_std": 0.7146016918122768, "rewards/cosine_scaled_reward": -0.11062343697994947, "rewards/format_reward": 0.7083333469927311, "step": 146 }, { "completion_length": 2778.729179382324, "epoch": 0.168, "grad_norm": 0.20433104038238525, "kl": 0.004992485046386719, "learning_rate": 9.007020842191634e-07, "loss": 0.0002, "reward": -0.17755376175045967, "reward_std": 0.5958786718547344, "rewards/cosine_scaled_reward": -0.16063082218170166, "rewards/format_reward": 0.3541666716337204, "step": 147 }, { "completion_length": 2253.812545776367, "epoch": 0.16914285714285715, "grad_norm": 0.25221163034439087, "kl": 0.0036783218383789062, "learning_rate": 8.987250199168808e-07, "loss": 0.0001, "reward": 0.21890676906332374, "reward_std": 0.684816125780344, "rewards/cosine_scaled_reward": -0.01959683746099472, "rewards/format_reward": 0.604166679084301, "step": 148 }, { "completion_length": 2701.104232788086, "epoch": 0.1702857142857143, "grad_norm": 0.19634747505187988, "kl": 0.002753734588623047, "learning_rate": 8.967309592491052e-07, "loss": 0.0001, "reward": 0.22702318988740444, "reward_std": 0.8613481521606445, "rewards/cosine_scaled_reward": 0.003744029439985752, "rewards/format_reward": 0.541666679084301, "step": 149 }, { "completion_length": 2753.479202270508, "epoch": 0.17142857142857143, "grad_norm": 0.22827665507793427, "kl": 0.0054454803466796875, "learning_rate": 8.9471999940354e-07, "loss": 0.0002, "reward": 0.10332798771560192, "reward_std": 0.8186976052820683, "rewards/cosine_scaled_reward": -0.04311029799282551, "rewards/format_reward": 0.4791666753590107, "step": 150 }, { "completion_length": 2541.979202270508, "epoch": 0.17257142857142857, "grad_norm": 0.18608002364635468, "kl": 0.004375934600830078, "learning_rate": 8.926922383915315e-07, "loss": 0.0002, "reward": 0.2210833989083767, "reward_std": 0.9012398943305016, "rewards/cosine_scaled_reward": -0.0220514964312315, "rewards/format_reward": 0.5833333432674408, "step": 151 }, { "completion_length": 2626.9791870117188, "epoch": 0.1737142857142857, "grad_norm": 0.232888326048851, "kl": 0.003198862075805664, "learning_rate": 8.906477750432903e-07, "loss": 0.0001, "reward": -0.07092434912919998, "reward_std": 0.635146826505661, "rewards/cosine_scaled_reward": -0.11896505579352379, "rewards/format_reward": 0.4166666716337204, "step": 152 }, { "completion_length": 2805.5625610351562, "epoch": 0.17485714285714285, "grad_norm": 0.27297356724739075, "kl": 0.00728607177734375, "learning_rate": 8.88586709003076e-07, "loss": 0.0003, "reward": -0.17417170573025942, "reward_std": 0.6839433200657368, "rewards/cosine_scaled_reward": -0.18635874427855015, "rewards/format_reward": 0.39583334140479565, "step": 153 }, { "completion_length": 3413.7709045410156, "epoch": 0.176, "grad_norm": 0.14731697738170624, "kl": 0.0031638145446777344, "learning_rate": 8.865091407243394e-07, "loss": 0.0001, "reward": 0.05796261690557003, "reward_std": 1.0252362191677094, "rewards/cosine_scaled_reward": -0.010732692433521152, "rewards/format_reward": 0.3333333395421505, "step": 154 }, { "completion_length": 2530.604217529297, "epoch": 0.17714285714285713, "grad_norm": 0.1957230567932129, "kl": 0.0033855438232421875, "learning_rate": 8.844151714648274e-07, "loss": 0.0001, "reward": 0.001878822222352028, "reward_std": 0.5804712846875191, "rewards/cosine_scaled_reward": -0.07664009183645248, "rewards/format_reward": 0.43750000186264515, "step": 155 }, { "completion_length": 2793.583351135254, "epoch": 0.1782857142857143, "grad_norm": 0.17240341007709503, "kl": 0.0031371116638183594, "learning_rate": 8.823049032816478e-07, "loss": 0.0001, "reward": 0.20785324275493622, "reward_std": 0.9210717603564262, "rewards/cosine_scaled_reward": 0.044585417956113815, "rewards/format_reward": 0.4375000074505806, "step": 156 }, { "completion_length": 2746.2917098999023, "epoch": 0.17942857142857144, "grad_norm": 0.2299441546201706, "kl": 0.0045833587646484375, "learning_rate": 8.801784390262943e-07, "loss": 0.0002, "reward": -0.09681704035028815, "reward_std": 0.5040780827403069, "rewards/cosine_scaled_reward": -0.1591328363865614, "rewards/format_reward": 0.4791666753590107, "step": 157 }, { "completion_length": 3003.0209350585938, "epoch": 0.18057142857142858, "grad_norm": 0.20773041248321533, "kl": 0.003810882568359375, "learning_rate": 8.780358823396352e-07, "loss": 0.0002, "reward": 0.33219840307720006, "reward_std": 0.9787469133734703, "rewards/cosine_scaled_reward": 0.11911612004041672, "rewards/format_reward": 0.43750001303851604, "step": 158 }, { "completion_length": 2543.3958740234375, "epoch": 0.18171428571428572, "grad_norm": 0.17278896272182465, "kl": 0.003597259521484375, "learning_rate": 8.758773376468604e-07, "loss": 0.0001, "reward": -0.10545414686203003, "reward_std": 0.690795011818409, "rewards/cosine_scaled_reward": -0.1695910869166255, "rewards/format_reward": 0.45833334140479565, "step": 159 }, { "completion_length": 2490.2083740234375, "epoch": 0.18285714285714286, "grad_norm": 0.21473944187164307, "kl": 0.005862236022949219, "learning_rate": 8.737029101523929e-07, "loss": 0.0002, "reward": 0.2439700961112976, "reward_std": 0.8647127598524094, "rewards/cosine_scaled_reward": 0.016008037142455578, "rewards/format_reward": 0.5416666734963655, "step": 160 }, { "completion_length": 2415.604202270508, "epoch": 0.184, "grad_norm": 0.21809473633766174, "kl": 0.004559516906738281, "learning_rate": 8.715127058347614e-07, "loss": 0.0002, "reward": 0.08977567870169878, "reward_std": 0.6846508830785751, "rewards/cosine_scaled_reward": -0.12287470698356628, "rewards/format_reward": 0.6250000111758709, "step": 161 }, { "completion_length": 3056.791702270508, "epoch": 0.18514285714285714, "grad_norm": 0.20735777914524078, "kl": 0.006768226623535156, "learning_rate": 8.693068314414344e-07, "loss": 0.0003, "reward": -0.18869051523506641, "reward_std": 0.6717521250247955, "rewards/cosine_scaled_reward": -0.13688376732170582, "rewards/format_reward": 0.29166667349636555, "step": 162 }, { "completion_length": 2427.437545776367, "epoch": 0.18628571428571428, "grad_norm": 0.24365104734897614, "kl": 0.004721641540527344, "learning_rate": 8.670853944836176e-07, "loss": 0.0002, "reward": 0.4920094236731529, "reward_std": 0.755946584045887, "rewards/cosine_scaled_reward": 0.16996393306180835, "rewards/format_reward": 0.5833333414047956, "step": 163 }, { "completion_length": 2403.041732788086, "epoch": 0.18742857142857142, "grad_norm": 0.19330641627311707, "kl": 0.0048122406005859375, "learning_rate": 8.648485032310144e-07, "loss": 0.0002, "reward": 0.08846932090818882, "reward_std": 0.7613176181912422, "rewards/cosine_scaled_reward": -0.06676248833537102, "rewards/format_reward": 0.5000000037252903, "step": 164 }, { "completion_length": 2565.2083435058594, "epoch": 0.18857142857142858, "grad_norm": 0.22222936153411865, "kl": 0.0052661895751953125, "learning_rate": 8.625962667065487e-07, "loss": 0.0002, "reward": -0.24459386244416237, "reward_std": 0.4413093514740467, "rewards/cosine_scaled_reward": -0.2269913526251912, "rewards/format_reward": 0.4166666716337204, "step": 165 }, { "completion_length": 2543.312515258789, "epoch": 0.18971428571428572, "grad_norm": 0.18792009353637695, "kl": 0.003421306610107422, "learning_rate": 8.603287946810513e-07, "loss": 0.0001, "reward": 0.20099905133247375, "reward_std": 0.7041075564920902, "rewards/cosine_scaled_reward": 0.01856833230704069, "rewards/format_reward": 0.5000000037252903, "step": 166 }, { "completion_length": 2143.854232788086, "epoch": 0.19085714285714286, "grad_norm": 0.24791951477527618, "kl": 0.003101825714111328, "learning_rate": 8.580461976679099e-07, "loss": 0.0001, "reward": 0.2092342609539628, "reward_std": 0.6548982411623001, "rewards/cosine_scaled_reward": -0.09624562226235867, "rewards/format_reward": 0.7500000186264515, "step": 167 }, { "completion_length": 2693.3333740234375, "epoch": 0.192, "grad_norm": 0.19963470101356506, "kl": 0.0033502578735351562, "learning_rate": 8.557485869176825e-07, "loss": 0.0001, "reward": 0.2024577334523201, "reward_std": 0.8647420592606068, "rewards/cosine_scaled_reward": 0.00969130964949727, "rewards/format_reward": 0.5000000111758709, "step": 168 }, { "completion_length": 1926.437515258789, "epoch": 0.19314285714285714, "grad_norm": 0.20806600153446198, "kl": 0.003902435302734375, "learning_rate": 8.534360744126753e-07, "loss": 0.0002, "reward": 0.7901292243041098, "reward_std": 0.9031771421432495, "rewards/cosine_scaled_reward": 0.24018706334754825, "rewards/format_reward": 0.8333333432674408, "step": 169 }, { "completion_length": 2477.1042098999023, "epoch": 0.19428571428571428, "grad_norm": 0.19945000112056732, "kl": 0.0035648345947265625, "learning_rate": 8.511087728614862e-07, "loss": 0.0001, "reward": 0.14062727987766266, "reward_std": 0.6545419208705425, "rewards/cosine_scaled_reward": -0.010002967901527882, "rewards/format_reward": 0.4791666753590107, "step": 170 }, { "completion_length": 2406.479202270508, "epoch": 0.19542857142857142, "grad_norm": 0.19067376852035522, "kl": 0.0029478073120117188, "learning_rate": 8.487667956935087e-07, "loss": 0.0001, "reward": 0.1707976944744587, "reward_std": 0.7575587891042233, "rewards/cosine_scaled_reward": 0.005063103046268225, "rewards/format_reward": 0.4791666828095913, "step": 171 }, { "completion_length": 2873.8750228881836, "epoch": 0.19657142857142856, "grad_norm": 0.2227620929479599, "kl": 0.006169319152832031, "learning_rate": 8.464102570534061e-07, "loss": 0.0002, "reward": 0.2180697526782751, "reward_std": 0.8882052935659885, "rewards/cosine_scaled_reward": 0.07928902423009276, "rewards/format_reward": 0.37500000186264515, "step": 172 }, { "completion_length": 1677.6875076293945, "epoch": 0.1977142857142857, "grad_norm": 0.2608514428138733, "kl": 0.0040760040283203125, "learning_rate": 8.440392717955475e-07, "loss": 0.0002, "reward": 0.11270578391849995, "reward_std": 0.7172344215214252, "rewards/cosine_scaled_reward": -0.1393726442474872, "rewards/format_reward": 0.6875, "step": 173 }, { "completion_length": 2115.6458740234375, "epoch": 0.19885714285714284, "grad_norm": 0.21290229260921478, "kl": 0.006714820861816406, "learning_rate": 8.416539554784089e-07, "loss": 0.0003, "reward": 0.4762940816581249, "reward_std": 0.8587245345115662, "rewards/cosine_scaled_reward": 0.08201689831912518, "rewards/format_reward": 0.7291666734963655, "step": 174 }, { "completion_length": 2647.6041870117188, "epoch": 0.2, "grad_norm": 0.15685485303401947, "kl": 0.004894256591796875, "learning_rate": 8.392544243589427e-07, "loss": 0.0002, "reward": 0.2985331416130066, "reward_std": 0.5968715883791447, "rewards/cosine_scaled_reward": 0.10642770305275917, "rewards/format_reward": 0.4791666716337204, "step": 175 }, { "completion_length": 1974.5625686645508, "epoch": 0.20114285714285715, "grad_norm": 0.28263431787490845, "kl": 0.0044193267822265625, "learning_rate": 8.368407953869103e-07, "loss": 0.0002, "reward": 0.1971809258684516, "reward_std": 0.8409126959741116, "rewards/cosine_scaled_reward": -0.0763177121989429, "rewards/format_reward": 0.6666666697710752, "step": 176 }, { "completion_length": 2667.4583740234375, "epoch": 0.2022857142857143, "grad_norm": 0.2523181140422821, "kl": 0.005799293518066406, "learning_rate": 8.344131861991828e-07, "loss": 0.0002, "reward": 0.11005790415219963, "reward_std": 0.6982481181621552, "rewards/cosine_scaled_reward": -0.059121566824615, "rewards/format_reward": 0.541666679084301, "step": 177 }, { "completion_length": 2512.770881652832, "epoch": 0.20342857142857143, "grad_norm": 0.24519820511341095, "kl": 0.0081329345703125, "learning_rate": 8.319717151140072e-07, "loss": 0.0003, "reward": 0.2254039328545332, "reward_std": 0.8539987355470657, "rewards/cosine_scaled_reward": -0.00572938984259963, "rewards/format_reward": 0.5625000018626451, "step": 178 }, { "completion_length": 2653.083335876465, "epoch": 0.20457142857142857, "grad_norm": 0.3077125549316406, "kl": 0.004204273223876953, "learning_rate": 8.295165011252396e-07, "loss": 0.0002, "reward": 0.029737239703536034, "reward_std": 0.7201298326253891, "rewards/cosine_scaled_reward": -0.08826536871492863, "rewards/format_reward": 0.4791666716337204, "step": 179 }, { "completion_length": 1912.3333740234375, "epoch": 0.2057142857142857, "grad_norm": 0.2891172468662262, "kl": 0.006420135498046875, "learning_rate": 8.270476638965461e-07, "loss": 0.0003, "reward": 0.3585444991476834, "reward_std": 0.922141257673502, "rewards/cosine_scaled_reward": 0.023937703692354262, "rewards/format_reward": 0.6666666679084301, "step": 180 }, { "completion_length": 2936.9583435058594, "epoch": 0.20685714285714285, "grad_norm": 0.21322257816791534, "kl": 0.005759239196777344, "learning_rate": 8.245653237555705e-07, "loss": 0.0002, "reward": -0.0014306185767054558, "reward_std": 0.7736919745802879, "rewards/cosine_scaled_reward": -0.05711588263511658, "rewards/format_reward": 0.3541666753590107, "step": 181 }, { "completion_length": 2112.875030517578, "epoch": 0.208, "grad_norm": 0.18484172224998474, "kl": 0.0020971298217773438, "learning_rate": 8.220696016880687e-07, "loss": 0.0001, "reward": 0.22709017619490623, "reward_std": 0.7963708490133286, "rewards/cosine_scaled_reward": -0.04323145607486367, "rewards/format_reward": 0.6458333432674408, "step": 182 }, { "completion_length": 1799.1042175292969, "epoch": 0.20914285714285713, "grad_norm": 0.2422093003988266, "kl": 0.007781982421875, "learning_rate": 8.195606193320136e-07, "loss": 0.0003, "reward": 0.4508074652403593, "reward_std": 0.8603553548455238, "rewards/cosine_scaled_reward": 0.010005924385040998, "rewards/format_reward": 0.8333333469927311, "step": 183 }, { "completion_length": 2360.8541870117188, "epoch": 0.2102857142857143, "grad_norm": 0.26070070266723633, "kl": 0.00511932373046875, "learning_rate": 8.170384989716657e-07, "loss": 0.0002, "reward": -0.12935980968177319, "reward_std": 0.5107476897537708, "rewards/cosine_scaled_reward": -0.21540038008242846, "rewards/format_reward": 0.5416666734963655, "step": 184 }, { "completion_length": 2236.8542098999023, "epoch": 0.21142857142857144, "grad_norm": 0.22640132904052734, "kl": 0.004220008850097656, "learning_rate": 8.145033635316128e-07, "loss": 0.0002, "reward": -0.22387782018631697, "reward_std": 0.3759702183306217, "rewards/cosine_scaled_reward": -0.2601937036961317, "rewards/format_reward": 0.5208333358168602, "step": 185 }, { "completion_length": 2573.1667098999023, "epoch": 0.21257142857142858, "grad_norm": 0.18543511629104614, "kl": 0.005625724792480469, "learning_rate": 8.119553365707802e-07, "loss": 0.0002, "reward": -0.0019482946954667568, "reward_std": 0.6483882665634155, "rewards/cosine_scaled_reward": -0.08200034685432911, "rewards/format_reward": 0.43750000558793545, "step": 186 }, { "completion_length": 1852.8958587646484, "epoch": 0.21371428571428572, "grad_norm": 0.2344105988740921, "kl": 0.005881786346435547, "learning_rate": 8.093945422764069e-07, "loss": 0.0002, "reward": 0.061095200944691896, "reward_std": 0.5442762821912766, "rewards/cosine_scaled_reward": -0.1909130923449993, "rewards/format_reward": 0.7500000055879354, "step": 187 }, { "completion_length": 2904.625030517578, "epoch": 0.21485714285714286, "grad_norm": 0.17529600858688354, "kl": 0.0055294036865234375, "learning_rate": 8.068211054579943e-07, "loss": 0.0002, "reward": -0.289316936628893, "reward_std": 0.5605541579425335, "rewards/cosine_scaled_reward": -0.2442341128771659, "rewards/format_reward": 0.3750000111758709, "step": 188 }, { "completion_length": 2011.3750457763672, "epoch": 0.216, "grad_norm": 0.5071147084236145, "kl": 0.0047817230224609375, "learning_rate": 8.04235151541222e-07, "loss": 0.0002, "reward": 0.18212716095149517, "reward_std": 0.6630742475390434, "rewards/cosine_scaled_reward": -0.06412508455105126, "rewards/format_reward": 0.6458333395421505, "step": 189 }, { "completion_length": 2108.0000610351562, "epoch": 0.21714285714285714, "grad_norm": 0.20937705039978027, "kl": 0.005312919616699219, "learning_rate": 8.01636806561836e-07, "loss": 0.0002, "reward": 0.4989317310974002, "reward_std": 0.8167940117418766, "rewards/cosine_scaled_reward": 0.11576890759170055, "rewards/format_reward": 0.6875000055879354, "step": 190 }, { "completion_length": 1798.833381652832, "epoch": 0.21828571428571428, "grad_norm": 0.24339546263217926, "kl": 0.005016326904296875, "learning_rate": 7.990261971595048e-07, "loss": 0.0002, "reward": 0.3590816780924797, "reward_std": 0.886138778179884, "rewards/cosine_scaled_reward": 0.0189095304813236, "rewards/format_reward": 0.6875000055879354, "step": 191 }, { "completion_length": 2366.5000762939453, "epoch": 0.21942857142857142, "grad_norm": 0.20198509097099304, "kl": 0.0047245025634765625, "learning_rate": 7.964034505716476e-07, "loss": 0.0002, "reward": -0.017504574730992317, "reward_std": 0.6682247072458267, "rewards/cosine_scaled_reward": -0.1821177799720317, "rewards/format_reward": 0.6041666734963655, "step": 192 }, { "completion_length": 2782.4792098999023, "epoch": 0.22057142857142858, "grad_norm": 0.33190837502479553, "kl": 0.0082855224609375, "learning_rate": 7.93768694627233e-07, "loss": 0.0003, "reward": -0.09848117176443338, "reward_std": 0.7263169325888157, "rewards/cosine_scaled_reward": -0.16758773289620876, "rewards/format_reward": 0.45833334513008595, "step": 193 }, { "completion_length": 2497.958396911621, "epoch": 0.22171428571428572, "grad_norm": 0.20004087686538696, "kl": 0.005490303039550781, "learning_rate": 7.911220577405484e-07, "loss": 0.0002, "reward": 0.4655712964013219, "reward_std": 0.936052493751049, "rewards/cosine_scaled_reward": 0.11659512436017394, "rewards/format_reward": 0.6458333469927311, "step": 194 }, { "completion_length": 2314.0000915527344, "epoch": 0.22285714285714286, "grad_norm": 0.23152968287467957, "kl": 0.0047397613525390625, "learning_rate": 7.884636689049422e-07, "loss": 0.0002, "reward": 0.33897530660033226, "reward_std": 0.9344584122300148, "rewards/cosine_scaled_reward": 0.003916108049452305, "rewards/format_reward": 0.6875000167638063, "step": 195 }, { "completion_length": 3108.625030517578, "epoch": 0.224, "grad_norm": 0.17755372822284698, "kl": 0.006099700927734375, "learning_rate": 7.857936576865356e-07, "loss": 0.0002, "reward": 0.04909700155258179, "reward_std": 0.5633429922163486, "rewards/cosine_scaled_reward": -0.03188931196928024, "rewards/format_reward": 0.41666667349636555, "step": 196 }, { "completion_length": 1294.9583549499512, "epoch": 0.22514285714285714, "grad_norm": 0.26458728313446045, "kl": 0.0051975250244140625, "learning_rate": 7.831121542179086e-07, "loss": 0.0002, "reward": 0.580374225974083, "reward_std": 0.8097280263900757, "rewards/cosine_scaled_reward": 0.0928502269089222, "rewards/format_reward": 0.8541666716337204, "step": 197 }, { "completion_length": 2055.2500381469727, "epoch": 0.22628571428571428, "grad_norm": 0.2084938883781433, "kl": 0.0077152252197265625, "learning_rate": 7.804192891917571e-07, "loss": 0.0003, "reward": 0.2619430273771286, "reward_std": 0.6607060171663761, "rewards/cosine_scaled_reward": -0.01983140129595995, "rewards/format_reward": 0.666666679084301, "step": 198 }, { "completion_length": 2145.4584045410156, "epoch": 0.22742857142857142, "grad_norm": 0.24516309797763824, "kl": 0.007678985595703125, "learning_rate": 7.777151938545235e-07, "loss": 0.0003, "reward": 0.267922455444932, "reward_std": 0.8017127625644207, "rewards/cosine_scaled_reward": -0.057456295005977154, "rewards/format_reward": 0.7291666846722364, "step": 199 }, { "completion_length": 1690.3958892822266, "epoch": 0.22857142857142856, "grad_norm": 0.25067099928855896, "kl": 0.004558563232421875, "learning_rate": 7.75e-07, "loss": 0.0002, "reward": 0.4661361053586006, "reward_std": 0.831662617623806, "rewards/cosine_scaled_reward": 0.043831199407577515, "rewards/format_reward": 0.7916666753590107, "step": 200 }, { "completion_length": 1972.2083854675293, "epoch": 0.2297142857142857, "grad_norm": 0.30941376090049744, "kl": 0.0045299530029296875, "learning_rate": 7.72273839962904e-07, "loss": 0.0002, "reward": 0.9293034709990025, "reward_std": 0.8486984223127365, "rewards/cosine_scaled_reward": 0.35864230850711465, "rewards/format_reward": 0.7916666716337204, "step": 201 }, { "completion_length": 1596.5625305175781, "epoch": 0.23085714285714284, "grad_norm": 0.20324425399303436, "kl": 0.0039119720458984375, "learning_rate": 7.695368466124296e-07, "loss": 0.0002, "reward": 0.5681986985728145, "reward_std": 0.5084148272871971, "rewards/cosine_scaled_reward": 0.16256619803607464, "rewards/format_reward": 0.7291666716337204, "step": 202 }, { "completion_length": 2294.5208435058594, "epoch": 0.232, "grad_norm": 0.2826099097728729, "kl": 0.006962776184082031, "learning_rate": 7.667891533457718e-07, "loss": 0.0003, "reward": 0.20878975582309067, "reward_std": 0.8034153431653976, "rewards/cosine_scaled_reward": -0.03315589763224125, "rewards/format_reward": 0.6041666734963655, "step": 203 }, { "completion_length": 1520.5000381469727, "epoch": 0.23314285714285715, "grad_norm": 0.3326990306377411, "kl": 0.005408287048339844, "learning_rate": 7.640308940816239e-07, "loss": 0.0002, "reward": 0.5824265358969569, "reward_std": 0.7743073143064976, "rewards/cosine_scaled_reward": 0.06728124246001244, "rewards/format_reward": 0.916666679084301, "step": 204 }, { "completion_length": 1976.6250534057617, "epoch": 0.2342857142857143, "grad_norm": 0.2846101224422455, "kl": 0.004940032958984375, "learning_rate": 7.612622032536507e-07, "loss": 0.0002, "reward": 0.47044914215803146, "reward_std": 0.9910449758172035, "rewards/cosine_scaled_reward": 0.09328299947082996, "rewards/format_reward": 0.6875000093132257, "step": 205 }, { "completion_length": 2566.541702270508, "epoch": 0.23542857142857143, "grad_norm": 0.22673514485359192, "kl": 0.0060405731201171875, "learning_rate": 7.584832158039378e-07, "loss": 0.0002, "reward": -0.0411979085765779, "reward_std": 0.6954502202570438, "rewards/cosine_scaled_reward": -0.14637301303446293, "rewards/format_reward": 0.5000000111758709, "step": 206 }, { "completion_length": 1979.3125457763672, "epoch": 0.23657142857142857, "grad_norm": 0.2924785315990448, "kl": 0.007213592529296875, "learning_rate": 7.556940671764124e-07, "loss": 0.0003, "reward": 0.3043129206635058, "reward_std": 0.830761406570673, "rewards/cosine_scaled_reward": -0.0667324224486947, "rewards/format_reward": 0.7916666716337204, "step": 207 }, { "completion_length": 1684.020851135254, "epoch": 0.2377142857142857, "grad_norm": 0.21258480846881866, "kl": 0.0056591033935546875, "learning_rate": 7.528948933102438e-07, "loss": 0.0002, "reward": 0.4881124454550445, "reward_std": 0.6019946355372667, "rewards/cosine_scaled_reward": 0.062041960656642914, "rewards/format_reward": 0.8125000074505806, "step": 208 }, { "completion_length": 1827.8125457763672, "epoch": 0.23885714285714285, "grad_norm": 0.26896902918815613, "kl": 0.00604248046875, "learning_rate": 7.500858306332172e-07, "loss": 0.0002, "reward": 0.4111206401139498, "reward_std": 0.7256677895784378, "rewards/cosine_scaled_reward": 0.03580674855038524, "rewards/format_reward": 0.7500000037252903, "step": 209 }, { "completion_length": 1963.2917175292969, "epoch": 0.24, "grad_norm": 0.18104591965675354, "kl": 0.0045299530029296875, "learning_rate": 7.472670160550848e-07, "loss": 0.0002, "reward": 0.3350107278674841, "reward_std": 0.733224093914032, "rewards/cosine_scaled_reward": -0.016452430087156245, "rewards/format_reward": 0.7500000074505806, "step": 210 }, { "completion_length": 1673.8750228881836, "epoch": 0.24114285714285713, "grad_norm": 0.2317248284816742, "kl": 0.005611419677734375, "learning_rate": 7.444385869608921e-07, "loss": 0.0002, "reward": 0.4623087588697672, "reward_std": 0.633669501170516, "rewards/cosine_scaled_reward": 0.09324477170594037, "rewards/format_reward": 0.7083333432674408, "step": 211 }, { "completion_length": 1575.3125457763672, "epoch": 0.2422857142857143, "grad_norm": 0.26243215799331665, "kl": 0.0056781768798828125, "learning_rate": 7.416006812042827e-07, "loss": 0.0002, "reward": 0.524188793846406, "reward_std": 0.9563748612999916, "rewards/cosine_scaled_reward": 0.07078039133921266, "rewards/format_reward": 0.7916666716337204, "step": 212 }, { "completion_length": 1991.3750686645508, "epoch": 0.24342857142857144, "grad_norm": 0.2917014956474304, "kl": 0.009765625, "learning_rate": 7.387534371007797e-07, "loss": 0.0004, "reward": 0.44908707588911057, "reward_std": 0.7470048144459724, "rewards/cosine_scaled_reward": 0.06995298992842436, "rewards/format_reward": 0.7291666716337204, "step": 213 }, { "completion_length": 2065.70841217041, "epoch": 0.24457142857142858, "grad_norm": 0.2539423108100891, "kl": 0.00661468505859375, "learning_rate": 7.358969934210438e-07, "loss": 0.0003, "reward": 0.48666782677173615, "reward_std": 0.8372413367033005, "rewards/cosine_scaled_reward": 0.07745558395981789, "rewards/format_reward": 0.7500000149011612, "step": 214 }, { "completion_length": 1527.125015258789, "epoch": 0.24571428571428572, "grad_norm": 0.22233973443508148, "kl": 0.0035753250122070312, "learning_rate": 7.330314893841101e-07, "loss": 0.0001, "reward": 0.27787265577353537, "reward_std": 0.5949838422238827, "rewards/cosine_scaled_reward": -0.09243576228618622, "rewards/format_reward": 0.8333333432674408, "step": 215 }, { "completion_length": 1395.00004196167, "epoch": 0.24685714285714286, "grad_norm": 0.35132068395614624, "kl": 0.00598907470703125, "learning_rate": 7.301570646506027e-07, "loss": 0.0002, "reward": 0.70725142583251, "reward_std": 0.6296214163303375, "rewards/cosine_scaled_reward": 0.16727344412356615, "rewards/format_reward": 0.8958333395421505, "step": 216 }, { "completion_length": 1762.5416946411133, "epoch": 0.248, "grad_norm": 0.18891237676143646, "kl": 0.005519866943359375, "learning_rate": 7.27273859315928e-07, "loss": 0.0002, "reward": 0.4034770044963807, "reward_std": 0.6970177218317986, "rewards/cosine_scaled_reward": 0.04039741773158312, "rewards/format_reward": 0.7291666716337204, "step": 217 }, { "completion_length": 1985.6875305175781, "epoch": 0.24914285714285714, "grad_norm": 0.2248099148273468, "kl": 0.005970954895019531, "learning_rate": 7.243820139034464e-07, "loss": 0.0002, "reward": 0.15873102471232414, "reward_std": 0.6960118226706982, "rewards/cosine_scaled_reward": -0.12544306740164757, "rewards/format_reward": 0.7291666697710752, "step": 218 }, { "completion_length": 1378.6666946411133, "epoch": 0.2502857142857143, "grad_norm": 0.26156318187713623, "kl": 0.0061511993408203125, "learning_rate": 7.214816693576234e-07, "loss": 0.0002, "reward": 0.537579414434731, "reward_std": 0.7653206884860992, "rewards/cosine_scaled_reward": 0.07624521851539612, "rewards/format_reward": 0.8333333432674408, "step": 219 }, { "completion_length": 1764.0000228881836, "epoch": 0.25142857142857145, "grad_norm": 0.2845953404903412, "kl": 0.006932258605957031, "learning_rate": 7.185729670371604e-07, "loss": 0.0003, "reward": 0.02811681106686592, "reward_std": 0.4322117939591408, "rewards/cosine_scaled_reward": -0.2192363552749157, "rewards/format_reward": 0.7708333432674408, "step": 220 }, { "completion_length": 1456.3958740234375, "epoch": 0.25257142857142856, "grad_norm": 0.22674359381198883, "kl": 0.0051517486572265625, "learning_rate": 7.156560487081051e-07, "loss": 0.0002, "reward": 0.47192759346216917, "reward_std": 0.6820324845612049, "rewards/cosine_scaled_reward": 0.045280902180820704, "rewards/format_reward": 0.8125000074505806, "step": 221 }, { "completion_length": 1963.2917175292969, "epoch": 0.2537142857142857, "grad_norm": 0.21307772397994995, "kl": 0.005227088928222656, "learning_rate": 7.127310565369415e-07, "loss": 0.0002, "reward": 0.42855058796703815, "reward_std": 0.7586027830839157, "rewards/cosine_scaled_reward": 0.055503834038972855, "rewards/format_reward": 0.7291666828095913, "step": 222 }, { "completion_length": 1937.0208740234375, "epoch": 0.25485714285714284, "grad_norm": 0.26616960763931274, "kl": 0.0052585601806640625, "learning_rate": 7.097981330836616e-07, "loss": 0.0002, "reward": 0.2477805533562787, "reward_std": 0.5524286925792694, "rewards/cosine_scaled_reward": -0.013783978298306465, "rewards/format_reward": 0.6458333376795053, "step": 223 }, { "completion_length": 1899.812515258789, "epoch": 0.256, "grad_norm": 0.18020717799663544, "kl": 0.00457000732421875, "learning_rate": 7.068574212948169e-07, "loss": 0.0002, "reward": 0.4362371205352247, "reward_std": 0.8643104657530785, "rewards/cosine_scaled_reward": -0.009671762585639954, "rewards/format_reward": 0.8541666716337204, "step": 224 }, { "completion_length": 2635.125045776367, "epoch": 0.2571428571428571, "grad_norm": 0.2361401468515396, "kl": 0.011203765869140625, "learning_rate": 7.039090644965509e-07, "loss": 0.0004, "reward": 0.1398951131850481, "reward_std": 0.7698212433606386, "rewards/cosine_scaled_reward": -0.0572519232518971, "rewards/format_reward": 0.5625, "step": 225 }, { "completion_length": 1900.4375534057617, "epoch": 0.2582857142857143, "grad_norm": 0.21450620889663696, "kl": 0.0057201385498046875, "learning_rate": 7.009532063876148e-07, "loss": 0.0002, "reward": 0.5856498526409268, "reward_std": 0.7293794807046652, "rewards/cosine_scaled_reward": 0.13076626230031252, "rewards/format_reward": 0.7916666734963655, "step": 226 }, { "completion_length": 1369.208366394043, "epoch": 0.25942857142857145, "grad_norm": 0.323088675737381, "kl": 0.008134841918945312, "learning_rate": 6.979899910323624e-07, "loss": 0.0003, "reward": 0.4275068351998925, "reward_std": 0.8954740911722183, "rewards/cosine_scaled_reward": -0.019443090073764324, "rewards/format_reward": 0.8541666679084301, "step": 227 }, { "completion_length": 1602.8750343322754, "epoch": 0.26057142857142856, "grad_norm": 0.2612713873386383, "kl": 0.0060291290283203125, "learning_rate": 6.950195628537299e-07, "loss": 0.0002, "reward": 0.46579291112720966, "reward_std": 0.7513364851474762, "rewards/cosine_scaled_reward": 0.05927048996090889, "rewards/format_reward": 0.7708333432674408, "step": 228 }, { "completion_length": 1817.6041870117188, "epoch": 0.26171428571428573, "grad_norm": 0.2555846869945526, "kl": 0.0076618194580078125, "learning_rate": 6.920420666261961e-07, "loss": 0.0003, "reward": 0.22703039785847068, "reward_std": 0.6141838692128658, "rewards/cosine_scaled_reward": -0.08688993845134974, "rewards/format_reward": 0.750000013038516, "step": 229 }, { "completion_length": 1947.083396911621, "epoch": 0.26285714285714284, "grad_norm": 0.22352339327335358, "kl": 0.0059909820556640625, "learning_rate": 6.890576474687263e-07, "loss": 0.0002, "reward": 0.030418872833251953, "reward_std": 0.5589127205312252, "rewards/cosine_scaled_reward": -0.2263335685711354, "rewards/format_reward": 0.7708333432674408, "step": 230 }, { "completion_length": 1575.8125381469727, "epoch": 0.264, "grad_norm": 0.24202732741832733, "kl": 0.0063457489013671875, "learning_rate": 6.860664508377001e-07, "loss": 0.0003, "reward": 0.5021250182762742, "reward_std": 0.6612259335815907, "rewards/cosine_scaled_reward": 0.03393824491649866, "rewards/format_reward": 0.8750000111758709, "step": 231 }, { "completion_length": 2361.145881652832, "epoch": 0.2651428571428571, "grad_norm": 0.28824689984321594, "kl": 0.008266448974609375, "learning_rate": 6.83068622519821e-07, "loss": 0.0003, "reward": -0.13501371257007122, "reward_std": 0.600494496524334, "rewards/cosine_scaled_reward": -0.24766290560364723, "rewards/format_reward": 0.5833333376795053, "step": 232 }, { "completion_length": 1480.9167022705078, "epoch": 0.2662857142857143, "grad_norm": 0.32725366950035095, "kl": 0.006622314453125, "learning_rate": 6.800643086250121e-07, "loss": 0.0003, "reward": 0.22992298612371087, "reward_std": 0.7725229002535343, "rewards/cosine_scaled_reward": -0.15780717965390068, "rewards/format_reward": 0.8750000149011612, "step": 233 }, { "completion_length": 1913.5208854675293, "epoch": 0.2674285714285714, "grad_norm": 0.27367836236953735, "kl": 0.008426666259765625, "learning_rate": 6.770536555792944e-07, "loss": 0.0003, "reward": 0.17465345282107592, "reward_std": 0.7128382474184036, "rewards/cosine_scaled_reward": -0.08376146724913269, "rewards/format_reward": 0.6666666716337204, "step": 234 }, { "completion_length": 1466.6458892822266, "epoch": 0.26857142857142857, "grad_norm": 0.25469592213630676, "kl": 0.00775909423828125, "learning_rate": 6.740368101176495e-07, "loss": 0.0003, "reward": 0.6011685915291309, "reward_std": 0.7442786023020744, "rewards/cosine_scaled_reward": 0.11913689319044352, "rewards/format_reward": 0.833333333954215, "step": 235 }, { "completion_length": 1939.0417175292969, "epoch": 0.26971428571428574, "grad_norm": 0.23096546530723572, "kl": 0.0063419342041015625, "learning_rate": 6.710139192768694e-07, "loss": 0.0003, "reward": 0.33170187287032604, "reward_std": 0.7268010787665844, "rewards/cosine_scaled_reward": -0.018086417112499475, "rewards/format_reward": 0.7500000018626451, "step": 236 }, { "completion_length": 1496.2083740234375, "epoch": 0.27085714285714285, "grad_norm": 0.25223881006240845, "kl": 0.0052013397216796875, "learning_rate": 6.679851303883891e-07, "loss": 0.0002, "reward": 0.5692060198634863, "reward_std": 0.5697116628289223, "rewards/cosine_scaled_reward": 0.09943875670433044, "rewards/format_reward": 0.8541666716337204, "step": 237 }, { "completion_length": 1303.5625228881836, "epoch": 0.272, "grad_norm": 0.23987644910812378, "kl": 0.00640106201171875, "learning_rate": 6.649505910711058e-07, "loss": 0.0003, "reward": 0.6454257536679506, "reward_std": 0.8430968932807446, "rewards/cosine_scaled_reward": 0.09218539297580719, "rewards/format_reward": 0.9375000074505806, "step": 238 }, { "completion_length": 1533.0833473205566, "epoch": 0.27314285714285713, "grad_norm": 0.22352874279022217, "kl": 0.004787445068359375, "learning_rate": 6.619104492241847e-07, "loss": 0.0002, "reward": 0.7365382118150592, "reward_std": 0.6625222954899073, "rewards/cosine_scaled_reward": 0.24717165902256966, "rewards/format_reward": 0.770833333954215, "step": 239 }, { "completion_length": 1882.8125381469727, "epoch": 0.2742857142857143, "grad_norm": 0.35037240386009216, "kl": 0.010135650634765625, "learning_rate": 6.588648530198504e-07, "loss": 0.0004, "reward": 0.04069505538791418, "reward_std": 0.6465389877557755, "rewards/cosine_scaled_reward": -0.16988872209913097, "rewards/format_reward": 0.6666666772216558, "step": 240 }, { "completion_length": 2043.1667175292969, "epoch": 0.2754285714285714, "grad_norm": 0.2375987321138382, "kl": 0.009412765502929688, "learning_rate": 6.558139508961654e-07, "loss": 0.0004, "reward": -0.024178337305784225, "reward_std": 0.5009024143218994, "rewards/cosine_scaled_reward": -0.20676567568443716, "rewards/format_reward": 0.6666666753590107, "step": 241 }, { "completion_length": 1260.9792022705078, "epoch": 0.2765714285714286, "grad_norm": 0.29835185408592224, "kl": 0.010770797729492188, "learning_rate": 6.527578915497951e-07, "loss": 0.0004, "reward": 0.3275939063169062, "reward_std": 0.5498746670782566, "rewards/cosine_scaled_reward": -0.09544416703283787, "rewards/format_reward": 0.9166666716337204, "step": 242 }, { "completion_length": 1774.5000534057617, "epoch": 0.2777142857142857, "grad_norm": 0.19600969552993774, "kl": 0.0063343048095703125, "learning_rate": 6.496968239287603e-07, "loss": 0.0003, "reward": 0.43449581041932106, "reward_std": 0.7577872760593891, "rewards/cosine_scaled_reward": 0.026275813579559326, "rewards/format_reward": 0.7916666716337204, "step": 243 }, { "completion_length": 1623.8750457763672, "epoch": 0.27885714285714286, "grad_norm": 0.22528576850891113, "kl": 0.0068035125732421875, "learning_rate": 6.466308972251785e-07, "loss": 0.0003, "reward": 0.7132167363015469, "reward_std": 0.7599900439381599, "rewards/cosine_scaled_reward": 0.18282007612287998, "rewards/format_reward": 0.8541666679084301, "step": 244 }, { "completion_length": 1842.708381652832, "epoch": 0.28, "grad_norm": 0.23914141952991486, "kl": 0.006076812744140625, "learning_rate": 6.435602608679916e-07, "loss": 0.0002, "reward": 0.589666100917384, "reward_std": 0.911857221275568, "rewards/cosine_scaled_reward": 0.11452391929924488, "rewards/format_reward": 0.8125000111758709, "step": 245 }, { "completion_length": 1442.0625457763672, "epoch": 0.28114285714285714, "grad_norm": 0.22671173512935638, "kl": 0.007045745849609375, "learning_rate": 6.404850645156841e-07, "loss": 0.0003, "reward": 0.3358262628316879, "reward_std": 0.5915343575179577, "rewards/cosine_scaled_reward": -0.07317247241735458, "rewards/format_reward": 0.8750000037252903, "step": 246 }, { "completion_length": 2208.770866394043, "epoch": 0.2822857142857143, "grad_norm": 0.29410502314567566, "kl": 0.010019302368164062, "learning_rate": 6.374054580489873e-07, "loss": 0.0004, "reward": -0.014087029732763767, "reward_std": 0.6953110322356224, "rewards/cosine_scaled_reward": -0.19972090609371662, "rewards/format_reward": 0.6458333469927311, "step": 247 }, { "completion_length": 1561.9583740234375, "epoch": 0.2834285714285714, "grad_norm": 0.2551148533821106, "kl": 0.007082939147949219, "learning_rate": 6.343215915635761e-07, "loss": 0.0003, "reward": 0.7867583259940147, "reward_std": 0.7739764004945755, "rewards/cosine_scaled_reward": 0.28318586223758757, "rewards/format_reward": 0.7500000055879354, "step": 248 }, { "completion_length": 1342.2917022705078, "epoch": 0.2845714285714286, "grad_norm": 0.2254449427127838, "kl": 0.00762939453125, "learning_rate": 6.31233615362752e-07, "loss": 0.0003, "reward": 0.7728391233831644, "reward_std": 0.6931521892547607, "rewards/cosine_scaled_reward": 0.20786779932677746, "rewards/format_reward": 0.8958333358168602, "step": 249 }, { "completion_length": 1144.3750305175781, "epoch": 0.2857142857142857, "grad_norm": 0.4075692892074585, "kl": 0.008514404296875, "learning_rate": 6.281416799501187e-07, "loss": 0.0003, "reward": 0.4015044257976115, "reward_std": 0.511387325823307, "rewards/cosine_scaled_reward": -0.07808645971817896, "rewards/format_reward": 0.9791666716337204, "step": 250 }, { "completion_length": 1381.1459045410156, "epoch": 0.28685714285714287, "grad_norm": 0.32429951429367065, "kl": 0.0106201171875, "learning_rate": 6.25045936022246e-07, "loss": 0.0004, "reward": 0.26284962613135576, "reward_std": 0.8155036717653275, "rewards/cosine_scaled_reward": -0.09845332545228302, "rewards/format_reward": 0.7916666679084301, "step": 251 }, { "completion_length": 1654.3750457763672, "epoch": 0.288, "grad_norm": 0.2845689356327057, "kl": 0.009103775024414062, "learning_rate": 6.219465344613258e-07, "loss": 0.0004, "reward": 0.05027025658637285, "reward_std": 0.4697149991989136, "rewards/cosine_scaled_reward": -0.1850506253540516, "rewards/format_reward": 0.7291666753590107, "step": 252 }, { "completion_length": 1660.5208740234375, "epoch": 0.28914285714285715, "grad_norm": 0.2830834984779358, "kl": 0.010486602783203125, "learning_rate": 6.188436263278172e-07, "loss": 0.0004, "reward": 0.18601116666104645, "reward_std": 0.66238809004426, "rewards/cosine_scaled_reward": -0.1478569945320487, "rewards/format_reward": 0.8125000074505806, "step": 253 }, { "completion_length": 1486.208339691162, "epoch": 0.29028571428571426, "grad_norm": 0.2770550847053528, "kl": 0.008548736572265625, "learning_rate": 6.157373628530852e-07, "loss": 0.0003, "reward": 0.3256164574995637, "reward_std": 0.72544976323843, "rewards/cosine_scaled_reward": -0.07894822582602501, "rewards/format_reward": 0.8541666753590107, "step": 254 }, { "completion_length": 2314.5416946411133, "epoch": 0.2914285714285714, "grad_norm": 0.32598185539245605, "kl": 0.01126861572265625, "learning_rate": 6.126278954320294e-07, "loss": 0.0005, "reward": -0.04141565319150686, "reward_std": 0.7138533964753151, "rewards/cosine_scaled_reward": -0.21306942123919725, "rewards/format_reward": 0.6250000149011612, "step": 255 }, { "completion_length": 1553.5208740234375, "epoch": 0.2925714285714286, "grad_norm": 0.2590584456920624, "kl": 0.008270263671875, "learning_rate": 6.095153756157051e-07, "loss": 0.0003, "reward": 0.5183277567848563, "reward_std": 0.539212403818965, "rewards/cosine_scaled_reward": 0.05446232855319977, "rewards/format_reward": 0.8750000111758709, "step": 256 }, { "completion_length": 1969.1042175292969, "epoch": 0.2937142857142857, "grad_norm": 0.2547340989112854, "kl": 0.007389068603515625, "learning_rate": 6.06399955103937e-07, "loss": 0.0003, "reward": 0.6981238089501858, "reward_std": 0.957792617380619, "rewards/cosine_scaled_reward": 0.20680510997772217, "rewards/format_reward": 0.7708333414047956, "step": 257 }, { "completion_length": 1860.1042175292969, "epoch": 0.2948571428571429, "grad_norm": 0.2526947855949402, "kl": 0.006866455078125, "learning_rate": 6.032817857379256e-07, "loss": 0.0003, "reward": 0.3261422934010625, "reward_std": 0.8735288791358471, "rewards/cosine_scaled_reward": -0.07447653356939554, "rewards/format_reward": 0.8333333395421505, "step": 258 }, { "completion_length": 1497.6667137145996, "epoch": 0.296, "grad_norm": 0.2954126298427582, "kl": 0.009616851806640625, "learning_rate": 6.001610194928464e-07, "loss": 0.0004, "reward": 0.4608106706291437, "reward_std": 0.6903665885329247, "rewards/cosine_scaled_reward": 0.035531939938664436, "rewards/format_reward": 0.8125000186264515, "step": 259 }, { "completion_length": 1280.8958778381348, "epoch": 0.29714285714285715, "grad_norm": 0.2724701166152954, "kl": 0.0075778961181640625, "learning_rate": 5.97037808470444e-07, "loss": 0.0003, "reward": 0.6839517981279641, "reward_std": 0.8328232653439045, "rewards/cosine_scaled_reward": 0.13848848675843328, "rewards/format_reward": 0.8958333358168602, "step": 260 }, { "completion_length": 2193.687515258789, "epoch": 0.29828571428571427, "grad_norm": 0.18936224281787872, "kl": 0.00774383544921875, "learning_rate": 5.939123048916173e-07, "loss": 0.0003, "reward": -0.03597256541252136, "reward_std": 0.5031098667532206, "rewards/cosine_scaled_reward": -0.18370476551353931, "rewards/format_reward": 0.6041666716337204, "step": 261 }, { "completion_length": 1862.2083740234375, "epoch": 0.29942857142857143, "grad_norm": 0.2641579806804657, "kl": 0.0114593505859375, "learning_rate": 5.907846610890011e-07, "loss": 0.0005, "reward": 0.04227437451481819, "reward_std": 0.5424713045358658, "rewards/cosine_scaled_reward": -0.1736736847087741, "rewards/format_reward": 0.6875000037252903, "step": 262 }, { "completion_length": 1353.3125228881836, "epoch": 0.30057142857142854, "grad_norm": 0.2091461718082428, "kl": 0.00551605224609375, "learning_rate": 5.87655029499542e-07, "loss": 0.0002, "reward": 0.29918272816576064, "reward_std": 0.7759961858391762, "rewards/cosine_scaled_reward": -0.13049082271754742, "rewards/format_reward": 0.9166666716337204, "step": 263 }, { "completion_length": 1441.7083740234375, "epoch": 0.3017142857142857, "grad_norm": 0.2494620829820633, "kl": 0.0072498321533203125, "learning_rate": 5.845235626570683e-07, "loss": 0.0003, "reward": 0.2639927687123418, "reward_std": 0.689430944621563, "rewards/cosine_scaled_reward": -0.11652671941556036, "rewards/format_reward": 0.8541666772216558, "step": 264 }, { "completion_length": 1564.6250457763672, "epoch": 0.3028571428571429, "grad_norm": 0.2662133276462555, "kl": 0.010656356811523438, "learning_rate": 5.813904131848564e-07, "loss": 0.0004, "reward": 0.44831580482423306, "reward_std": 0.7113944664597511, "rewards/cosine_scaled_reward": -0.0021858818363398314, "rewards/format_reward": 0.8750000055879354, "step": 265 }, { "completion_length": 1647.4375381469727, "epoch": 0.304, "grad_norm": 0.2215126007795334, "kl": 0.00804901123046875, "learning_rate": 5.78255733788191e-07, "loss": 0.0003, "reward": 0.1729487591655925, "reward_std": 0.6001881808042526, "rewards/cosine_scaled_reward": -0.10871189273893833, "rewards/format_reward": 0.7291666865348816, "step": 266 }, { "completion_length": 2309.1458740234375, "epoch": 0.30514285714285716, "grad_norm": 0.33464720845222473, "kl": 0.013841629028320312, "learning_rate": 5.751196772469237e-07, "loss": 0.0006, "reward": -0.04994155094027519, "reward_std": 0.63846031203866, "rewards/cosine_scaled_reward": -0.18117934837937355, "rewards/format_reward": 0.5625000111758709, "step": 267 }, { "completion_length": 1347.291732788086, "epoch": 0.3062857142857143, "grad_norm": 0.2725054919719696, "kl": 0.011320114135742188, "learning_rate": 5.71982396408026e-07, "loss": 0.0005, "reward": 0.3638640786521137, "reward_std": 0.856957983225584, "rewards/cosine_scaled_reward": -0.07128076790831983, "rewards/format_reward": 0.8750000149011612, "step": 268 }, { "completion_length": 1536.8541870117188, "epoch": 0.30742857142857144, "grad_norm": 0.242116779088974, "kl": 0.008541107177734375, "learning_rate": 5.688440441781398e-07, "loss": 0.0003, "reward": 0.2962089798747911, "reward_std": 0.652573972940445, "rewards/cosine_scaled_reward": -0.07353247702121735, "rewards/format_reward": 0.8125000149011612, "step": 269 }, { "completion_length": 1723.6875305175781, "epoch": 0.30857142857142855, "grad_norm": 0.20673726499080658, "kl": 0.00959014892578125, "learning_rate": 5.657047735161255e-07, "loss": 0.0004, "reward": 0.6585689373314381, "reward_std": 0.8569062799215317, "rewards/cosine_scaled_reward": 0.13866457249969244, "rewards/format_reward": 0.854166679084301, "step": 270 }, { "completion_length": 1226.4375534057617, "epoch": 0.3097142857142857, "grad_norm": 0.3247756063938141, "kl": 0.009336471557617188, "learning_rate": 5.625647374256061e-07, "loss": 0.0004, "reward": 0.6650833152234554, "reward_std": 0.7789544351398945, "rewards/cosine_scaled_reward": 0.11849892261670902, "rewards/format_reward": 0.9166666679084301, "step": 271 }, { "completion_length": 1959.395896911621, "epoch": 0.31085714285714283, "grad_norm": 0.2230900079011917, "kl": 0.0106964111328125, "learning_rate": 5.594240889475106e-07, "loss": 0.0004, "reward": 0.23697172570973635, "reward_std": 0.7450396865606308, "rewards/cosine_scaled_reward": -0.0959045309573412, "rewards/format_reward": 0.7708333507180214, "step": 272 }, { "completion_length": 1628.1042175292969, "epoch": 0.312, "grad_norm": 0.3747721314430237, "kl": 0.01097869873046875, "learning_rate": 5.562829811526154e-07, "loss": 0.0004, "reward": 0.4147872976027429, "reward_std": 0.6467026993632317, "rewards/cosine_scaled_reward": 0.0440314169973135, "rewards/format_reward": 0.7500000149011612, "step": 273 }, { "completion_length": 1172.4375457763672, "epoch": 0.31314285714285717, "grad_norm": 0.2670040726661682, "kl": 0.008930206298828125, "learning_rate": 5.531415671340826e-07, "loss": 0.0004, "reward": 0.6682712404581252, "reward_std": 0.8026308417320251, "rewards/cosine_scaled_reward": 0.0984476669691503, "rewards/format_reward": 0.9583333432674408, "step": 274 }, { "completion_length": 1716.9375534057617, "epoch": 0.3142857142857143, "grad_norm": 0.2296976000070572, "kl": 0.00942230224609375, "learning_rate": 5.5e-07, "loss": 0.0004, "reward": 0.6355814579874277, "reward_std": 0.902884915471077, "rewards/cosine_scaled_reward": 0.1562671698629856, "rewards/format_reward": 0.791666679084301, "step": 275 }, { "completion_length": 1406.5625457763672, "epoch": 0.31542857142857145, "grad_norm": 0.27804121375083923, "kl": 0.012622833251953125, "learning_rate": 5.468584328659172e-07, "loss": 0.0005, "reward": 0.47063780203461647, "reward_std": 0.826964907348156, "rewards/cosine_scaled_reward": 0.035029259510338306, "rewards/format_reward": 0.8125000055879354, "step": 276 }, { "completion_length": 1531.7708892822266, "epoch": 0.31657142857142856, "grad_norm": 0.3786030411720276, "kl": 0.0130615234375, "learning_rate": 5.437170188473847e-07, "loss": 0.0005, "reward": 0.4646074064075947, "reward_std": 0.7622004933655262, "rewards/cosine_scaled_reward": 0.04781521949917078, "rewards/format_reward": 0.7916666734963655, "step": 277 }, { "completion_length": 1472.6458740234375, "epoch": 0.3177142857142857, "grad_norm": 0.29159483313560486, "kl": 0.012044906616210938, "learning_rate": 5.405759110524894e-07, "loss": 0.0005, "reward": 0.5979791600257158, "reward_std": 0.6016073673963547, "rewards/cosine_scaled_reward": 0.13445703126490116, "rewards/format_reward": 0.8125000186264515, "step": 278 }, { "completion_length": 1808.8750457763672, "epoch": 0.31885714285714284, "grad_norm": 0.45572957396507263, "kl": 0.0137481689453125, "learning_rate": 5.37435262574394e-07, "loss": 0.0006, "reward": 0.3207678751787171, "reward_std": 0.7218646891415119, "rewards/cosine_scaled_reward": -0.07327653095126152, "rewards/format_reward": 0.854166679084301, "step": 279 }, { "completion_length": 1779.0000457763672, "epoch": 0.32, "grad_norm": 0.3021228611469269, "kl": 0.011760711669921875, "learning_rate": 5.342952264838747e-07, "loss": 0.0005, "reward": 0.8133369982242584, "reward_std": 0.8129135742783546, "rewards/cosine_scaled_reward": 0.2262397282756865, "rewards/format_reward": 0.8958333395421505, "step": 280 }, { "completion_length": 2400.2708740234375, "epoch": 0.3211428571428571, "grad_norm": 0.2547665238380432, "kl": 0.012500762939453125, "learning_rate": 5.311559558218603e-07, "loss": 0.0005, "reward": -0.10774591006338596, "reward_std": 0.6432830318808556, "rewards/cosine_scaled_reward": -0.22082971967756748, "rewards/format_reward": 0.5625000055879354, "step": 281 }, { "completion_length": 1696.4792251586914, "epoch": 0.3222857142857143, "grad_norm": 0.3483729362487793, "kl": 0.013248443603515625, "learning_rate": 5.28017603591974e-07, "loss": 0.0005, "reward": 0.4830823950469494, "reward_std": 0.720955528318882, "rewards/cosine_scaled_reward": 0.053989187348634005, "rewards/format_reward": 0.8125000149011612, "step": 282 }, { "completion_length": 2184.6250534057617, "epoch": 0.32342857142857145, "grad_norm": 0.26255351305007935, "kl": 0.01146697998046875, "learning_rate": 5.248803227530763e-07, "loss": 0.0005, "reward": 0.6261483291164041, "reward_std": 0.8373686634004116, "rewards/cosine_scaled_reward": 0.18944203667342663, "rewards/format_reward": 0.7291666772216558, "step": 283 }, { "completion_length": 1453.270881652832, "epoch": 0.32457142857142857, "grad_norm": 0.26391106843948364, "kl": 0.0076885223388671875, "learning_rate": 5.21744266211809e-07, "loss": 0.0003, "reward": 0.14405585872009397, "reward_std": 0.5576095655560493, "rewards/cosine_scaled_reward": -0.18134936597198248, "rewards/format_reward": 0.8333333432674408, "step": 284 }, { "completion_length": 1164.7291946411133, "epoch": 0.32571428571428573, "grad_norm": 0.40485504269599915, "kl": 0.0128326416015625, "learning_rate": 5.186095868151436e-07, "loss": 0.0005, "reward": 0.3652733010239899, "reward_std": 0.7800908386707306, "rewards/cosine_scaled_reward": -0.04528142110211775, "rewards/format_reward": 0.8333333358168602, "step": 285 }, { "completion_length": 1260.0208587646484, "epoch": 0.32685714285714285, "grad_norm": 0.26052573323249817, "kl": 0.008724212646484375, "learning_rate": 5.154764373429315e-07, "loss": 0.0003, "reward": 0.4141525523737073, "reward_std": 0.6447485722601414, "rewards/cosine_scaled_reward": -0.043887258507311344, "rewards/format_reward": 0.9166666679084301, "step": 286 }, { "completion_length": 1336.2083702087402, "epoch": 0.328, "grad_norm": 0.3096228837966919, "kl": 0.012447357177734375, "learning_rate": 5.123449705004581e-07, "loss": 0.0005, "reward": 0.5274541154503822, "reward_std": 0.7009152993559837, "rewards/cosine_scaled_reward": 0.121887655579485, "rewards/format_reward": 0.7291666679084301, "step": 287 }, { "completion_length": 1846.3750228881836, "epoch": 0.3291428571428571, "grad_norm": 0.28548097610473633, "kl": 0.01815032958984375, "learning_rate": 5.09215338910999e-07, "loss": 0.0007, "reward": 0.12442206963896751, "reward_std": 0.5581525340676308, "rewards/cosine_scaled_reward": -0.1310774045996368, "rewards/format_reward": 0.7083333395421505, "step": 288 }, { "completion_length": 1571.437557220459, "epoch": 0.3302857142857143, "grad_norm": 0.3936711251735687, "kl": 0.012668609619140625, "learning_rate": 5.060876951083828e-07, "loss": 0.0005, "reward": 0.4288631723029539, "reward_std": 0.593647625297308, "rewards/cosine_scaled_reward": 0.030754741048440337, "rewards/format_reward": 0.7916666679084301, "step": 289 }, { "completion_length": 898.5833473205566, "epoch": 0.3314285714285714, "grad_norm": 0.32913920283317566, "kl": 0.009197235107421875, "learning_rate": 5.02962191529556e-07, "loss": 0.0004, "reward": 0.7029262520372868, "reward_std": 0.8086423352360725, "rewards/cosine_scaled_reward": 0.12271453440189362, "rewards/format_reward": 0.9583333432674408, "step": 290 }, { "completion_length": 1254.2917022705078, "epoch": 0.3325714285714286, "grad_norm": 0.24602723121643066, "kl": 0.010408401489257812, "learning_rate": 4.998389805071536e-07, "loss": 0.0004, "reward": 0.44253411889076233, "reward_std": 0.8852041102945805, "rewards/cosine_scaled_reward": -0.05237848265096545, "rewards/format_reward": 0.9375, "step": 291 }, { "completion_length": 1754.5000610351562, "epoch": 0.33371428571428574, "grad_norm": 0.24916251003742218, "kl": 0.015148162841796875, "learning_rate": 4.967182142620745e-07, "loss": 0.0006, "reward": 0.15450574783608317, "reward_std": 0.5213648546487093, "rewards/cosine_scaled_reward": -0.1516056777909398, "rewards/format_reward": 0.7916666772216558, "step": 292 }, { "completion_length": 1034.2917022705078, "epoch": 0.33485714285714285, "grad_norm": 0.31137824058532715, "kl": 0.012929916381835938, "learning_rate": 4.93600044896063e-07, "loss": 0.0005, "reward": 0.4599906969233416, "reward_std": 0.5716858878731728, "rewards/cosine_scaled_reward": -0.009167976677417755, "rewards/format_reward": 0.916666679084301, "step": 293 }, { "completion_length": 1682.0208740234375, "epoch": 0.336, "grad_norm": 0.27208465337753296, "kl": 0.013782501220703125, "learning_rate": 4.904846243842949e-07, "loss": 0.0006, "reward": 0.3077076869085431, "reward_std": 0.777203194797039, "rewards/cosine_scaled_reward": -0.05544556397944689, "rewards/format_reward": 0.7708333432674408, "step": 294 }, { "completion_length": 1438.083381652832, "epoch": 0.33714285714285713, "grad_norm": 0.4438267946243286, "kl": 0.017040252685546875, "learning_rate": 4.873721045679706e-07, "loss": 0.0007, "reward": 0.7139077642932534, "reward_std": 0.7298413254320621, "rewards/cosine_scaled_reward": 0.1472441926598549, "rewards/format_reward": 0.9375000074505806, "step": 295 }, { "completion_length": 1397.895896911621, "epoch": 0.3382857142857143, "grad_norm": 0.3188610076904297, "kl": 0.01441192626953125, "learning_rate": 4.842626371469149e-07, "loss": 0.0006, "reward": 0.47962956223636866, "reward_std": 0.759897030889988, "rewards/cosine_scaled_reward": 0.025067659094929695, "rewards/format_reward": 0.8541666716337204, "step": 296 }, { "completion_length": 2094.354217529297, "epoch": 0.3394285714285714, "grad_norm": 0.472922682762146, "kl": 0.0243682861328125, "learning_rate": 4.811563736721829e-07, "loss": 0.001, "reward": 0.12652458110824227, "reward_std": 0.6632986180484295, "rewards/cosine_scaled_reward": -0.115331269800663, "rewards/format_reward": 0.6666666809469461, "step": 297 }, { "completion_length": 1431.7292175292969, "epoch": 0.3405714285714286, "grad_norm": 0.2858993113040924, "kl": 0.013988494873046875, "learning_rate": 4.780534655386743e-07, "loss": 0.0006, "reward": 0.41945501090958714, "reward_std": 0.8443580865859985, "rewards/cosine_scaled_reward": -0.003992303041741252, "rewards/format_reward": 0.8125000149011612, "step": 298 }, { "completion_length": 1634.4167175292969, "epoch": 0.3417142857142857, "grad_norm": 0.403289258480072, "kl": 0.019683837890625, "learning_rate": 4.749540639777539e-07, "loss": 0.0008, "reward": 0.3505248324945569, "reward_std": 0.7320738956332207, "rewards/cosine_scaled_reward": -0.020624496042728424, "rewards/format_reward": 0.7708333469927311, "step": 299 }, { "completion_length": 1821.9166870117188, "epoch": 0.34285714285714286, "grad_norm": 0.3450154662132263, "kl": 0.0260009765625, "learning_rate": 4.7185832004988133e-07, "loss": 0.001, "reward": 0.28817289136350155, "reward_std": 0.6358778662979603, "rewards/cosine_scaled_reward": -0.05276927351951599, "rewards/format_reward": 0.7708333358168602, "step": 300 }, { "completion_length": 1875.5000381469727, "epoch": 0.344, "grad_norm": 0.4964129328727722, "kl": 0.031585693359375, "learning_rate": 4.68766384637248e-07, "loss": 0.0013, "reward": 0.23697214853018522, "reward_std": 0.731932707130909, "rewards/cosine_scaled_reward": -0.10828239191323519, "rewards/format_reward": 0.7916666753590107, "step": 301 }, { "completion_length": 1755.0000343322754, "epoch": 0.34514285714285714, "grad_norm": 0.3546035587787628, "kl": 0.022808074951171875, "learning_rate": 4.656784084364238e-07, "loss": 0.0009, "reward": 0.3061390779912472, "reward_std": 0.6646707132458687, "rewards/cosine_scaled_reward": -0.003997504012659192, "rewards/format_reward": 0.6875, "step": 302 }, { "completion_length": 1310.7291984558105, "epoch": 0.3462857142857143, "grad_norm": 0.3501085937023163, "kl": 0.017917633056640625, "learning_rate": 4.6259454195101267e-07, "loss": 0.0007, "reward": 0.49688062351197004, "reward_std": 0.7856386080384254, "rewards/cosine_scaled_reward": 0.005726959556341171, "rewards/format_reward": 0.916666679084301, "step": 303 }, { "completion_length": 1531.625015258789, "epoch": 0.3474285714285714, "grad_norm": 0.23862285912036896, "kl": 0.0212249755859375, "learning_rate": 4.59514935484316e-07, "loss": 0.0008, "reward": 0.33517973372363485, "reward_std": 0.8556637056171894, "rewards/cosine_scaled_reward": -0.07236111164093018, "rewards/format_reward": 0.8333333432674408, "step": 304 }, { "completion_length": 1325.145851135254, "epoch": 0.3485714285714286, "grad_norm": 0.3753778338432312, "kl": 0.013553619384765625, "learning_rate": 4.5643973913200837e-07, "loss": 0.0005, "reward": 0.1812261645682156, "reward_std": 0.7640487253665924, "rewards/cosine_scaled_reward": -0.18994974298402667, "rewards/format_reward": 0.8750000223517418, "step": 305 }, { "completion_length": 1495.1875610351562, "epoch": 0.3497142857142857, "grad_norm": 0.43624046444892883, "kl": 0.028331756591796875, "learning_rate": 4.5336910277482155e-07, "loss": 0.0011, "reward": 0.5158186480402946, "reward_std": 0.8463861420750618, "rewards/cosine_scaled_reward": 0.07386103633325547, "rewards/format_reward": 0.791666679084301, "step": 306 }, { "completion_length": 1391.8750228881836, "epoch": 0.35085714285714287, "grad_norm": 0.36552807688713074, "kl": 0.01470184326171875, "learning_rate": 4.503031760712397e-07, "loss": 0.0006, "reward": 0.4544885288923979, "reward_std": 0.9182650446891785, "rewards/cosine_scaled_reward": 0.03140606731176376, "rewards/format_reward": 0.7916666716337204, "step": 307 }, { "completion_length": 2439.062545776367, "epoch": 0.352, "grad_norm": 0.25556859374046326, "kl": 0.02754974365234375, "learning_rate": 4.4724210845020494e-07, "loss": 0.0011, "reward": 0.16432497836649418, "reward_std": 0.7285211831331253, "rewards/cosine_scaled_reward": -0.06424459861591458, "rewards/format_reward": 0.6041666734963655, "step": 308 }, { "completion_length": 2007.7709045410156, "epoch": 0.35314285714285715, "grad_norm": 0.2907513380050659, "kl": 0.02512359619140625, "learning_rate": 4.441860491038345e-07, "loss": 0.001, "reward": 0.3333761217072606, "reward_std": 0.9507267326116562, "rewards/cosine_scaled_reward": -0.06830872967839241, "rewards/format_reward": 0.8125000074505806, "step": 309 }, { "completion_length": 1377.9375610351562, "epoch": 0.35428571428571426, "grad_norm": 0.6299467086791992, "kl": 0.019756317138671875, "learning_rate": 4.4113514698014953e-07, "loss": 0.0008, "reward": 0.14710421487689018, "reward_std": 0.5830218307673931, "rewards/cosine_scaled_reward": -0.2127772723324597, "rewards/format_reward": 0.8958333358168602, "step": 310 }, { "completion_length": 1264.770851135254, "epoch": 0.3554285714285714, "grad_norm": 0.3672167658805847, "kl": 0.01552581787109375, "learning_rate": 4.3808955077581546e-07, "loss": 0.0006, "reward": 0.5835366472601891, "reward_std": 0.8306069150567055, "rewards/cosine_scaled_reward": 0.05884586926549673, "rewards/format_reward": 0.9166666716337204, "step": 311 }, { "completion_length": 1228.6875228881836, "epoch": 0.3565714285714286, "grad_norm": 0.31035158038139343, "kl": 0.01593017578125, "learning_rate": 4.350494089288943e-07, "loss": 0.0006, "reward": 0.7780788261443377, "reward_std": 0.725801732391119, "rewards/cosine_scaled_reward": 0.23832012061029673, "rewards/format_reward": 0.8333333432674408, "step": 312 }, { "completion_length": 2067.1666831970215, "epoch": 0.3577142857142857, "grad_norm": 0.3178723454475403, "kl": 0.03451347351074219, "learning_rate": 4.3201486961161093e-07, "loss": 0.0014, "reward": 0.3173049371689558, "reward_std": 0.6579355709254742, "rewards/cosine_scaled_reward": 0.03718515514628962, "rewards/format_reward": 0.6250000111758709, "step": 313 }, { "completion_length": 1668.3333892822266, "epoch": 0.3588571428571429, "grad_norm": 0.46666356921195984, "kl": 0.031169891357421875, "learning_rate": 4.2898608072313045e-07, "loss": 0.0012, "reward": 0.28569703502580523, "reward_std": 0.6306734308600426, "rewards/cosine_scaled_reward": -0.056163689121603966, "rewards/format_reward": 0.7708333395421505, "step": 314 }, { "completion_length": 2082.7084045410156, "epoch": 0.36, "grad_norm": 0.2961288392543793, "kl": 0.05887603759765625, "learning_rate": 4.2596318988235037e-07, "loss": 0.0024, "reward": 0.4379968661814928, "reward_std": 0.8023070320487022, "rewards/cosine_scaled_reward": 0.04415438207797706, "rewards/format_reward": 0.7500000074505806, "step": 315 }, { "completion_length": 2361.541732788086, "epoch": 0.36114285714285715, "grad_norm": 0.477490097284317, "kl": 0.0577545166015625, "learning_rate": 4.2294634442070553e-07, "loss": 0.0023, "reward": -0.013460966947604902, "reward_std": 0.6273909620940685, "rewards/cosine_scaled_reward": -0.19738789275288582, "rewards/format_reward": 0.6458333488553762, "step": 316 }, { "completion_length": 1918.9375686645508, "epoch": 0.36228571428571427, "grad_norm": 0.6062735319137573, "kl": 0.03968238830566406, "learning_rate": 4.1993569137498776e-07, "loss": 0.0016, "reward": 0.2778178099542856, "reward_std": 0.8473985716700554, "rewards/cosine_scaled_reward": -0.024428293108940125, "rewards/format_reward": 0.6666666828095913, "step": 317 }, { "completion_length": 1197.4792098999023, "epoch": 0.36342857142857143, "grad_norm": 0.5727298855781555, "kl": 0.025470733642578125, "learning_rate": 4.1693137748017915e-07, "loss": 0.001, "reward": 0.35039830300956964, "reward_std": 0.665754821151495, "rewards/cosine_scaled_reward": -0.09761649183928967, "rewards/format_reward": 0.9375000149011612, "step": 318 }, { "completion_length": 1549.0416946411133, "epoch": 0.36457142857142855, "grad_norm": 0.44952714443206787, "kl": 0.022411346435546875, "learning_rate": 4.1393354916230005e-07, "loss": 0.0009, "reward": 0.16405144333839417, "reward_std": 0.7381913363933563, "rewards/cosine_scaled_reward": -0.17837723344564438, "rewards/format_reward": 0.8333333488553762, "step": 319 }, { "completion_length": 1096.270866394043, "epoch": 0.3657142857142857, "grad_norm": 0.9068484902381897, "kl": 0.045513153076171875, "learning_rate": 4.1094235253127374e-07, "loss": 0.0018, "reward": 0.4451125105842948, "reward_std": 0.8512261882424355, "rewards/cosine_scaled_reward": -0.02465624047908932, "rewards/format_reward": 0.8958333395421505, "step": 320 }, { "completion_length": 1107.2083778381348, "epoch": 0.3668571428571429, "grad_norm": 0.33380454778671265, "kl": 0.016986846923828125, "learning_rate": 4.079579333738039e-07, "loss": 0.0007, "reward": 0.7083983863703907, "reward_std": 0.741938479244709, "rewards/cosine_scaled_reward": 0.13879141584038734, "rewards/format_reward": 0.9375000074505806, "step": 321 }, { "completion_length": 1815.958366394043, "epoch": 0.368, "grad_norm": 0.5257051587104797, "kl": 0.06414031982421875, "learning_rate": 4.0498043714627006e-07, "loss": 0.0026, "reward": 0.2582624601200223, "reward_std": 0.6789763048291206, "rewards/cosine_scaled_reward": -0.0668359762057662, "rewards/format_reward": 0.7500000149011612, "step": 322 }, { "completion_length": 1977.5834045410156, "epoch": 0.36914285714285716, "grad_norm": 0.5315828323364258, "kl": 0.07048416137695312, "learning_rate": 4.020100089676376e-07, "loss": 0.0028, "reward": 0.3757035471498966, "reward_std": 0.8692245557904243, "rewards/cosine_scaled_reward": 0.05770140094682574, "rewards/format_reward": 0.6458333469927311, "step": 323 }, { "completion_length": 2044.8125457763672, "epoch": 0.3702857142857143, "grad_norm": 0.5173450708389282, "kl": 0.06734085083007812, "learning_rate": 3.9904679361238526e-07, "loss": 0.0027, "reward": 0.1749881288560573, "reward_std": 0.8538544028997421, "rewards/cosine_scaled_reward": -0.14555134577676654, "rewards/format_reward": 0.7708333507180214, "step": 324 }, { "completion_length": 1892.9375267028809, "epoch": 0.37142857142857144, "grad_norm": 0.6045531034469604, "kl": 0.0422515869140625, "learning_rate": 3.9609093550344907e-07, "loss": 0.0017, "reward": 0.32839928939938545, "reward_std": 0.8866756781935692, "rewards/cosine_scaled_reward": -0.03144947811961174, "rewards/format_reward": 0.7500000093132257, "step": 325 }, { "completion_length": 1686.708366394043, "epoch": 0.37257142857142855, "grad_norm": 0.485423743724823, "kl": 0.061676025390625, "learning_rate": 3.931425787051832e-07, "loss": 0.0025, "reward": 0.39623109018430114, "reward_std": 0.8446017913520336, "rewards/cosine_scaled_reward": 0.0048162119928747416, "rewards/format_reward": 0.7708333414047956, "step": 326 }, { "completion_length": 1453.6458435058594, "epoch": 0.3737142857142857, "grad_norm": 0.2550465166568756, "kl": 0.0176849365234375, "learning_rate": 3.902018669163384e-07, "loss": 0.0007, "reward": 0.6734898695722222, "reward_std": 0.814283449202776, "rewards/cosine_scaled_reward": 0.15649090707302094, "rewards/format_reward": 0.8541666716337204, "step": 327 }, { "completion_length": 1698.7500457763672, "epoch": 0.37485714285714283, "grad_norm": 0.561379611492157, "kl": 0.03546142578125, "learning_rate": 3.872689434630585e-07, "loss": 0.0014, "reward": 0.06343854777514935, "reward_std": 0.6165986470878124, "rewards/cosine_scaled_reward": -0.18727782554924488, "rewards/format_reward": 0.7291666828095913, "step": 328 }, { "completion_length": 971.5000152587891, "epoch": 0.376, "grad_norm": 0.45329251885414124, "kl": 0.024200439453125, "learning_rate": 3.843439512918949e-07, "loss": 0.001, "reward": 0.6886619143188, "reward_std": 0.8372432589530945, "rewards/cosine_scaled_reward": 0.10075276345014572, "rewards/format_reward": 0.9791666716337204, "step": 329 }, { "completion_length": 1391.3542213439941, "epoch": 0.37714285714285717, "grad_norm": 0.7288292646408081, "kl": 0.056179046630859375, "learning_rate": 3.8142703296283953e-07, "loss": 0.0022, "reward": 0.17569798463955522, "reward_std": 0.639821320772171, "rewards/cosine_scaled_reward": -0.15391897410154343, "rewards/format_reward": 0.8125000055879354, "step": 330 }, { "completion_length": 1995.6250457763672, "epoch": 0.3782857142857143, "grad_norm": 0.7102898359298706, "kl": 0.07641983032226562, "learning_rate": 3.785183306423767e-07, "loss": 0.0031, "reward": 0.17617637664079666, "reward_std": 0.7594183348119259, "rewards/cosine_scaled_reward": -0.056006991614822255, "rewards/format_reward": 0.6041666734963655, "step": 331 }, { "completion_length": 1507.145866394043, "epoch": 0.37942857142857145, "grad_norm": 0.6774265170097351, "kl": 0.041957855224609375, "learning_rate": 3.7561798609655373e-07, "loss": 0.0017, "reward": 0.15361519530415535, "reward_std": 0.7348885871469975, "rewards/cosine_scaled_reward": -0.1517366673797369, "rewards/format_reward": 0.7708333414047956, "step": 332 }, { "completion_length": 1263.06254196167, "epoch": 0.38057142857142856, "grad_norm": 0.40281206369400024, "kl": 0.034458160400390625, "learning_rate": 3.72726140684072e-07, "loss": 0.0014, "reward": 0.4631691016256809, "reward_std": 0.8592428974807262, "rewards/cosine_scaled_reward": -0.046608994947746396, "rewards/format_reward": 0.9583333432674408, "step": 333 }, { "completion_length": 2013.9583740234375, "epoch": 0.38171428571428573, "grad_norm": 0.7402753829956055, "kl": 0.076751708984375, "learning_rate": 3.6984293534939737e-07, "loss": 0.0031, "reward": 0.025389771908521652, "reward_std": 0.8407883942127228, "rewards/cosine_scaled_reward": -0.1560281114652753, "rewards/format_reward": 0.583333345130086, "step": 334 }, { "completion_length": 1345.2291946411133, "epoch": 0.38285714285714284, "grad_norm": 0.40346795320510864, "kl": 0.027713775634765625, "learning_rate": 3.6696851061588994e-07, "loss": 0.0011, "reward": 0.43492193752899766, "reward_std": 0.6728620305657387, "rewards/cosine_scaled_reward": -0.019926004111766815, "rewards/format_reward": 0.8958333358168602, "step": 335 }, { "completion_length": 1604.708381652832, "epoch": 0.384, "grad_norm": 0.46796372532844543, "kl": 0.049716949462890625, "learning_rate": 3.641030065789562e-07, "loss": 0.002, "reward": 0.4504225810524076, "reward_std": 0.7610187456011772, "rewards/cosine_scaled_reward": 0.016421111300587654, "rewards/format_reward": 0.833333358168602, "step": 336 }, { "completion_length": 1863.1875457763672, "epoch": 0.3851428571428571, "grad_norm": 0.5092529058456421, "kl": 0.09614944458007812, "learning_rate": 3.612465628992203e-07, "loss": 0.0038, "reward": 0.38631977140903473, "reward_std": 0.9551087282598019, "rewards/cosine_scaled_reward": -0.020930441562086344, "rewards/format_reward": 0.7916666753590107, "step": 337 }, { "completion_length": 1182.8542175292969, "epoch": 0.3862857142857143, "grad_norm": 0.5370055437088013, "kl": 0.021060943603515625, "learning_rate": 3.5839931879571725e-07, "loss": 0.0008, "reward": 0.48011522740125656, "reward_std": 0.71619638428092, "rewards/cosine_scaled_reward": 0.003307923674583435, "rewards/format_reward": 0.895833333954215, "step": 338 }, { "completion_length": 1895.6250610351562, "epoch": 0.38742857142857146, "grad_norm": 0.8054518699645996, "kl": 0.08332443237304688, "learning_rate": 3.555614130391079e-07, "loss": 0.0033, "reward": 0.19063638825900853, "reward_std": 0.5477262288331985, "rewards/cosine_scaled_reward": -0.08626681286841631, "rewards/format_reward": 0.7083333525806665, "step": 339 }, { "completion_length": 1568.1250305175781, "epoch": 0.38857142857142857, "grad_norm": 0.42964616417884827, "kl": 0.040958404541015625, "learning_rate": 3.5273298394491515e-07, "loss": 0.0016, "reward": 0.2769077487755567, "reward_std": 0.7800903655588627, "rewards/cosine_scaled_reward": -0.062285197753226385, "rewards/format_reward": 0.7500000111758709, "step": 340 }, { "completion_length": 1290.6250228881836, "epoch": 0.38971428571428574, "grad_norm": 0.5903887748718262, "kl": 0.032962799072265625, "learning_rate": 3.4991416936678276e-07, "loss": 0.0013, "reward": 0.625646581640467, "reward_std": 0.7541246488690376, "rewards/cosine_scaled_reward": 0.13255258556455374, "rewards/format_reward": 0.8333333414047956, "step": 341 }, { "completion_length": 1680.229232788086, "epoch": 0.39085714285714285, "grad_norm": 0.6661213040351868, "kl": 0.07326126098632812, "learning_rate": 3.471051066897562e-07, "loss": 0.0029, "reward": 0.4082430477719754, "reward_std": 0.7617458440363407, "rewards/cosine_scaled_reward": -0.03322136774659157, "rewards/format_reward": 0.8750000037252903, "step": 342 }, { "completion_length": 1694.3125457763672, "epoch": 0.392, "grad_norm": 0.9473098516464233, "kl": 0.058170318603515625, "learning_rate": 3.4430593282358777e-07, "loss": 0.0023, "reward": 0.325529879424721, "reward_std": 0.8013235367834568, "rewards/cosine_scaled_reward": -0.08207336533814669, "rewards/format_reward": 0.854166679084301, "step": 343 }, { "completion_length": 1449.3958892822266, "epoch": 0.3931428571428571, "grad_norm": 0.3888903856277466, "kl": 0.06012725830078125, "learning_rate": 3.4151678419606233e-07, "loss": 0.0024, "reward": 0.9358495399355888, "reward_std": 0.7060995027422905, "rewards/cosine_scaled_reward": 0.31333022052422166, "rewards/format_reward": 0.8958333432674408, "step": 344 }, { "completion_length": 1473.0417098999023, "epoch": 0.3942857142857143, "grad_norm": 0.605884313583374, "kl": 0.04383659362792969, "learning_rate": 3.387377967463493e-07, "loss": 0.0018, "reward": 0.25372491776943207, "reward_std": 0.7733400501310825, "rewards/cosine_scaled_reward": -0.10962994769215584, "rewards/format_reward": 0.8125000223517418, "step": 345 }, { "completion_length": 1525.5833892822266, "epoch": 0.3954285714285714, "grad_norm": 0.41509008407592773, "kl": 0.0490264892578125, "learning_rate": 3.359691059183761e-07, "loss": 0.002, "reward": 0.39042545296251774, "reward_std": 0.8485492803156376, "rewards/cosine_scaled_reward": -0.09566876385360956, "rewards/format_reward": 0.9583333432674408, "step": 346 }, { "completion_length": 1555.5208740234375, "epoch": 0.3965714285714286, "grad_norm": 0.6986513137817383, "kl": 0.058147430419921875, "learning_rate": 3.3321084665422803e-07, "loss": 0.0023, "reward": 0.1333858126308769, "reward_std": 0.7080183140933514, "rewards/cosine_scaled_reward": -0.23809866607189178, "rewards/format_reward": 0.9166666865348816, "step": 347 }, { "completion_length": 1694.0625686645508, "epoch": 0.3977142857142857, "grad_norm": 0.7975203394889832, "kl": 0.0868682861328125, "learning_rate": 3.3046315338757026e-07, "loss": 0.0035, "reward": 0.39133906550705433, "reward_std": 0.7762907817959785, "rewards/cosine_scaled_reward": -0.012924212962388992, "rewards/format_reward": 0.8125000111758709, "step": 348 }, { "completion_length": 1404.1667404174805, "epoch": 0.39885714285714285, "grad_norm": 3.111985445022583, "kl": 0.11113739013671875, "learning_rate": 3.2772616003709616e-07, "loss": 0.0044, "reward": 0.47680344711989164, "reward_std": 0.7106522209942341, "rewards/cosine_scaled_reward": 0.014676447957754135, "rewards/format_reward": 0.8750000149011612, "step": 349 }, { "completion_length": 820.4791831970215, "epoch": 0.4, "grad_norm": 0.2917865216732025, "kl": 0.01514434814453125, "learning_rate": 3.250000000000001e-07, "loss": 0.0006, "reward": 0.4086197968572378, "reward_std": 0.7495947815477848, "rewards/cosine_scaled_reward": -0.09753232356160879, "rewards/format_reward": 1.0, "step": 350 }, { "completion_length": 1446.520866394043, "epoch": 0.40114285714285713, "grad_norm": 0.7599288821220398, "kl": 0.07086944580078125, "learning_rate": 3.222848061454764e-07, "loss": 0.0028, "reward": 0.3731183987110853, "reward_std": 0.7764743529260159, "rewards/cosine_scaled_reward": -0.060001387260854244, "rewards/format_reward": 0.8750000055879354, "step": 351 }, { "completion_length": 1387.0000381469727, "epoch": 0.4022857142857143, "grad_norm": 0.7137279510498047, "kl": 0.06012725830078125, "learning_rate": 3.195807108082429e-07, "loss": 0.0024, "reward": 0.42939993026084267, "reward_std": 0.8450379781424999, "rewards/cosine_scaled_reward": 0.003599647810915485, "rewards/format_reward": 0.8125000055879354, "step": 352 }, { "completion_length": 1223.083366394043, "epoch": 0.4034285714285714, "grad_norm": 0.5175493359565735, "kl": 0.0400390625, "learning_rate": 3.168878457820915e-07, "loss": 0.0016, "reward": 0.716075923293829, "reward_std": 0.9309490397572517, "rewards/cosine_scaled_reward": 0.1352907968685031, "rewards/format_reward": 0.9375000074505806, "step": 353 }, { "completion_length": 1064.8333587646484, "epoch": 0.4045714285714286, "grad_norm": 0.6795687079429626, "kl": 0.04412841796875, "learning_rate": 3.142063423134644e-07, "loss": 0.0018, "reward": 0.5015737505163997, "reward_std": 0.5811274200677872, "rewards/cosine_scaled_reward": 0.009966753888875246, "rewards/format_reward": 0.9375, "step": 354 }, { "completion_length": 972.6875152587891, "epoch": 0.4057142857142857, "grad_norm": 0.3503471910953522, "kl": 0.02288818359375, "learning_rate": 3.115363310950578e-07, "loss": 0.0009, "reward": 0.6657696301117539, "reward_std": 0.7398427277803421, "rewards/cosine_scaled_reward": 0.08280336670577526, "rewards/format_reward": 1.0, "step": 355 }, { "completion_length": 1551.833396911621, "epoch": 0.40685714285714286, "grad_norm": 0.4588610529899597, "kl": 0.06862258911132812, "learning_rate": 3.0887794225945143e-07, "loss": 0.0027, "reward": 0.3966330944094807, "reward_std": 0.6840399689972401, "rewards/cosine_scaled_reward": -0.058146869763731956, "rewards/format_reward": 0.9166666716337204, "step": 356 }, { "completion_length": 1772.1042175292969, "epoch": 0.408, "grad_norm": 1.0075832605361938, "kl": 0.100799560546875, "learning_rate": 3.062313053727671e-07, "loss": 0.004, "reward": 0.12496365327388048, "reward_std": 0.8355611003935337, "rewards/cosine_scaled_reward": -0.15991984121501446, "rewards/format_reward": 0.7291666828095913, "step": 357 }, { "completion_length": 1562.562515258789, "epoch": 0.40914285714285714, "grad_norm": 0.7331953644752502, "kl": 0.07072067260742188, "learning_rate": 3.0359654942835247e-07, "loss": 0.0028, "reward": 0.5806331331841648, "reward_std": 0.8575869612395763, "rewards/cosine_scaled_reward": 0.05609727092087269, "rewards/format_reward": 0.9166666716337204, "step": 358 }, { "completion_length": 942.1666870117188, "epoch": 0.4102857142857143, "grad_norm": 0.5968821048736572, "kl": 0.0392608642578125, "learning_rate": 3.0097380284049523e-07, "loss": 0.0016, "reward": 0.42657787445932627, "reward_std": 0.5996944792568684, "rewards/cosine_scaled_reward": -0.06319771538255736, "rewards/format_reward": 0.9791666716337204, "step": 359 }, { "completion_length": 1402.3125381469727, "epoch": 0.4114285714285714, "grad_norm": 0.7901036143302917, "kl": 0.07848739624023438, "learning_rate": 2.9836319343816397e-07, "loss": 0.0031, "reward": 0.5172299258410931, "reward_std": 0.8756576031446457, "rewards/cosine_scaled_reward": 0.017652488488238305, "rewards/format_reward": 0.8958333507180214, "step": 360 }, { "completion_length": 1255.0000534057617, "epoch": 0.4125714285714286, "grad_norm": 0.7914440035820007, "kl": 0.0751495361328125, "learning_rate": 2.9576484845877793e-07, "loss": 0.003, "reward": 0.24485921673476696, "reward_std": 0.7093796096742153, "rewards/cosine_scaled_reward": -0.14366490487009287, "rewards/format_reward": 0.8750000149011612, "step": 361 }, { "completion_length": 899.2916831970215, "epoch": 0.4137142857142857, "grad_norm": 0.6618494391441345, "kl": 0.0485382080078125, "learning_rate": 2.931788945420058e-07, "loss": 0.0019, "reward": 0.510432411916554, "reward_std": 0.6674188002943993, "rewards/cosine_scaled_reward": 0.010434551164507866, "rewards/format_reward": 0.9375000074505806, "step": 362 }, { "completion_length": 863.9166946411133, "epoch": 0.41485714285714287, "grad_norm": 0.7848973274230957, "kl": 0.029571533203125, "learning_rate": 2.9060545772359305e-07, "loss": 0.0012, "reward": 0.844133562874049, "reward_std": 0.6796937808394432, "rewards/cosine_scaled_reward": 0.20261266455054283, "rewards/format_reward": 1.0, "step": 363 }, { "completion_length": 1205.1666870117188, "epoch": 0.416, "grad_norm": 0.5032205581665039, "kl": 0.03617095947265625, "learning_rate": 2.8804466342921987e-07, "loss": 0.0014, "reward": 0.08574948133900762, "reward_std": 0.4307812377810478, "rewards/cosine_scaled_reward": -0.26402094028890133, "rewards/format_reward": 0.9375000074505806, "step": 364 }, { "completion_length": 1884.0625686645508, "epoch": 0.41714285714285715, "grad_norm": 0.9030827283859253, "kl": 0.1519775390625, "learning_rate": 2.854966364683872e-07, "loss": 0.0061, "reward": 0.17442028690129519, "reward_std": 0.685190960764885, "rewards/cosine_scaled_reward": -0.09430265240371227, "rewards/format_reward": 0.6875000093132257, "step": 365 }, { "completion_length": 1026.7708644866943, "epoch": 0.41828571428571426, "grad_norm": 0.6462038159370422, "kl": 0.0249786376953125, "learning_rate": 2.829615010283344e-07, "loss": 0.001, "reward": 0.7736296411603689, "reward_std": 0.8399604074656963, "rewards/cosine_scaled_reward": 0.15821054810658097, "rewards/format_reward": 0.9791666716337204, "step": 366 }, { "completion_length": 1750.541748046875, "epoch": 0.41942857142857143, "grad_norm": 0.8086258769035339, "kl": 0.12229537963867188, "learning_rate": 2.8043938066798645e-07, "loss": 0.0049, "reward": 0.40639928355813026, "reward_std": 0.6661986261606216, "rewards/cosine_scaled_reward": -0.005629323422908783, "rewards/format_reward": 0.8333333358168602, "step": 367 }, { "completion_length": 1828.7917289733887, "epoch": 0.4205714285714286, "grad_norm": 1.4647676944732666, "kl": 0.092529296875, "learning_rate": 2.7793039831193133e-07, "loss": 0.0037, "reward": 0.3596602795878425, "reward_std": 0.8183485567569733, "rewards/cosine_scaled_reward": -0.04840289568528533, "rewards/format_reward": 0.8333333395421505, "step": 368 }, { "completion_length": 1507.0208854675293, "epoch": 0.4217142857142857, "grad_norm": 0.9977928996086121, "kl": 0.105255126953125, "learning_rate": 2.7543467624442956e-07, "loss": 0.0042, "reward": 0.3323223125189543, "reward_std": 0.8554218038916588, "rewards/cosine_scaled_reward": -0.0706447935081087, "rewards/format_reward": 0.8333333432674408, "step": 369 }, { "completion_length": 1375.958366394043, "epoch": 0.4228571428571429, "grad_norm": 0.6904154419898987, "kl": 0.1561279296875, "learning_rate": 2.729523361034538e-07, "loss": 0.0062, "reward": 0.2625284339301288, "reward_std": 0.57734365016222, "rewards/cosine_scaled_reward": -0.11408653669059277, "rewards/format_reward": 0.8541666679084301, "step": 370 }, { "completion_length": 747.4166831970215, "epoch": 0.424, "grad_norm": 0.7088914513587952, "kl": 0.04413604736328125, "learning_rate": 2.7048349887476037e-07, "loss": 0.0018, "reward": 0.6137365428730845, "reward_std": 0.7054285481572151, "rewards/cosine_scaled_reward": 0.06669859914109111, "rewards/format_reward": 0.9583333432674408, "step": 371 }, { "completion_length": 1583.250015258789, "epoch": 0.42514285714285716, "grad_norm": 1.3554654121398926, "kl": 0.08654022216796875, "learning_rate": 2.6802828488599294e-07, "loss": 0.0035, "reward": 0.5319191414746456, "reward_std": 0.6091607809066772, "rewards/cosine_scaled_reward": 0.052222222089767456, "rewards/format_reward": 0.8958333507180214, "step": 372 }, { "completion_length": 840.7500114440918, "epoch": 0.42628571428571427, "grad_norm": 0.823254406452179, "kl": 0.02849578857421875, "learning_rate": 2.655868138008171e-07, "loss": 0.0011, "reward": 0.23188667371869087, "reward_std": 0.6112702935934067, "rewards/cosine_scaled_reward": -0.20857458282262087, "rewards/format_reward": 1.0, "step": 373 }, { "completion_length": 1146.895881652832, "epoch": 0.42742857142857144, "grad_norm": 0.8236629962921143, "kl": 0.047637939453125, "learning_rate": 2.631592046130896e-07, "loss": 0.0019, "reward": 0.5981785822659731, "reward_std": 0.8187721818685532, "rewards/cosine_scaled_reward": 0.03387853177264333, "rewards/format_reward": 0.9791666716337204, "step": 374 }, { "completion_length": 1634.3750457763672, "epoch": 0.42857142857142855, "grad_norm": 1.242490530014038, "kl": 0.20917510986328125, "learning_rate": 2.6074557564105724e-07, "loss": 0.0084, "reward": 0.49338567443192005, "reward_std": 0.8198688104748726, "rewards/cosine_scaled_reward": 0.07417132705450058, "rewards/format_reward": 0.7708333469927311, "step": 375 }, { "completion_length": 1403.1875381469727, "epoch": 0.4297142857142857, "grad_norm": 1.0553152561187744, "kl": 0.1156768798828125, "learning_rate": 2.583460445215911e-07, "loss": 0.0046, "reward": 0.4271426647901535, "reward_std": 0.7959622256457806, "rewards/cosine_scaled_reward": -0.03355884738266468, "rewards/format_reward": 0.8958333432674408, "step": 376 }, { "completion_length": 1332.0417098999023, "epoch": 0.4308571428571429, "grad_norm": 0.9126045107841492, "kl": 0.1114654541015625, "learning_rate": 2.5596072820445254e-07, "loss": 0.0045, "reward": 0.5075240693986416, "reward_std": 0.855946060270071, "rewards/cosine_scaled_reward": -0.0022887131199240685, "rewards/format_reward": 0.9375000149011612, "step": 377 }, { "completion_length": 1077.4583854675293, "epoch": 0.432, "grad_norm": 0.5990425944328308, "kl": 0.030300140380859375, "learning_rate": 2.5358974294659373e-07, "loss": 0.0012, "reward": 0.5777185422666662, "reward_std": 0.9156284630298615, "rewards/cosine_scaled_reward": 0.008807800710201263, "rewards/format_reward": 1.0, "step": 378 }, { "completion_length": 1503.7708587646484, "epoch": 0.43314285714285716, "grad_norm": 0.6305628418922424, "kl": 0.13201522827148438, "learning_rate": 2.512332043064913e-07, "loss": 0.0053, "reward": 0.3502848669886589, "reward_std": 0.7390433885157108, "rewards/cosine_scaled_reward": -0.06138859502971172, "rewards/format_reward": 0.8541666679084301, "step": 379 }, { "completion_length": 1135.229175567627, "epoch": 0.4342857142857143, "grad_norm": 1.0688378810882568, "kl": 0.11534881591796875, "learning_rate": 2.488912271385139e-07, "loss": 0.0046, "reward": 0.415515000699088, "reward_std": 0.7041169926524162, "rewards/cosine_scaled_reward": -0.053237104788422585, "rewards/format_reward": 0.9375000149011612, "step": 380 }, { "completion_length": 1531.4583854675293, "epoch": 0.43542857142857144, "grad_norm": 1.1264177560806274, "kl": 0.13215255737304688, "learning_rate": 2.465639255873246e-07, "loss": 0.0053, "reward": 0.24819382751593366, "reward_std": 0.7195433788001537, "rewards/cosine_scaled_reward": -0.15285431523807347, "rewards/format_reward": 0.8958333507180214, "step": 381 }, { "completion_length": 1023.3750305175781, "epoch": 0.43657142857142855, "grad_norm": 0.40737247467041016, "kl": 0.0533447265625, "learning_rate": 2.4425141308231765e-07, "loss": 0.0021, "reward": 0.2534531052224338, "reward_std": 0.6627279557287693, "rewards/cosine_scaled_reward": -0.19920427445322275, "rewards/format_reward": 1.0, "step": 382 }, { "completion_length": 1496.5417251586914, "epoch": 0.4377142857142857, "grad_norm": 1.8750388622283936, "kl": 0.2808380126953125, "learning_rate": 2.4195380233209006e-07, "loss": 0.0112, "reward": 0.3533962171059102, "reward_std": 0.8847429379820824, "rewards/cosine_scaled_reward": -0.04301046393811703, "rewards/format_reward": 0.8125000074505806, "step": 383 }, { "completion_length": 1054.5833435058594, "epoch": 0.43885714285714283, "grad_norm": 0.5143475532531738, "kl": 0.0340118408203125, "learning_rate": 2.3967120531894857e-07, "loss": 0.0014, "reward": 0.9491607993841171, "reward_std": 0.8766042143106461, "rewards/cosine_scaled_reward": 0.2751440554857254, "rewards/format_reward": 0.9791666716337204, "step": 384 }, { "completion_length": 1428.645896911621, "epoch": 0.44, "grad_norm": 0.6951473355293274, "kl": 0.14729690551757812, "learning_rate": 2.374037332934512e-07, "loss": 0.0059, "reward": 0.4135100084822625, "reward_std": 0.7766407653689384, "rewards/cosine_scaled_reward": -0.03412807872518897, "rewards/format_reward": 0.8750000111758709, "step": 385 }, { "completion_length": 1270.7291793823242, "epoch": 0.44114285714285717, "grad_norm": 1.0250437259674072, "kl": 0.1134185791015625, "learning_rate": 2.3515149676898552e-07, "loss": 0.0045, "reward": 0.47308777272701263, "reward_std": 0.6565175838768482, "rewards/cosine_scaled_reward": -0.013674074783921242, "rewards/format_reward": 0.9375, "step": 386 }, { "completion_length": 1528.3542213439941, "epoch": 0.4422857142857143, "grad_norm": 2.0494582653045654, "kl": 0.19681549072265625, "learning_rate": 2.3291460551638237e-07, "loss": 0.0079, "reward": 0.31283304444514215, "reward_std": 0.583834994584322, "rewards/cosine_scaled_reward": -0.06756466627120972, "rewards/format_reward": 0.8333333469927311, "step": 387 }, { "completion_length": 1097.583351135254, "epoch": 0.44342857142857145, "grad_norm": 0.5577840209007263, "kl": 0.0581207275390625, "learning_rate": 2.306931685585657e-07, "loss": 0.0023, "reward": 0.7101229609397706, "reward_std": 0.6756577789783478, "rewards/cosine_scaled_reward": 0.1360636167228222, "rewards/format_reward": 0.9583333358168602, "step": 388 }, { "completion_length": 1202.3958587646484, "epoch": 0.44457142857142856, "grad_norm": 0.6137182712554932, "kl": 0.0680389404296875, "learning_rate": 2.2848729416523859e-07, "loss": 0.0027, "reward": 0.27682637330144644, "reward_std": 0.7127226404845715, "rewards/cosine_scaled_reward": -0.1509529883041978, "rewards/format_reward": 0.9375000074505806, "step": 389 }, { "completion_length": 1435.2917213439941, "epoch": 0.44571428571428573, "grad_norm": 0.9228805303573608, "kl": 0.1553211212158203, "learning_rate": 2.2629708984760706e-07, "loss": 0.0062, "reward": 0.21503479685634375, "reward_std": 0.7798956334590912, "rewards/cosine_scaled_reward": -0.11404814245179296, "rewards/format_reward": 0.7708333544433117, "step": 390 }, { "completion_length": 999.9167022705078, "epoch": 0.44685714285714284, "grad_norm": 0.9710776209831238, "kl": 0.08495330810546875, "learning_rate": 2.2412266235313973e-07, "loss": 0.0034, "reward": 0.6504320180974901, "reward_std": 0.6109317727386951, "rewards/cosine_scaled_reward": 0.1076424578204751, "rewards/format_reward": 0.9375000149011612, "step": 391 }, { "completion_length": 1423.895881652832, "epoch": 0.448, "grad_norm": 1.6490659713745117, "kl": 0.223846435546875, "learning_rate": 2.2196411766036487e-07, "loss": 0.009, "reward": 0.1661514127627015, "reward_std": 0.7237707450985909, "rewards/cosine_scaled_reward": -0.1779894083738327, "rewards/format_reward": 0.833333333954215, "step": 392 }, { "completion_length": 1450.4792022705078, "epoch": 0.4491428571428571, "grad_norm": 1.2397853136062622, "kl": 0.10369110107421875, "learning_rate": 2.1982156097370557e-07, "loss": 0.0042, "reward": 0.49226129427552223, "reward_std": 0.8764687478542328, "rewards/cosine_scaled_reward": 0.0059880828484892845, "rewards/format_reward": 0.8958333358168602, "step": 393 }, { "completion_length": 1569.3958702087402, "epoch": 0.4502857142857143, "grad_norm": 1.4030894041061401, "kl": 0.1822967529296875, "learning_rate": 2.1769509671835223e-07, "loss": 0.0073, "reward": 0.11027060728520155, "reward_std": 0.6850339062511921, "rewards/cosine_scaled_reward": -0.21509641967713833, "rewards/format_reward": 0.8333333432674408, "step": 394 }, { "completion_length": 1156.7917175292969, "epoch": 0.4514285714285714, "grad_norm": 0.9848508238792419, "kl": 0.077606201171875, "learning_rate": 2.1558482853517253e-07, "loss": 0.0031, "reward": 0.41267195832915604, "reward_std": 0.7016527280211449, "rewards/cosine_scaled_reward": -0.04957490786910057, "rewards/format_reward": 0.916666679084301, "step": 395 }, { "completion_length": 1016.1458511352539, "epoch": 0.45257142857142857, "grad_norm": 0.6101754903793335, "kl": 0.020412445068359375, "learning_rate": 2.134908592756607e-07, "loss": 0.0008, "reward": 0.4085001898929477, "reward_std": 0.6500335298478603, "rewards/cosine_scaled_reward": -0.058247581124305725, "rewards/format_reward": 0.9375000074505806, "step": 396 }, { "completion_length": 1285.4583892822266, "epoch": 0.45371428571428574, "grad_norm": 0.699232280254364, "kl": 0.16654586791992188, "learning_rate": 2.1141329099692406e-07, "loss": 0.0067, "reward": 0.2716854903846979, "reward_std": 0.6572816967964172, "rewards/cosine_scaled_reward": -0.10944613942410797, "rewards/format_reward": 0.8541666697710752, "step": 397 }, { "completion_length": 1064.0625305175781, "epoch": 0.45485714285714285, "grad_norm": 0.4855163097381592, "kl": 0.034351348876953125, "learning_rate": 2.0935222495670968e-07, "loss": 0.0014, "reward": 0.40476914402097464, "reward_std": 0.6724813655018806, "rewards/cosine_scaled_reward": -0.0830375433433801, "rewards/format_reward": 0.9791666716337204, "step": 398 }, { "completion_length": 944.5417098999023, "epoch": 0.456, "grad_norm": 0.42374876141548157, "kl": 0.0376739501953125, "learning_rate": 2.0730776160846853e-07, "loss": 0.0015, "reward": 0.5400755191221833, "reward_std": 0.7856027092784643, "rewards/cosine_scaled_reward": -0.010072574485093355, "rewards/format_reward": 1.0, "step": 399 }, { "completion_length": 898.6875152587891, "epoch": 0.45714285714285713, "grad_norm": 0.6626017689704895, "kl": 0.0309600830078125, "learning_rate": 2.0528000059645995e-07, "loss": 0.0012, "reward": 0.7812614720314741, "reward_std": 0.6693713776767254, "rewards/cosine_scaled_reward": 0.1816884014988318, "rewards/format_reward": 0.9583333358168602, "step": 400 }, { "completion_length": 1256.6458740234375, "epoch": 0.4582857142857143, "grad_norm": 1.2477692365646362, "kl": 0.08155059814453125, "learning_rate": 2.032690407508949e-07, "loss": 0.0033, "reward": 0.5257098386064172, "reward_std": 0.81386823579669, "rewards/cosine_scaled_reward": 0.011443385854363441, "rewards/format_reward": 0.9375000074505806, "step": 401 }, { "completion_length": 1103.5208587646484, "epoch": 0.4594285714285714, "grad_norm": 3.268733501434326, "kl": 0.19863510131835938, "learning_rate": 2.0127498008311922e-07, "loss": 0.0079, "reward": 0.4599206205457449, "reward_std": 0.5488756932318211, "rewards/cosine_scaled_reward": -0.017510680481791496, "rewards/format_reward": 0.9375000149011612, "step": 402 }, { "completion_length": 931.4375190734863, "epoch": 0.4605714285714286, "grad_norm": 1.2398287057876587, "kl": 0.09774017333984375, "learning_rate": 1.9929791578083655e-07, "loss": 0.0039, "reward": 0.45444739051163197, "reward_std": 0.4980122298002243, "rewards/cosine_scaled_reward": -0.007617715746164322, "rewards/format_reward": 0.9166666679084301, "step": 403 }, { "completion_length": 1211.6041870117188, "epoch": 0.4617142857142857, "grad_norm": 1.140418529510498, "kl": 0.15045547485351562, "learning_rate": 1.9733794420337213e-07, "loss": 0.006, "reward": 0.4547617736971006, "reward_std": 0.5626656413078308, "rewards/cosine_scaled_reward": -0.01060919463634491, "rewards/format_reward": 0.9166666716337204, "step": 404 }, { "completion_length": 974.6875305175781, "epoch": 0.46285714285714286, "grad_norm": 1.321244239807129, "kl": 0.094329833984375, "learning_rate": 1.9539516087697517e-07, "loss": 0.0038, "reward": 0.7235147282481194, "reward_std": 0.8120486699044704, "rewards/cosine_scaled_reward": 0.14601082005538046, "rewards/format_reward": 0.9375000149011612, "step": 405 }, { "completion_length": 1133.5416946411133, "epoch": 0.464, "grad_norm": 0.6807760000228882, "kl": 0.1176910400390625, "learning_rate": 1.934696604901642e-07, "loss": 0.0047, "reward": 0.6748548084869981, "reward_std": 0.78925821185112, "rewards/cosine_scaled_reward": 0.10528674224042334, "rewards/format_reward": 0.9583333432674408, "step": 406 }, { "completion_length": 1320.2500305175781, "epoch": 0.46514285714285714, "grad_norm": 1.170783519744873, "kl": 0.21154022216796875, "learning_rate": 1.915615368891117e-07, "loss": 0.0085, "reward": 0.4940835009329021, "reward_std": 0.5547701120376587, "rewards/cosine_scaled_reward": 0.037424055859446526, "rewards/format_reward": 0.8750000055879354, "step": 407 }, { "completion_length": 1262.9375457763672, "epoch": 0.4662857142857143, "grad_norm": 0.8598933815956116, "kl": 0.14272689819335938, "learning_rate": 1.8967088307307e-07, "loss": 0.0057, "reward": 0.404069249285385, "reward_std": 0.7031322456896305, "rewards/cosine_scaled_reward": -0.0431424961425364, "rewards/format_reward": 0.895833333954215, "step": 408 }, { "completion_length": 1639.5000305175781, "epoch": 0.4674285714285714, "grad_norm": 1.15615975856781, "kl": 0.17174530029296875, "learning_rate": 1.8779779118983867e-07, "loss": 0.0069, "reward": 0.31328563997521996, "reward_std": 0.7064049206674099, "rewards/cosine_scaled_reward": -0.11326186126098037, "rewards/format_reward": 0.9166666865348816, "step": 409 }, { "completion_length": 1499.333366394043, "epoch": 0.4685714285714286, "grad_norm": 1.267825961112976, "kl": 0.22209548950195312, "learning_rate": 1.8594235253127372e-07, "loss": 0.0089, "reward": 0.3295501284301281, "reward_std": 0.6071663275361061, "rewards/cosine_scaled_reward": -0.09065644256770611, "rewards/format_reward": 0.8958333358168602, "step": 410 }, { "completion_length": 1507.6041946411133, "epoch": 0.4697142857142857, "grad_norm": 1.4931728839874268, "kl": 0.13777923583984375, "learning_rate": 1.8410465752883758e-07, "loss": 0.0055, "reward": 0.3418419687077403, "reward_std": 0.8959544003009796, "rewards/cosine_scaled_reward": -0.10114361811429262, "rewards/format_reward": 0.8958333395421505, "step": 411 }, { "completion_length": 1131.0208587646484, "epoch": 0.47085714285714286, "grad_norm": 1.660409688949585, "kl": 0.13482666015625, "learning_rate": 1.822847957491922e-07, "loss": 0.0054, "reward": 0.5560126416385174, "reward_std": 0.7937713004648685, "rewards/cosine_scaled_reward": 0.06759718805551529, "rewards/format_reward": 0.8750000149011612, "step": 412 }, { "completion_length": 1269.9375267028809, "epoch": 0.472, "grad_norm": 0.629540205001831, "kl": 0.112335205078125, "learning_rate": 1.804828558898332e-07, "loss": 0.0045, "reward": 0.46968742460012436, "reward_std": 0.6266082711517811, "rewards/cosine_scaled_reward": 0.005679788533598185, "rewards/format_reward": 0.8958333358168602, "step": 413 }, { "completion_length": 1644.041732788086, "epoch": 0.47314285714285714, "grad_norm": 1.6342582702636719, "kl": 0.16037750244140625, "learning_rate": 1.7869892577476722e-07, "loss": 0.0064, "reward": 0.17194935493171215, "reward_std": 0.7680409215390682, "rewards/cosine_scaled_reward": -0.1542492527514696, "rewards/format_reward": 0.7916666846722364, "step": 414 }, { "completion_length": 1607.4167022705078, "epoch": 0.4742857142857143, "grad_norm": 2.081047296524048, "kl": 0.315277099609375, "learning_rate": 1.7693309235023127e-07, "loss": 0.0126, "reward": 0.15895176446065307, "reward_std": 0.7137060277163982, "rewards/cosine_scaled_reward": -0.0956956222653389, "rewards/format_reward": 0.6666666865348816, "step": 415 }, { "completion_length": 1069.0208587646484, "epoch": 0.4754285714285714, "grad_norm": 1.11202871799469, "kl": 0.07001304626464844, "learning_rate": 1.7518544168045524e-07, "loss": 0.0028, "reward": 0.8406836800277233, "reward_std": 0.7853529006242752, "rewards/cosine_scaled_reward": 0.20668393187224865, "rewards/format_reward": 0.9791666716337204, "step": 416 }, { "completion_length": 1633.3125534057617, "epoch": 0.4765714285714286, "grad_norm": 2.8300588130950928, "kl": 0.274017333984375, "learning_rate": 1.7345605894346726e-07, "loss": 0.011, "reward": 0.2972092959098518, "reward_std": 0.8013018406927586, "rewards/cosine_scaled_reward": -0.08885959023609757, "rewards/format_reward": 0.833333358168602, "step": 417 }, { "completion_length": 1035.1041870117188, "epoch": 0.4777142857142857, "grad_norm": 0.9021016359329224, "kl": 0.11577606201171875, "learning_rate": 1.7174502842694212e-07, "loss": 0.0046, "reward": 0.8001389261335135, "reward_std": 0.8382466398179531, "rewards/cosine_scaled_reward": 0.18563345912843943, "rewards/format_reward": 0.9583333358168602, "step": 418 }, { "completion_length": 1300.9792175292969, "epoch": 0.47885714285714287, "grad_norm": 1.542521595954895, "kl": 0.1188812255859375, "learning_rate": 1.7005243352409333e-07, "loss": 0.0048, "reward": 0.47798004280775785, "reward_std": 0.6525602787733078, "rewards/cosine_scaled_reward": -0.022527330555021763, "rewards/format_reward": 0.9583333432674408, "step": 419 }, { "completion_length": 774.5833587646484, "epoch": 0.48, "grad_norm": 0.5312778949737549, "kl": 0.01689910888671875, "learning_rate": 1.6837835672960831e-07, "loss": 0.0007, "reward": 0.34607438833336346, "reward_std": 0.6675878167152405, "rewards/cosine_scaled_reward": -0.12272587232291698, "rewards/format_reward": 0.9791666716337204, "step": 420 }, { "completion_length": 1449.6250381469727, "epoch": 0.48114285714285715, "grad_norm": 1.6367040872573853, "kl": 0.283203125, "learning_rate": 1.6672287963562852e-07, "loss": 0.0114, "reward": 0.3528535794466734, "reward_std": 0.7183955535292625, "rewards/cosine_scaled_reward": -0.09002240933477879, "rewards/format_reward": 0.916666679084301, "step": 421 }, { "completion_length": 1520.2917251586914, "epoch": 0.48228571428571426, "grad_norm": 1.4518808126449585, "kl": 0.29302215576171875, "learning_rate": 1.6508608292777203e-07, "loss": 0.0117, "reward": 0.46497548231855035, "reward_std": 0.6263095885515213, "rewards/cosine_scaled_reward": 0.00397377647459507, "rewards/format_reward": 0.8958333432674408, "step": 422 }, { "completion_length": 1500.2500534057617, "epoch": 0.48342857142857143, "grad_norm": 1.8499155044555664, "kl": 0.3067626953125, "learning_rate": 1.6346804638120098e-07, "loss": 0.0122, "reward": 0.27377578942105174, "reward_std": 0.6246924735605717, "rewards/cosine_scaled_reward": -0.08783163502812386, "rewards/format_reward": 0.812500013038516, "step": 423 }, { "completion_length": 1651.6250610351562, "epoch": 0.4845714285714286, "grad_norm": 2.137979745864868, "kl": 0.3133392333984375, "learning_rate": 1.6186884885673413e-07, "loss": 0.0125, "reward": 0.17908507003448904, "reward_std": 0.8118347823619843, "rewards/cosine_scaled_reward": -0.165109351859428, "rewards/format_reward": 0.812500013038516, "step": 424 }, { "completion_length": 1094.2500381469727, "epoch": 0.4857142857142857, "grad_norm": 1.6124348640441895, "kl": 0.13685989379882812, "learning_rate": 1.6028856829700258e-07, "loss": 0.0055, "reward": 0.9797765575349331, "reward_std": 0.7934582978487015, "rewards/cosine_scaled_reward": 0.33494018763303757, "rewards/format_reward": 0.916666679084301, "step": 425 }, { "completion_length": 1124.4166831970215, "epoch": 0.4868571428571429, "grad_norm": 1.3242241144180298, "kl": 0.1932373046875, "learning_rate": 1.5872728172265146e-07, "loss": 0.0077, "reward": 0.3963227402418852, "reward_std": 0.6691669598221779, "rewards/cosine_scaled_reward": -0.06672874744981527, "rewards/format_reward": 0.9375, "step": 426 }, { "completion_length": 1525.520896911621, "epoch": 0.488, "grad_norm": 1.3827719688415527, "kl": 0.24054718017578125, "learning_rate": 1.5718506522858572e-07, "loss": 0.0096, "reward": 0.38459044555202127, "reward_std": 0.8512180484831333, "rewards/cosine_scaled_reward": -0.07769208890385926, "rewards/format_reward": 0.9166666865348816, "step": 427 }, { "completion_length": 1536.3542098999023, "epoch": 0.48914285714285716, "grad_norm": 1.885048270225525, "kl": 0.2565765380859375, "learning_rate": 1.5566199398026147e-07, "loss": 0.0103, "reward": 0.1980901760980487, "reward_std": 0.6689082272350788, "rewards/cosine_scaled_reward": -0.1701727721374482, "rewards/format_reward": 0.8750000074505806, "step": 428 }, { "completion_length": 803.3333587646484, "epoch": 0.49028571428571427, "grad_norm": 0.38952067494392395, "kl": 0.01854705810546875, "learning_rate": 1.5415814221002265e-07, "loss": 0.0007, "reward": 0.3733687801286578, "reward_std": 0.6376638635993004, "rewards/cosine_scaled_reward": -0.11222685221582651, "rewards/format_reward": 1.0, "step": 429 }, { "completion_length": 1186.7291984558105, "epoch": 0.49142857142857144, "grad_norm": 1.133076548576355, "kl": 0.19187164306640625, "learning_rate": 1.5267358321348285e-07, "loss": 0.0077, "reward": 0.529668789356947, "reward_std": 0.6670900508761406, "rewards/cosine_scaled_reward": 0.011455949861556292, "rewards/format_reward": 0.9583333358168602, "step": 430 }, { "completion_length": 1090.479190826416, "epoch": 0.49257142857142855, "grad_norm": 2.763601064682007, "kl": 0.3094367980957031, "learning_rate": 1.5120838934595337e-07, "loss": 0.0124, "reward": 0.4001455968245864, "reward_std": 0.6080403476953506, "rewards/cosine_scaled_reward": -0.03941826708614826, "rewards/format_reward": 0.8958333507180214, "step": 431 }, { "completion_length": 1532.8541870117188, "epoch": 0.4937142857142857, "grad_norm": 1.8087928295135498, "kl": 0.3389739990234375, "learning_rate": 1.4976263201891613e-07, "loss": 0.0135, "reward": 0.2658024498960003, "reward_std": 0.7025109715759754, "rewards/cosine_scaled_reward": -0.10354285500943661, "rewards/format_reward": 0.8333333432674408, "step": 432 }, { "completion_length": 1383.3542022705078, "epoch": 0.4948571428571429, "grad_norm": 1.609007477760315, "kl": 0.202850341796875, "learning_rate": 1.483363816965435e-07, "loss": 0.0081, "reward": 0.47822056571021676, "reward_std": 0.6413930989801884, "rewards/cosine_scaled_reward": -0.032901763916015625, "rewards/format_reward": 0.9791666716337204, "step": 433 }, { "completion_length": 1405.7500381469727, "epoch": 0.496, "grad_norm": 1.7864830493927002, "kl": 0.16643524169921875, "learning_rate": 1.469297078922642e-07, "loss": 0.0067, "reward": 0.09692841861397028, "reward_std": 0.5686304904520512, "rewards/cosine_scaled_reward": -0.22341302986023948, "rewards/format_reward": 0.8541666865348816, "step": 434 }, { "completion_length": 894.8541984558105, "epoch": 0.49714285714285716, "grad_norm": 1.2933272123336792, "kl": 0.20261383056640625, "learning_rate": 1.4554267916537495e-07, "loss": 0.0081, "reward": 0.3131554089486599, "reward_std": 0.5978329069912434, "rewards/cosine_scaled_reward": -0.12059530150145292, "rewards/format_reward": 0.9375000074505806, "step": 435 }, { "completion_length": 1181.6875343322754, "epoch": 0.4982857142857143, "grad_norm": 2.0181899070739746, "kl": 0.34568023681640625, "learning_rate": 1.4417536311769885e-07, "loss": 0.0138, "reward": 0.5736293898262375, "reward_std": 0.6847642697393894, "rewards/cosine_scaled_reward": 0.08606236800551414, "rewards/format_reward": 0.8750000149011612, "step": 436 }, { "completion_length": 1276.4791946411133, "epoch": 0.49942857142857144, "grad_norm": 2.564283847808838, "kl": 0.2025604248046875, "learning_rate": 1.4282782639029128e-07, "loss": 0.0081, "reward": 0.4678001292049885, "reward_std": 0.7040045224130154, "rewards/cosine_scaled_reward": -0.008855259045958519, "rewards/format_reward": 0.916666679084301, "step": 437 }, { "completion_length": 1573.0833587646484, "epoch": 0.5005714285714286, "grad_norm": 2.7367420196533203, "kl": 0.5104904174804688, "learning_rate": 1.4150013466019114e-07, "loss": 0.0204, "reward": 0.17007037345319986, "reward_std": 0.5956090353429317, "rewards/cosine_scaled_reward": -0.1865391266765073, "rewards/format_reward": 0.8750000055879354, "step": 438 }, { "completion_length": 1149.6875305175781, "epoch": 0.5017142857142857, "grad_norm": 1.5625011920928955, "kl": 0.2465057373046875, "learning_rate": 1.4019235263722034e-07, "loss": 0.0099, "reward": 0.29737665317952633, "reward_std": 0.6380414590239525, "rewards/cosine_scaled_reward": -0.1426175870001316, "rewards/format_reward": 0.9583333432674408, "step": 439 }, { "completion_length": 1237.354206085205, "epoch": 0.5028571428571429, "grad_norm": 5.108010768890381, "kl": 0.387542724609375, "learning_rate": 1.3890454406082956e-07, "loss": 0.0155, "reward": 0.17432179488241673, "reward_std": 0.6026643626391888, "rewards/cosine_scaled_reward": -0.20669769623782486, "rewards/format_reward": 0.9166666865348816, "step": 440 }, { "completion_length": 1345.958381652832, "epoch": 0.504, "grad_norm": 3.1474623680114746, "kl": 0.32805633544921875, "learning_rate": 1.3763677169699217e-07, "loss": 0.0131, "reward": 0.5082171498797834, "reward_std": 0.5840157195925713, "rewards/cosine_scaled_reward": 0.023840421810746193, "rewards/format_reward": 0.916666679084301, "step": 441 }, { "completion_length": 1032.1042022705078, "epoch": 0.5051428571428571, "grad_norm": 6.944667339324951, "kl": 0.20575714111328125, "learning_rate": 1.3638909733514452e-07, "loss": 0.0082, "reward": 0.43416818673722446, "reward_std": 0.6829791888594627, "rewards/cosine_scaled_reward": -0.0446524852886796, "rewards/format_reward": 0.9375000149011612, "step": 442 }, { "completion_length": 1498.2708587646484, "epoch": 0.5062857142857143, "grad_norm": 2.155977725982666, "kl": 0.36579132080078125, "learning_rate": 1.351615817851748e-07, "loss": 0.0147, "reward": 0.35938314939267, "reward_std": 0.6541690826416016, "rewards/cosine_scaled_reward": -0.07123458385467529, "rewards/format_reward": 0.8958333432674408, "step": 443 }, { "completion_length": 1179.6666870117188, "epoch": 0.5074285714285715, "grad_norm": 1.1800274848937988, "kl": 0.2142486572265625, "learning_rate": 1.3395428487445914e-07, "loss": 0.0086, "reward": 0.20739420503377914, "reward_std": 0.665216438472271, "rewards/cosine_scaled_reward": -0.16437909565865993, "rewards/format_reward": 0.8750000149011612, "step": 444 }, { "completion_length": 1355.6042098999023, "epoch": 0.5085714285714286, "grad_norm": 3.1479392051696777, "kl": 0.36163330078125, "learning_rate": 1.3276726544494571e-07, "loss": 0.0145, "reward": 0.3757004216313362, "reward_std": 0.7862649708986282, "rewards/cosine_scaled_reward": -0.08248981460928917, "rewards/format_reward": 0.9166666716337204, "step": 445 }, { "completion_length": 1242.4792175292969, "epoch": 0.5097142857142857, "grad_norm": 1.7266709804534912, "kl": 0.15660858154296875, "learning_rate": 1.316005813502869e-07, "loss": 0.0063, "reward": 0.41175389010459185, "reward_std": 0.6945022568106651, "rewards/cosine_scaled_reward": -0.06201653182506561, "rewards/format_reward": 0.9375000074505806, "step": 446 }, { "completion_length": 1084.6042098999023, "epoch": 0.5108571428571429, "grad_norm": 1.3974360227584839, "kl": 0.21521759033203125, "learning_rate": 1.3045428945301953e-07, "loss": 0.0086, "reward": 0.36824747081846, "reward_std": 0.5542174242436886, "rewards/cosine_scaled_reward": -0.06647356506437063, "rewards/format_reward": 0.916666679084301, "step": 447 }, { "completion_length": 1107.3125228881836, "epoch": 0.512, "grad_norm": 1.4814095497131348, "kl": 0.2049713134765625, "learning_rate": 1.2932844562179352e-07, "loss": 0.0082, "reward": 0.4160159872844815, "reward_std": 0.5563804637640715, "rewards/cosine_scaled_reward": -0.026282913982868195, "rewards/format_reward": 0.8958333507180214, "step": 448 }, { "completion_length": 1043.958351135254, "epoch": 0.5131428571428571, "grad_norm": 0.7390405535697937, "kl": 0.09143447875976562, "learning_rate": 1.2822310472864885e-07, "loss": 0.0037, "reward": 0.14539668832730968, "reward_std": 0.49666889011859894, "rewards/cosine_scaled_reward": -0.22875094041228294, "rewards/format_reward": 0.9375000149011612, "step": 449 }, { "completion_length": 1121.479175567627, "epoch": 0.5142857142857142, "grad_norm": 1.1870522499084473, "kl": 0.17742919921875, "learning_rate": 1.2713832064634125e-07, "loss": 0.0071, "reward": 0.31525158043950796, "reward_std": 0.5219849050045013, "rewards/cosine_scaled_reward": -0.11353826522827148, "rewards/format_reward": 0.9375000074505806, "step": 450 }, { "completion_length": 1166.9583587646484, "epoch": 0.5154285714285715, "grad_norm": 1.581584095954895, "kl": 0.31623077392578125, "learning_rate": 1.260741462457165e-07, "loss": 0.0127, "reward": 0.34822911536321044, "reward_std": 0.6658046580851078, "rewards/cosine_scaled_reward": -0.1033438453450799, "rewards/format_reward": 0.9375000149011612, "step": 451 }, { "completion_length": 1243.687515258789, "epoch": 0.5165714285714286, "grad_norm": 2.1429476737976074, "kl": 0.2833709716796875, "learning_rate": 1.2503063339313356e-07, "loss": 0.0113, "reward": 0.6556259745266289, "reward_std": 0.8615197539329529, "rewards/cosine_scaled_reward": 0.09724390879273415, "rewards/format_reward": 0.9375000149011612, "step": 452 }, { "completion_length": 1214.7708740234375, "epoch": 0.5177142857142857, "grad_norm": 0.8739696741104126, "kl": 0.06499481201171875, "learning_rate": 1.2400783294793668e-07, "loss": 0.0026, "reward": 0.3872159831225872, "reward_std": 0.7029216475784779, "rewards/cosine_scaled_reward": -0.08512475527822971, "rewards/format_reward": 0.9583333432674408, "step": 453 }, { "completion_length": 1117.5417098999023, "epoch": 0.5188571428571429, "grad_norm": 2.8257322311401367, "kl": 0.198486328125, "learning_rate": 1.2300579475997657e-07, "loss": 0.0079, "reward": 0.3443953925743699, "reward_std": 0.6449627317488194, "rewards/cosine_scaled_reward": -0.09044338436797261, "rewards/format_reward": 0.9166666716337204, "step": 454 }, { "completion_length": 1421.6042022705078, "epoch": 0.52, "grad_norm": 1.7228955030441284, "kl": 0.4829292297363281, "learning_rate": 1.220245676671809e-07, "loss": 0.0193, "reward": 0.14169234223663807, "reward_std": 0.591641578823328, "rewards/cosine_scaled_reward": -0.18374849221436307, "rewards/format_reward": 0.8333333414047956, "step": 455 }, { "completion_length": 1353.6250267028809, "epoch": 0.5211428571428571, "grad_norm": 1.9811756610870361, "kl": 0.2818756103515625, "learning_rate": 1.2106419949317388e-07, "loss": 0.0113, "reward": 0.26744416062138043, "reward_std": 0.691377304494381, "rewards/cosine_scaled_reward": -0.15542185143567622, "rewards/format_reward": 0.9375000149011612, "step": 456 }, { "completion_length": 1305.8125305175781, "epoch": 0.5222857142857142, "grad_norm": 2.8912620544433594, "kl": 0.4429473876953125, "learning_rate": 1.2012473704494537e-07, "loss": 0.0177, "reward": 0.33940141764469445, "reward_std": 0.8498760014772415, "rewards/cosine_scaled_reward": -0.04779240628704429, "rewards/format_reward": 0.7916666753590107, "step": 457 }, { "completion_length": 1106.770866394043, "epoch": 0.5234285714285715, "grad_norm": 1.6600767374038696, "kl": 0.252227783203125, "learning_rate": 1.1920622611056974e-07, "loss": 0.0101, "reward": 0.3232785561122, "reward_std": 0.5980268009006977, "rewards/cosine_scaled_reward": -0.10486513609066606, "rewards/format_reward": 0.9166666716337204, "step": 458 }, { "completion_length": 1032.0625228881836, "epoch": 0.5245714285714286, "grad_norm": 1.426925778388977, "kl": 0.23378753662109375, "learning_rate": 1.1830871145697412e-07, "loss": 0.0094, "reward": 0.5453407493187115, "reward_std": 0.7394749782979488, "rewards/cosine_scaled_reward": 0.02821425348520279, "rewards/format_reward": 0.9375000074505806, "step": 459 }, { "completion_length": 1541.208366394043, "epoch": 0.5257142857142857, "grad_norm": 1.5413358211517334, "kl": 0.431365966796875, "learning_rate": 1.1743223682775649e-07, "loss": 0.0172, "reward": 0.19518441951368004, "reward_std": 0.6402174085378647, "rewards/cosine_scaled_reward": -0.1528447875753045, "rewards/format_reward": 0.8333333395421505, "step": 460 }, { "completion_length": 1445.0000610351562, "epoch": 0.5268571428571428, "grad_norm": 1.153205156326294, "kl": 0.4829826354980469, "learning_rate": 1.1657684494105386e-07, "loss": 0.0194, "reward": 0.6377177089452744, "reward_std": 1.0267005078494549, "rewards/cosine_scaled_reward": 0.11436054221121594, "rewards/format_reward": 0.8541666772216558, "step": 461 }, { "completion_length": 989.8333511352539, "epoch": 0.528, "grad_norm": 0.7328640818595886, "kl": 0.04534912109375, "learning_rate": 1.1574257748745986e-07, "loss": 0.0018, "reward": 0.2644703108817339, "reward_std": 0.6658349372446537, "rewards/cosine_scaled_reward": -0.180008752271533, "rewards/format_reward": 0.9791666716337204, "step": 462 }, { "completion_length": 1494.3125457763672, "epoch": 0.5291428571428571, "grad_norm": 2.0038442611694336, "kl": 0.386627197265625, "learning_rate": 1.1492947512799328e-07, "loss": 0.0155, "reward": 0.5318789854645729, "reward_std": 0.9075686037540436, "rewards/cosine_scaled_reward": 0.03875172859989107, "rewards/format_reward": 0.8750000074505806, "step": 463 }, { "completion_length": 992.7500381469727, "epoch": 0.5302857142857142, "grad_norm": 1.684259295463562, "kl": 0.20226287841796875, "learning_rate": 1.1413757749211602e-07, "loss": 0.0081, "reward": 0.7598909301450476, "reward_std": 0.5075674168765545, "rewards/cosine_scaled_reward": 0.17828124994412065, "rewards/format_reward": 0.9583333358168602, "step": 464 }, { "completion_length": 1197.208366394043, "epoch": 0.5314285714285715, "grad_norm": 2.0846643447875977, "kl": 0.156219482421875, "learning_rate": 1.1336692317580158e-07, "loss": 0.0063, "reward": 0.36710425605997443, "reward_std": 0.7569303959608078, "rewards/cosine_scaled_reward": -0.09665499581024051, "rewards/format_reward": 0.9375000149011612, "step": 465 }, { "completion_length": 1219.9375381469727, "epoch": 0.5325714285714286, "grad_norm": 1.655044436454773, "kl": 0.1450958251953125, "learning_rate": 1.1261754973965422e-07, "loss": 0.0058, "reward": 0.589016193524003, "reward_std": 0.6661927625536919, "rewards/cosine_scaled_reward": 0.03179914876818657, "rewards/format_reward": 1.0, "step": 466 }, { "completion_length": 1338.2292137145996, "epoch": 0.5337142857142857, "grad_norm": 1.6192094087600708, "kl": 0.24442291259765625, "learning_rate": 1.1188949370707787e-07, "loss": 0.0098, "reward": 0.32355972472578287, "reward_std": 0.788430068641901, "rewards/cosine_scaled_reward": -0.11671263433527201, "rewards/format_reward": 0.9166666716337204, "step": 467 }, { "completion_length": 1452.9791870117188, "epoch": 0.5348571428571428, "grad_norm": 2.234537124633789, "kl": 0.47918701171875, "learning_rate": 1.1118279056249653e-07, "loss": 0.0191, "reward": 0.42969178780913353, "reward_std": 0.7676932998001575, "rewards/cosine_scaled_reward": -0.04107399005442858, "rewards/format_reward": 0.9166666865348816, "step": 468 }, { "completion_length": 1134.1250114440918, "epoch": 0.536, "grad_norm": 3.1756770610809326, "kl": 0.333251953125, "learning_rate": 1.1049747474962444e-07, "loss": 0.0133, "reward": 0.36575854755938053, "reward_std": 0.7719821371138096, "rewards/cosine_scaled_reward": -0.0800532667490188, "rewards/format_reward": 0.8958333507180214, "step": 469 }, { "completion_length": 1417.6042175292969, "epoch": 0.5371428571428571, "grad_norm": 1.5227336883544922, "kl": 0.470703125, "learning_rate": 1.0983357966978745e-07, "loss": 0.0188, "reward": 0.3243638591375202, "reward_std": 0.8565945476293564, "rewards/cosine_scaled_reward": -0.0990256522782147, "rewards/format_reward": 0.8750000223517418, "step": 470 }, { "completion_length": 1389.9167098999023, "epoch": 0.5382857142857143, "grad_norm": 0.8987606167793274, "kl": 0.13502120971679688, "learning_rate": 1.0919113768029517e-07, "loss": 0.0054, "reward": 0.5736090987920761, "reward_std": 0.7181516140699387, "rewards/cosine_scaled_reward": 0.048229770036414266, "rewards/format_reward": 0.9375000074505806, "step": 471 }, { "completion_length": 1417.3125305175781, "epoch": 0.5394285714285715, "grad_norm": 1.6695373058319092, "kl": 0.477447509765625, "learning_rate": 1.0857018009286381e-07, "loss": 0.019, "reward": 0.2978296782821417, "reward_std": 0.8513829745352268, "rewards/cosine_scaled_reward": -0.10584009531885386, "rewards/format_reward": 0.8541666753590107, "step": 472 }, { "completion_length": 1296.083351135254, "epoch": 0.5405714285714286, "grad_norm": 1.4655067920684814, "kl": 0.205474853515625, "learning_rate": 1.0797073717209013e-07, "loss": 0.0082, "reward": 0.1657012979267165, "reward_std": 0.5747975409030914, "rewards/cosine_scaled_reward": -0.20028305146843195, "rewards/format_reward": 0.8958333358168602, "step": 473 }, { "completion_length": 1289.0833778381348, "epoch": 0.5417142857142857, "grad_norm": 2.321017026901245, "kl": 0.4053497314453125, "learning_rate": 1.0739283813397639e-07, "loss": 0.0162, "reward": 0.743865036405623, "reward_std": 0.733939703553915, "rewards/cosine_scaled_reward": 0.17521634395234287, "rewards/format_reward": 0.916666679084301, "step": 474 }, { "completion_length": 1455.7500228881836, "epoch": 0.5428571428571428, "grad_norm": 1.4909836053848267, "kl": 0.37940216064453125, "learning_rate": 1.068365111445064e-07, "loss": 0.0152, "reward": 0.3271640567108989, "reward_std": 0.7808557823300362, "rewards/cosine_scaled_reward": -0.08086569933220744, "rewards/format_reward": 0.8541666679084301, "step": 475 }, { "completion_length": 1167.2291793823242, "epoch": 0.544, "grad_norm": 2.0223841667175293, "kl": 0.2843170166015625, "learning_rate": 1.063017833182728e-07, "loss": 0.0114, "reward": 0.5520191243849695, "reward_std": 0.9013429544866085, "rewards/cosine_scaled_reward": 0.02163805003510788, "rewards/format_reward": 0.9375000074505806, "step": 476 }, { "completion_length": 1188.1041946411133, "epoch": 0.5451428571428572, "grad_norm": 0.9858556985855103, "kl": 0.3222007751464844, "learning_rate": 1.0578868071715544e-07, "loss": 0.0129, "reward": 0.6779396012425423, "reward_std": 0.7415244840085506, "rewards/cosine_scaled_reward": 0.11190658155828714, "rewards/format_reward": 0.9583333432674408, "step": 477 }, { "completion_length": 1321.9375381469727, "epoch": 0.5462857142857143, "grad_norm": 1.826804280281067, "kl": 0.29107666015625, "learning_rate": 1.0529722834905125e-07, "loss": 0.0116, "reward": 0.23478064546361566, "reward_std": 0.6792666539549828, "rewards/cosine_scaled_reward": -0.1663953149691224, "rewards/format_reward": 0.9166666865348816, "step": 478 }, { "completion_length": 1477.6041946411133, "epoch": 0.5474285714285714, "grad_norm": 2.102496862411499, "kl": 0.60791015625, "learning_rate": 1.0482745016665526e-07, "loss": 0.0243, "reward": 0.5314750894904137, "reward_std": 0.8037730753421783, "rewards/cosine_scaled_reward": 0.02458848152309656, "rewards/format_reward": 0.9166666716337204, "step": 479 }, { "completion_length": 1360.0208587646484, "epoch": 0.5485714285714286, "grad_norm": 1.7093435525894165, "kl": 0.4275360107421875, "learning_rate": 1.0437936906629334e-07, "loss": 0.0171, "reward": 0.3702342016622424, "reward_std": 0.6831196062266827, "rewards/cosine_scaled_reward": -0.07479118811897933, "rewards/format_reward": 0.9166666716337204, "step": 480 }, { "completion_length": 1480.3750534057617, "epoch": 0.5497142857142857, "grad_norm": 2.413257122039795, "kl": 0.6222076416015625, "learning_rate": 1.0395300688680625e-07, "loss": 0.0249, "reward": 0.1323686043615453, "reward_std": 0.6688967496156693, "rewards/cosine_scaled_reward": -0.17295160831417888, "rewards/format_reward": 0.7916666753590107, "step": 481 }, { "completion_length": 1245.3750381469727, "epoch": 0.5508571428571428, "grad_norm": 1.1108511686325073, "kl": 0.3742408752441406, "learning_rate": 1.0354838440848501e-07, "loss": 0.015, "reward": 0.5778923179022968, "reward_std": 0.7943699173629284, "rewards/cosine_scaled_reward": 0.05815020017325878, "rewards/format_reward": 0.9166666865348816, "step": 482 }, { "completion_length": 1345.7917022705078, "epoch": 0.552, "grad_norm": 3.296278238296509, "kl": 0.32867431640625, "learning_rate": 1.0316552135205837e-07, "loss": 0.0131, "reward": 0.4724856864195317, "reward_std": 0.7850163578987122, "rewards/cosine_scaled_reward": 0.012874770443886518, "rewards/format_reward": 0.8750000149011612, "step": 483 }, { "completion_length": 1077.2083625793457, "epoch": 0.5531428571428572, "grad_norm": 1.8179315328598022, "kl": 0.19476699829101562, "learning_rate": 1.0280443637773163e-07, "loss": 0.0078, "reward": 0.460749551653862, "reward_std": 0.6464533470571041, "rewards/cosine_scaled_reward": -0.013641191995702684, "rewards/format_reward": 0.9166666716337204, "step": 484 }, { "completion_length": 1128.0208740234375, "epoch": 0.5542857142857143, "grad_norm": 3.131510019302368, "kl": 0.27252197265625, "learning_rate": 1.0246514708427701e-07, "loss": 0.0109, "reward": 0.29226525872945786, "reward_std": 0.67261578515172, "rewards/cosine_scaled_reward": -0.1184451412409544, "rewards/format_reward": 0.8958333432674408, "step": 485 }, { "completion_length": 904.479190826416, "epoch": 0.5554285714285714, "grad_norm": 2.3228588104248047, "kl": 0.23955535888671875, "learning_rate": 1.0214767000817596e-07, "loss": 0.0096, "reward": 0.4756124352570623, "reward_std": 0.6488616764545441, "rewards/cosine_scaled_reward": -0.003937863744795322, "rewards/format_reward": 0.9166666865348816, "step": 486 }, { "completion_length": 1126.666690826416, "epoch": 0.5565714285714286, "grad_norm": 1.264155626296997, "kl": 0.18535614013671875, "learning_rate": 1.0185202062281336e-07, "loss": 0.0074, "reward": 0.8187248446047306, "reward_std": 0.7255717366933823, "rewards/cosine_scaled_reward": 0.23560354206711054, "rewards/format_reward": 0.8958333432674408, "step": 487 }, { "completion_length": 975.666690826416, "epoch": 0.5577142857142857, "grad_norm": 1.1237510442733765, "kl": 0.089996337890625, "learning_rate": 1.0157821333772304e-07, "loss": 0.0036, "reward": 0.3682096116244793, "reward_std": 0.5310206785798073, "rewards/cosine_scaled_reward": -0.08794048149138689, "rewards/format_reward": 0.9583333432674408, "step": 488 }, { "completion_length": 1439.4167289733887, "epoch": 0.5588571428571428, "grad_norm": 1.723633050918579, "kl": 0.398651123046875, "learning_rate": 1.013262614978859e-07, "loss": 0.016, "reward": 0.12414154410362244, "reward_std": 0.5313785709440708, "rewards/cosine_scaled_reward": -0.21272556111216545, "rewards/format_reward": 0.8750000298023224, "step": 489 }, { "completion_length": 1332.0833740234375, "epoch": 0.56, "grad_norm": 1.4947898387908936, "kl": 0.33425140380859375, "learning_rate": 1.0109617738307911e-07, "loss": 0.0134, "reward": 0.318634120747447, "reward_std": 0.5996614284813404, "rewards/cosine_scaled_reward": -0.1064935065805912, "rewards/format_reward": 0.9166666716337204, "step": 490 }, { "completion_length": 1447.0833587646484, "epoch": 0.5611428571428572, "grad_norm": 2.106166362762451, "kl": 0.3012504577636719, "learning_rate": 1.0088797220727779e-07, "loss": 0.0121, "reward": 0.4496698835864663, "reward_std": 0.8973959572613239, "rewards/cosine_scaled_reward": -0.014357679523527622, "rewards/format_reward": 0.8750000149011612, "step": 491 }, { "completion_length": 1104.6041831970215, "epoch": 0.5622857142857143, "grad_norm": 2.4530482292175293, "kl": 0.1375885009765625, "learning_rate": 1.0070165611810855e-07, "loss": 0.0055, "reward": 0.3822294343262911, "reward_std": 0.6128358133137226, "rewards/cosine_scaled_reward": -0.07346190325915813, "rewards/format_reward": 0.9375000074505806, "step": 492 }, { "completion_length": 1073.3541793823242, "epoch": 0.5634285714285714, "grad_norm": 1.1550400257110596, "kl": 0.18117523193359375, "learning_rate": 1.005372381963547e-07, "loss": 0.0072, "reward": 0.4816288612782955, "reward_std": 0.8117332980036736, "rewards/cosine_scaled_reward": -0.030845604138448834, "rewards/format_reward": 0.9583333432674408, "step": 493 }, { "completion_length": 1056.62504196167, "epoch": 0.5645714285714286, "grad_norm": 0.7965182662010193, "kl": 0.1199188232421875, "learning_rate": 1.0039472645551372e-07, "loss": 0.0048, "reward": 0.3902022670954466, "reward_std": 0.7675591520965099, "rewards/cosine_scaled_reward": -0.09161333832889795, "rewards/format_reward": 0.9583333358168602, "step": 494 }, { "completion_length": 1542.6042175292969, "epoch": 0.5657142857142857, "grad_norm": 2.6710989475250244, "kl": 0.34729766845703125, "learning_rate": 1.002741278414069e-07, "loss": 0.0139, "reward": 0.5254024369642138, "reward_std": 0.9325296804308891, "rewards/cosine_scaled_reward": 0.017028740607202053, "rewards/format_reward": 0.916666679084301, "step": 495 }, { "completion_length": 1316.7292022705078, "epoch": 0.5668571428571428, "grad_norm": 1.4742833375930786, "kl": 0.5881500244140625, "learning_rate": 1.0017544823184055e-07, "loss": 0.0235, "reward": 0.619772095582448, "reward_std": 0.8898240327835083, "rewards/cosine_scaled_reward": 0.13812840729951859, "rewards/format_reward": 0.7916666846722364, "step": 496 }, { "completion_length": 1185.708351135254, "epoch": 0.568, "grad_norm": 1.6387114524841309, "kl": 0.5859603881835938, "learning_rate": 1.0009869243631952e-07, "loss": 0.0235, "reward": 0.7584062092937529, "reward_std": 0.6907122731208801, "rewards/cosine_scaled_reward": 0.20853716135025024, "rewards/format_reward": 0.8750000111758709, "step": 497 }, { "completion_length": 1406.3958587646484, "epoch": 0.5691428571428572, "grad_norm": 2.6451492309570312, "kl": 0.383056640625, "learning_rate": 1.000438641958131e-07, "loss": 0.0153, "reward": 0.33473338062322, "reward_std": 0.7394461110234261, "rewards/cosine_scaled_reward": -0.10398351773619652, "rewards/format_reward": 0.916666679084301, "step": 498 }, { "completion_length": 1517.770881652832, "epoch": 0.5702857142857143, "grad_norm": 2.2294161319732666, "kl": 0.6592826843261719, "learning_rate": 1.0001096618257236e-07, "loss": 0.0264, "reward": 0.40126605180557817, "reward_std": 0.7697222679853439, "rewards/cosine_scaled_reward": -0.006456421993789263, "rewards/format_reward": 0.8125000186264515, "step": 499 }, { "completion_length": 1280.7500305175781, "epoch": 0.5714285714285714, "grad_norm": 2.027796983718872, "kl": 0.46686553955078125, "learning_rate": 1e-07, "loss": 0.0186, "reward": 0.2269948897883296, "reward_std": 0.7286034636199474, "rewards/cosine_scaled_reward": -0.13150667655281723, "rewards/format_reward": 0.8333333507180214, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.0027833052817963252, "train_runtime": 55581.6431, "train_samples_per_second": 0.432, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }