| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.4285408185129634, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1990.0278625488281, | |
| "epoch": 0.0008570816370259267, | |
| "grad_norm": 0.7143716812133789, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": 0.0768, | |
| "reward": 0.7364962100982666, | |
| "reward_std": 0.8344592750072479, | |
| "rewards/cosine_scaled_reward": 0.3335258811712265, | |
| "rewards/format_reward": 0.0694444477558136, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2242.5694885253906, | |
| "epoch": 0.0017141632740518534, | |
| "grad_norm": 0.3701298236846924, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0188, | |
| "reward": 0.2869503181427717, | |
| "reward_std": 0.7584073394536972, | |
| "rewards/cosine_scaled_reward": 0.10180849023163319, | |
| "rewards/format_reward": 0.08333333488553762, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2240.2083740234375, | |
| "epoch": 0.0025712449110777804, | |
| "grad_norm": 0.289852112531662, | |
| "kl": -4.544854164123535e-06, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0887, | |
| "reward": 0.19659454189240932, | |
| "reward_std": 0.6517409235239029, | |
| "rewards/cosine_scaled_reward": 0.04274171104407287, | |
| "rewards/format_reward": 0.11111111287027597, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1926.1667175292969, | |
| "epoch": 0.0034283265481037067, | |
| "grad_norm": 1.4475386142730713, | |
| "kl": 5.131587386131287e-06, | |
| "learning_rate": 8e-08, | |
| "loss": 0.1401, | |
| "reward": 0.6266543348319829, | |
| "reward_std": 0.7619214951992035, | |
| "rewards/cosine_scaled_reward": 0.25082714576274157, | |
| "rewards/format_reward": 0.12500000186264515, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2068.638916015625, | |
| "epoch": 0.004285408185129634, | |
| "grad_norm": 0.492279052734375, | |
| "kl": -6.013549864292145e-06, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0131, | |
| "reward": 0.8806147426366806, | |
| "reward_std": 0.7828683108091354, | |
| "rewards/cosine_scaled_reward": 0.3500296138226986, | |
| "rewards/format_reward": 0.18055556155741215, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2029.3888854980469, | |
| "epoch": 0.005142489822155561, | |
| "grad_norm": 0.5861327052116394, | |
| "kl": 4.366040229797363e-06, | |
| "learning_rate": 1.2e-07, | |
| "loss": -0.0572, | |
| "reward": 0.7398289144039154, | |
| "reward_std": 0.8924512416124344, | |
| "rewards/cosine_scaled_reward": 0.2796366587281227, | |
| "rewards/format_reward": 0.18055556062608957, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1942.5416564941406, | |
| "epoch": 0.005999571459181487, | |
| "grad_norm": 1.1912319660186768, | |
| "kl": 4.488509148359299e-06, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0059, | |
| "reward": 0.23400097712874413, | |
| "reward_std": 0.7279467135667801, | |
| "rewards/cosine_scaled_reward": 0.07533382624387741, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2180.8750610351562, | |
| "epoch": 0.0068566530962074134, | |
| "grad_norm": 0.6393517851829529, | |
| "kl": -4.702596925199032e-06, | |
| "learning_rate": 1.6e-07, | |
| "loss": -0.0338, | |
| "reward": 0.6042829677462578, | |
| "reward_std": 0.8216739147901535, | |
| "rewards/cosine_scaled_reward": 0.26047481037676334, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2086.1944885253906, | |
| "epoch": 0.00771373473323334, | |
| "grad_norm": 0.8930106163024902, | |
| "kl": 5.990266799926758e-06, | |
| "learning_rate": 1.8e-07, | |
| "loss": -0.1895, | |
| "reward": 0.5918732397258282, | |
| "reward_std": 0.6417871415615082, | |
| "rewards/cosine_scaled_reward": 0.2264921732712537, | |
| "rewards/format_reward": 0.13888888992369175, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2342.5416870117188, | |
| "epoch": 0.008570816370259268, | |
| "grad_norm": 0.20637141168117523, | |
| "kl": 1.8426217138767242e-06, | |
| "learning_rate": 2e-07, | |
| "loss": -0.0414, | |
| "reward": 0.5427078725770116, | |
| "reward_std": 0.6480782926082611, | |
| "rewards/cosine_scaled_reward": 0.22968729073181748, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2251.083282470703, | |
| "epoch": 0.009427898007285194, | |
| "grad_norm": 0.13938356935977936, | |
| "kl": -3.427267074584961e-07, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": -0.0399, | |
| "reward": 0.4674246795475483, | |
| "reward_std": 0.9071184694766998, | |
| "rewards/cosine_scaled_reward": 0.20593456458300352, | |
| "rewards/format_reward": 0.0555555559694767, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2086.4306030273438, | |
| "epoch": 0.010284979644311121, | |
| "grad_norm": 0.17967204749584198, | |
| "kl": 4.600733518600464e-07, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.1108, | |
| "reward": 0.7090050615370274, | |
| "reward_std": 0.9076867997646332, | |
| "rewards/cosine_scaled_reward": 0.31978030502796173, | |
| "rewards/format_reward": 0.06944444589316845, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2177.7500610351562, | |
| "epoch": 0.011142061281337047, | |
| "grad_norm": 0.6705565452575684, | |
| "kl": -9.059906005859375e-06, | |
| "learning_rate": 2.6e-07, | |
| "loss": -0.107, | |
| "reward": 0.31611130852252245, | |
| "reward_std": 0.700124979019165, | |
| "rewards/cosine_scaled_reward": 0.10250008956063539, | |
| "rewards/format_reward": 0.1111111119389534, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1980.0971984863281, | |
| "epoch": 0.011999142918362973, | |
| "grad_norm": 0.4753912091255188, | |
| "kl": 1.8067657947540283e-06, | |
| "learning_rate": 2.8e-07, | |
| "loss": -0.1518, | |
| "reward": 0.5453062728047371, | |
| "reward_std": 0.7382813096046448, | |
| "rewards/cosine_scaled_reward": 0.21709759440273046, | |
| "rewards/format_reward": 0.1111111119389534, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2061.277801513672, | |
| "epoch": 0.012856224555388901, | |
| "grad_norm": 1.0317578315734863, | |
| "kl": -4.398170858621597e-06, | |
| "learning_rate": 3e-07, | |
| "loss": 0.1252, | |
| "reward": 0.7395287081599236, | |
| "reward_std": 0.9806090295314789, | |
| "rewards/cosine_scaled_reward": 0.3350421413779259, | |
| "rewards/format_reward": 0.06944444589316845, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1978.9167175292969, | |
| "epoch": 0.013713306192414827, | |
| "grad_norm": 0.8236006498336792, | |
| "kl": 9.63360071182251e-06, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0218, | |
| "reward": 0.6508458182215691, | |
| "reward_std": 0.8475509732961655, | |
| "rewards/cosine_scaled_reward": 0.2768117773812264, | |
| "rewards/format_reward": 0.0972222238779068, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1865.6111602783203, | |
| "epoch": 0.014570387829440755, | |
| "grad_norm": 0.5406652092933655, | |
| "kl": 7.767230272293091e-07, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0741, | |
| "reward": 0.8166351541876793, | |
| "reward_std": 0.9249791204929352, | |
| "rewards/cosine_scaled_reward": 0.33887312887236476, | |
| "rewards/format_reward": 0.13888889085501432, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2117.500030517578, | |
| "epoch": 0.01542746946646668, | |
| "grad_norm": 0.34515851736068726, | |
| "kl": 2.3096799850463867e-06, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.1115, | |
| "reward": 0.7321371585130692, | |
| "reward_std": 0.6702713221311569, | |
| "rewards/cosine_scaled_reward": 0.3105130400508642, | |
| "rewards/format_reward": 0.11111111473292112, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1973.0972290039062, | |
| "epoch": 0.016284551103492608, | |
| "grad_norm": 1.3012654781341553, | |
| "kl": 4.159286618232727e-06, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.1323, | |
| "reward": 0.28933676797896624, | |
| "reward_std": 0.875898152589798, | |
| "rewards/cosine_scaled_reward": 0.10994616383686662, | |
| "rewards/format_reward": 0.06944444496184587, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2246.500030517578, | |
| "epoch": 0.017141632740518536, | |
| "grad_norm": 0.23211120069026947, | |
| "kl": -1.0263174772262573e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0164, | |
| "reward": 0.12983610574156046, | |
| "reward_std": 0.7194785475730896, | |
| "rewards/cosine_scaled_reward": -0.004526391625404358, | |
| "rewards/format_reward": 0.1388888917863369, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1989.7916870117188, | |
| "epoch": 0.01799871437754446, | |
| "grad_norm": 0.2483261376619339, | |
| "kl": -4.772096872329712e-06, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0647, | |
| "reward": 0.2600990349892527, | |
| "reward_std": 0.7444702684879303, | |
| "rewards/cosine_scaled_reward": 0.06754951924085617, | |
| "rewards/format_reward": 0.12500000279396772, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2344.8472900390625, | |
| "epoch": 0.018855796014570388, | |
| "grad_norm": 0.531119704246521, | |
| "kl": -5.930662155151367e-06, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0059, | |
| "reward": 0.31322263344191015, | |
| "reward_std": 0.7910206019878387, | |
| "rewards/cosine_scaled_reward": 0.10105575760826468, | |
| "rewards/format_reward": 0.11111111287027597, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2217.40283203125, | |
| "epoch": 0.019712877651596315, | |
| "grad_norm": 0.4045103192329407, | |
| "kl": -9.08970832824707e-06, | |
| "learning_rate": 4.6e-07, | |
| "loss": -0.0425, | |
| "reward": 0.28447722643613815, | |
| "reward_std": 0.7937969118356705, | |
| "rewards/cosine_scaled_reward": 0.12140527740120888, | |
| "rewards/format_reward": 0.041666666977107525, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1954.6805725097656, | |
| "epoch": 0.020569959288622243, | |
| "grad_norm": 1.1069881916046143, | |
| "kl": 3.439374268054962e-06, | |
| "learning_rate": 4.8e-07, | |
| "loss": -0.04, | |
| "reward": 0.61127008497715, | |
| "reward_std": 0.621691383421421, | |
| "rewards/cosine_scaled_reward": 0.22924613999202847, | |
| "rewards/format_reward": 0.15277778077870607, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1864.0972595214844, | |
| "epoch": 0.021427040925648167, | |
| "grad_norm": 0.13546203076839447, | |
| "kl": 1.4692544937133789e-05, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0367, | |
| "reward": 0.8168461695313454, | |
| "reward_std": 0.8645190075039864, | |
| "rewards/cosine_scaled_reward": 0.33897863049060106, | |
| "rewards/format_reward": 0.1388888917863369, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2138.2638549804688, | |
| "epoch": 0.022284122562674095, | |
| "grad_norm": 0.23594234883785248, | |
| "kl": -1.7136335372924805e-06, | |
| "learning_rate": 5.2e-07, | |
| "loss": -0.0366, | |
| "reward": 0.2813246757723391, | |
| "reward_std": 0.6457278877496719, | |
| "rewards/cosine_scaled_reward": 0.09899566043168306, | |
| "rewards/format_reward": 0.08333333488553762, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2089.0556030273438, | |
| "epoch": 0.023141204199700022, | |
| "grad_norm": 1.0738513469696045, | |
| "kl": 5.405396223068237e-06, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.1218, | |
| "reward": 0.6343957483768463, | |
| "reward_std": 0.7526431530714035, | |
| "rewards/cosine_scaled_reward": 0.24775342142675072, | |
| "rewards/format_reward": 0.1388888917863369, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2116.638916015625, | |
| "epoch": 0.023998285836725947, | |
| "grad_norm": 1.4345226287841797, | |
| "kl": 3.216089680790901e-06, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0098, | |
| "reward": 0.5014659259468317, | |
| "reward_std": 0.8205202594399452, | |
| "rewards/cosine_scaled_reward": 0.20212182961404324, | |
| "rewards/format_reward": 0.09722222294658422, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1885.0278015136719, | |
| "epoch": 0.024855367473751874, | |
| "grad_norm": 0.29304057359695435, | |
| "kl": 4.43682074546814e-06, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0183, | |
| "reward": 0.4122583381831646, | |
| "reward_std": 0.7856339067220688, | |
| "rewards/cosine_scaled_reward": 0.1644625086337328, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2169.583282470703, | |
| "epoch": 0.025712449110777802, | |
| "grad_norm": 0.21029269695281982, | |
| "kl": 1.8927734345197678e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0248, | |
| "reward": 0.484335083514452, | |
| "reward_std": 0.7596095502376556, | |
| "rewards/cosine_scaled_reward": 0.18661198392510414, | |
| "rewards/format_reward": 0.11111111380159855, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1893.3333129882812, | |
| "epoch": 0.02656953074780373, | |
| "grad_norm": 0.18814511597156525, | |
| "kl": 0.00010308623313903809, | |
| "learning_rate": 6.2e-07, | |
| "loss": -0.0622, | |
| "reward": 0.408107977360487, | |
| "reward_std": 0.7381277531385422, | |
| "rewards/cosine_scaled_reward": 0.14849842991679907, | |
| "rewards/format_reward": 0.11111111380159855, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2393.0694580078125, | |
| "epoch": 0.027426612384829654, | |
| "grad_norm": 0.4075649380683899, | |
| "kl": 3.24249267578125e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": -0.1087, | |
| "reward": 0.22196191549301147, | |
| "reward_std": 0.7661005556583405, | |
| "rewards/cosine_scaled_reward": 0.06236985884606838, | |
| "rewards/format_reward": 0.09722222294658422, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2272.3194580078125, | |
| "epoch": 0.02828369402185558, | |
| "grad_norm": 0.21960391104221344, | |
| "kl": 7.766485214233398e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0847, | |
| "reward": 0.4620617777109146, | |
| "reward_std": 0.8374988958239555, | |
| "rewards/cosine_scaled_reward": 0.1268642134964466, | |
| "rewards/format_reward": 0.20833334047347307, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2146.1527709960938, | |
| "epoch": 0.02914077565888151, | |
| "grad_norm": 0.21705187857151031, | |
| "kl": 0.00015848875045776367, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0208, | |
| "reward": 0.228340620175004, | |
| "reward_std": 0.7516879141330719, | |
| "rewards/cosine_scaled_reward": 0.09333697834517807, | |
| "rewards/format_reward": 0.041666666977107525, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2046.4166870117188, | |
| "epoch": 0.029997857295907437, | |
| "grad_norm": 0.32378724217414856, | |
| "kl": 7.605552673339844e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0799, | |
| "reward": 0.47813151963055134, | |
| "reward_std": 0.6827547252178192, | |
| "rewards/cosine_scaled_reward": 0.19739909376949072, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2243.15283203125, | |
| "epoch": 0.03085493893293336, | |
| "grad_norm": 0.3061051070690155, | |
| "kl": 0.0001518726348876953, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0024, | |
| "reward": 0.5015687884879299, | |
| "reward_std": 0.8517486453056335, | |
| "rewards/cosine_scaled_reward": 0.21606217822409235, | |
| "rewards/format_reward": 0.06944444589316845, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2279.791717529297, | |
| "epoch": 0.03171202056995929, | |
| "grad_norm": 0.6476448178291321, | |
| "kl": 0.0002086162567138672, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.1123, | |
| "reward": 0.4819689504802227, | |
| "reward_std": 0.8821997940540314, | |
| "rewards/cosine_scaled_reward": 0.1715400367975235, | |
| "rewards/format_reward": 0.13888889085501432, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2446.9166870117188, | |
| "epoch": 0.032569102206985216, | |
| "grad_norm": 0.4492949843406677, | |
| "kl": 0.00011676549911499023, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0675, | |
| "reward": 0.4931858219206333, | |
| "reward_std": 0.8364229053258896, | |
| "rewards/cosine_scaled_reward": 0.1702040210366249, | |
| "rewards/format_reward": 0.15277778171002865, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2038.7083740234375, | |
| "epoch": 0.033426183844011144, | |
| "grad_norm": 0.1742168515920639, | |
| "kl": 0.00026722252368927, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.1608, | |
| "reward": 0.03227643854916096, | |
| "reward_std": 0.6348245367407799, | |
| "rewards/cosine_scaled_reward": -0.018583996687084436, | |
| "rewards/format_reward": 0.06944444496184587, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2093.7361450195312, | |
| "epoch": 0.03428326548103707, | |
| "grad_norm": 0.3163558542728424, | |
| "kl": 0.0002613067626953125, | |
| "learning_rate": 8e-07, | |
| "loss": -0.0057, | |
| "reward": 0.6369250733405352, | |
| "reward_std": 0.8408300578594208, | |
| "rewards/cosine_scaled_reward": 0.29068473260849714, | |
| "rewards/format_reward": 0.0555555559694767, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2250.875, | |
| "epoch": 0.03514034711806299, | |
| "grad_norm": 0.24971450865268707, | |
| "kl": 0.0001220703125, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0645, | |
| "reward": 0.6921100318431854, | |
| "reward_std": 0.6361059248447418, | |
| "rewards/cosine_scaled_reward": 0.3043883480131626, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2280.763916015625, | |
| "epoch": 0.03599742875508892, | |
| "grad_norm": 0.2735660672187805, | |
| "kl": 0.0002765655517578125, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": -0.0169, | |
| "reward": 0.6382394358515739, | |
| "reward_std": 0.6768579035997391, | |
| "rewards/cosine_scaled_reward": 0.2982863746583462, | |
| "rewards/format_reward": 0.041666666977107525, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2201.75, | |
| "epoch": 0.03685451039211485, | |
| "grad_norm": 0.28408563137054443, | |
| "kl": 0.0007734298706054688, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": -0.0836, | |
| "reward": 0.2587485685944557, | |
| "reward_std": 0.8071333467960358, | |
| "rewards/cosine_scaled_reward": 0.10159650258719921, | |
| "rewards/format_reward": 0.055555556900799274, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2117.277801513672, | |
| "epoch": 0.037711592029140775, | |
| "grad_norm": 0.16344650089740753, | |
| "kl": 0.00028318166732788086, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": -0.1207, | |
| "reward": 0.14954735711216927, | |
| "reward_std": 0.654334545135498, | |
| "rewards/cosine_scaled_reward": 0.04699590289965272, | |
| "rewards/format_reward": 0.0555555559694767, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2220.430633544922, | |
| "epoch": 0.0385686736661667, | |
| "grad_norm": 0.5771039128303528, | |
| "kl": 0.0004525184631347656, | |
| "learning_rate": 9e-07, | |
| "loss": -0.1835, | |
| "reward": 0.45511680841445923, | |
| "reward_std": 0.8355356454849243, | |
| "rewards/cosine_scaled_reward": 0.17894728854298592, | |
| "rewards/format_reward": 0.0972222238779068, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2210.3472595214844, | |
| "epoch": 0.03942575530319263, | |
| "grad_norm": 0.16971856355667114, | |
| "kl": 0.0004546046257019043, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.1153, | |
| "reward": 0.9467306435108185, | |
| "reward_std": 0.7745302617549896, | |
| "rewards/cosine_scaled_reward": 0.40392088890075684, | |
| "rewards/format_reward": 0.13888889271765947, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2326.6666870117188, | |
| "epoch": 0.04028283694021856, | |
| "grad_norm": 0.18288320302963257, | |
| "kl": 0.00079345703125, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0068, | |
| "reward": 0.4090319825336337, | |
| "reward_std": 0.8255032747983932, | |
| "rewards/cosine_scaled_reward": 0.17673821188509464, | |
| "rewards/format_reward": 0.0555555559694767, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2395.9027709960938, | |
| "epoch": 0.041139918577244486, | |
| "grad_norm": 0.11357901245355606, | |
| "kl": 0.0008918642997741699, | |
| "learning_rate": 9.6e-07, | |
| "loss": -0.0441, | |
| "reward": 0.5232048779726028, | |
| "reward_std": 0.824892595410347, | |
| "rewards/cosine_scaled_reward": 0.22688022814691067, | |
| "rewards/format_reward": 0.06944444589316845, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2259.8333740234375, | |
| "epoch": 0.04199700021427041, | |
| "grad_norm": 0.32720884680747986, | |
| "kl": 0.0004296302795410156, | |
| "learning_rate": 9.8e-07, | |
| "loss": -0.056, | |
| "reward": 0.26625396870076656, | |
| "reward_std": 0.7466485947370529, | |
| "rewards/cosine_scaled_reward": 0.08451587241142988, | |
| "rewards/format_reward": 0.0972222238779068, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2333.138885498047, | |
| "epoch": 0.042854081851296334, | |
| "grad_norm": 0.417357474565506, | |
| "kl": 0.0006256103515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0745, | |
| "reward": 0.7284820526838303, | |
| "reward_std": 0.6617946848273277, | |
| "rewards/cosine_scaled_reward": 0.3017410282045603, | |
| "rewards/format_reward": 0.12500000093132257, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2022.5278625488281, | |
| "epoch": 0.04371116348832226, | |
| "grad_norm": 0.3659718930721283, | |
| "kl": 0.0008034706115722656, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": -0.1326, | |
| "reward": 0.9250798672437668, | |
| "reward_std": 0.7151706963777542, | |
| "rewards/cosine_scaled_reward": 0.3861510306596756, | |
| "rewards/format_reward": 0.15277778171002865, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2212.513885498047, | |
| "epoch": 0.04456824512534819, | |
| "grad_norm": 0.5044128894805908, | |
| "kl": 0.0008015632629394531, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0979, | |
| "reward": 0.6630913875997066, | |
| "reward_std": 0.7525846064090729, | |
| "rewards/cosine_scaled_reward": 0.30376790650188923, | |
| "rewards/format_reward": 0.0555555559694767, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2055.7222290039062, | |
| "epoch": 0.04542532676237412, | |
| "grad_norm": 0.24211224913597107, | |
| "kl": 0.0007920265197753906, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.12, | |
| "reward": 0.7963576763868332, | |
| "reward_std": 0.8260948657989502, | |
| "rewards/cosine_scaled_reward": 0.3495677448809147, | |
| "rewards/format_reward": 0.0972222238779068, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2191.02783203125, | |
| "epoch": 0.046282408399400045, | |
| "grad_norm": 0.34720614552497864, | |
| "kl": 0.0012145042419433594, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0331, | |
| "reward": 0.477291576564312, | |
| "reward_std": 0.7308211028575897, | |
| "rewards/cosine_scaled_reward": 0.19697911106050014, | |
| "rewards/format_reward": 0.08333333488553762, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2249.4444885253906, | |
| "epoch": 0.04713949003642597, | |
| "grad_norm": 0.37687548995018005, | |
| "kl": 0.0005116462707519531, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0667, | |
| "reward": 0.4002426005899906, | |
| "reward_std": 0.8839976191520691, | |
| "rewards/cosine_scaled_reward": 0.1515101813711226, | |
| "rewards/format_reward": 0.0972222238779068, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2593.388916015625, | |
| "epoch": 0.04799657167345189, | |
| "grad_norm": 0.3094277083873749, | |
| "kl": 0.0008883476257324219, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": -0.0481, | |
| "reward": 0.4086095951497555, | |
| "reward_std": 0.9278567582368851, | |
| "rewards/cosine_scaled_reward": 0.17652700655162334, | |
| "rewards/format_reward": 0.0555555559694767, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2042.125, | |
| "epoch": 0.04885365331047782, | |
| "grad_norm": 0.218702495098114, | |
| "kl": 0.0008041709661483765, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": -0.0412, | |
| "reward": 0.6693701073527336, | |
| "reward_std": 0.7297275960445404, | |
| "rewards/cosine_scaled_reward": 0.2930183745920658, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1938.15283203125, | |
| "epoch": 0.04971073494750375, | |
| "grad_norm": 0.19452664256095886, | |
| "kl": 0.0034373998641967773, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0073, | |
| "reward": 0.7400819137692451, | |
| "reward_std": 0.7548571228981018, | |
| "rewards/cosine_scaled_reward": 0.34226314071565866, | |
| "rewards/format_reward": 0.0555555559694767, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1864.9027404785156, | |
| "epoch": 0.050567816584529676, | |
| "grad_norm": 0.18900378048419952, | |
| "kl": 0.0017142295837402344, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": -0.0249, | |
| "reward": 0.5964205050840974, | |
| "reward_std": 0.6683285385370255, | |
| "rewards/cosine_scaled_reward": 0.2426547077484429, | |
| "rewards/format_reward": 0.11111111380159855, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2128.15283203125, | |
| "epoch": 0.051424898221555604, | |
| "grad_norm": 0.8275086283683777, | |
| "kl": 0.0007976293563842773, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": -0.0511, | |
| "reward": 0.31469447165727615, | |
| "reward_std": 0.7699690908193588, | |
| "rewards/cosine_scaled_reward": 0.09484723675996065, | |
| "rewards/format_reward": 0.12500000186264515, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1909.6667175292969, | |
| "epoch": 0.05228197985858153, | |
| "grad_norm": 0.3160114288330078, | |
| "kl": 0.0005016326904296875, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.1518, | |
| "reward": 0.7029329240322113, | |
| "reward_std": 0.7837346494197845, | |
| "rewards/cosine_scaled_reward": 0.2542442447738722, | |
| "rewards/format_reward": 0.19444444961845875, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2232.986114501953, | |
| "epoch": 0.05313906149560746, | |
| "grad_norm": 0.15240426361560822, | |
| "kl": 0.0007839202880859375, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": -0.0462, | |
| "reward": 0.5141148939728737, | |
| "reward_std": 0.871205598115921, | |
| "rewards/cosine_scaled_reward": 0.2223352324217558, | |
| "rewards/format_reward": 0.06944444496184587, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2244.6944885253906, | |
| "epoch": 0.05399614313263338, | |
| "grad_norm": 0.3563172519207001, | |
| "kl": 0.00048482418060302734, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": -0.092, | |
| "reward": 0.32696417067199945, | |
| "reward_std": 0.7440320923924446, | |
| "rewards/cosine_scaled_reward": 0.10792652331292629, | |
| "rewards/format_reward": 0.11111111287027597, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2177.111083984375, | |
| "epoch": 0.05485322476965931, | |
| "grad_norm": 0.1857367753982544, | |
| "kl": 0.0009160041809082031, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": -0.1367, | |
| "reward": 0.8723283112049103, | |
| "reward_std": 0.8291849941015244, | |
| "rewards/cosine_scaled_reward": 0.36671971902251244, | |
| "rewards/format_reward": 0.13888889085501432, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2447.9861450195312, | |
| "epoch": 0.055710306406685235, | |
| "grad_norm": 0.4035802185535431, | |
| "kl": 0.0016689300537109375, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": -0.1549, | |
| "reward": 0.1848380509763956, | |
| "reward_std": 0.7055136561393738, | |
| "rewards/cosine_scaled_reward": 0.050752353854477406, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2348.5416870117188, | |
| "epoch": 0.05656738804371116, | |
| "grad_norm": 0.1111924946308136, | |
| "kl": 0.0013647079467773438, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": -0.046, | |
| "reward": 0.3021825775504112, | |
| "reward_std": 0.5473092719912529, | |
| "rewards/cosine_scaled_reward": 0.09553573839366436, | |
| "rewards/format_reward": 0.11111111380159855, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2172.625030517578, | |
| "epoch": 0.05742446968073709, | |
| "grad_norm": 0.3930043578147888, | |
| "kl": 0.005237579345703125, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.1107, | |
| "reward": 0.6916175857186317, | |
| "reward_std": 0.932863637804985, | |
| "rewards/cosine_scaled_reward": 0.24858656898140907, | |
| "rewards/format_reward": 0.19444444961845875, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1801.1666564941406, | |
| "epoch": 0.05828155131776302, | |
| "grad_norm": 0.22752800583839417, | |
| "kl": 0.0023860931396484375, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0344, | |
| "reward": 0.47825843654572964, | |
| "reward_std": 0.7419339567422867, | |
| "rewards/cosine_scaled_reward": 0.16968477331101894, | |
| "rewards/format_reward": 0.13888889085501432, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2261.625030517578, | |
| "epoch": 0.059138632954788946, | |
| "grad_norm": 0.16897499561309814, | |
| "kl": 0.0013341903686523438, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0432, | |
| "reward": 0.35771574825048447, | |
| "reward_std": 0.7713551372289658, | |
| "rewards/cosine_scaled_reward": 0.12330232141539454, | |
| "rewards/format_reward": 0.11111111287027597, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2128.111114501953, | |
| "epoch": 0.059995714591814873, | |
| "grad_norm": 0.9206687808036804, | |
| "kl": 0.0018476247787475586, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": -0.1611, | |
| "reward": 0.5072405263781548, | |
| "reward_std": 0.8621459007263184, | |
| "rewards/cosine_scaled_reward": 0.22584249824285507, | |
| "rewards/format_reward": 0.0555555559694767, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2014.263916015625, | |
| "epoch": 0.060852796228840794, | |
| "grad_norm": 0.3131060004234314, | |
| "kl": 0.0014543533325195312, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0172, | |
| "reward": 0.4196384996175766, | |
| "reward_std": 0.6068210601806641, | |
| "rewards/cosine_scaled_reward": 0.17509703256655484, | |
| "rewards/format_reward": 0.06944444589316845, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2465.2916870117188, | |
| "epoch": 0.06170987786586672, | |
| "grad_norm": 0.187669575214386, | |
| "kl": 0.0011992454528808594, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": -0.0272, | |
| "reward": 0.5738496109843254, | |
| "reward_std": 0.8109806478023529, | |
| "rewards/cosine_scaled_reward": 0.2244247980415821, | |
| "rewards/format_reward": 0.12500000279396772, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2012.3888854980469, | |
| "epoch": 0.06256695950289265, | |
| "grad_norm": 0.3393029570579529, | |
| "kl": 0.0021200180053710938, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": -0.1397, | |
| "reward": 0.32288938760757446, | |
| "reward_std": 0.6824958473443985, | |
| "rewards/cosine_scaled_reward": 0.10588914155960083, | |
| "rewards/format_reward": 0.1111111119389534, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2256.7084045410156, | |
| "epoch": 0.06342404113991858, | |
| "grad_norm": 0.4278959035873413, | |
| "kl": 0.0016908645629882812, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.1105, | |
| "reward": 0.7894450277090073, | |
| "reward_std": 0.6593053936958313, | |
| "rewards/cosine_scaled_reward": 0.32527804747223854, | |
| "rewards/format_reward": 0.13888888992369175, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2068.9722290039062, | |
| "epoch": 0.0642811227769445, | |
| "grad_norm": 0.4622882008552551, | |
| "kl": 0.0036230087280273438, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0005, | |
| "reward": 0.22274947352707386, | |
| "reward_std": 0.8964805155992508, | |
| "rewards/cosine_scaled_reward": 0.09748584777116776, | |
| "rewards/format_reward": 0.02777777798473835, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2233.750030517578, | |
| "epoch": 0.06513820441397043, | |
| "grad_norm": 0.3396798074245453, | |
| "kl": 0.0010144710540771484, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": -0.0669, | |
| "reward": 0.39245418552309275, | |
| "reward_std": 0.7776346653699875, | |
| "rewards/cosine_scaled_reward": 0.15456042252480984, | |
| "rewards/format_reward": 0.08333333674818277, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2399.513916015625, | |
| "epoch": 0.06599528605099636, | |
| "grad_norm": 0.2183726727962494, | |
| "kl": 0.0007622241973876953, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": -0.0456, | |
| "reward": 0.2495843954384327, | |
| "reward_std": 0.5640047863125801, | |
| "rewards/cosine_scaled_reward": 0.07618108215683606, | |
| "rewards/format_reward": 0.0972222238779068, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1918.75, | |
| "epoch": 0.06685236768802229, | |
| "grad_norm": 0.2223767340183258, | |
| "kl": 0.0014109611511230469, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0196, | |
| "reward": 0.32135773450136185, | |
| "reward_std": 0.7068247720599174, | |
| "rewards/cosine_scaled_reward": 0.10512331128120422, | |
| "rewards/format_reward": 0.11111111287027597, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2230.125030517578, | |
| "epoch": 0.06770944932504822, | |
| "grad_norm": 0.4025353789329529, | |
| "kl": 0.0010633468627929688, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0663, | |
| "reward": 0.38789599807932973, | |
| "reward_std": 0.703848659992218, | |
| "rewards/cosine_scaled_reward": 0.13839244283735752, | |
| "rewards/format_reward": 0.11111111287027597, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2083.40283203125, | |
| "epoch": 0.06856653096207414, | |
| "grad_norm": 0.17173290252685547, | |
| "kl": 0.0009837150573730469, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": -0.0862, | |
| "reward": 0.5672617349773645, | |
| "reward_std": 0.7471679449081421, | |
| "rewards/cosine_scaled_reward": 0.23501975380349904, | |
| "rewards/format_reward": 0.09722222294658422, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2022.3889465332031, | |
| "epoch": 0.06942361259910007, | |
| "grad_norm": 0.188516765832901, | |
| "kl": 0.00244903564453125, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.021, | |
| "reward": 0.6240249052643776, | |
| "reward_std": 0.7871060222387314, | |
| "rewards/cosine_scaled_reward": 0.2703457809984684, | |
| "rewards/format_reward": 0.08333333488553762, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2090.4583740234375, | |
| "epoch": 0.07028069423612598, | |
| "grad_norm": 0.28751835227012634, | |
| "kl": 0.0048160552978515625, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.1485, | |
| "reward": 0.20903612580150366, | |
| "reward_std": 0.8013690561056137, | |
| "rewards/cosine_scaled_reward": 0.05590694583952427, | |
| "rewards/format_reward": 0.0972222238779068, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2388.4584350585938, | |
| "epoch": 0.07113777587315191, | |
| "grad_norm": 0.22767475247383118, | |
| "kl": 0.0034203529357910156, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": -0.0868, | |
| "reward": 0.11441808566451073, | |
| "reward_std": 0.6552191823720932, | |
| "rewards/cosine_scaled_reward": 0.04332014673855156, | |
| "rewards/format_reward": 0.02777777798473835, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2293.0694274902344, | |
| "epoch": 0.07199485751017784, | |
| "grad_norm": 0.11945953965187073, | |
| "kl": 0.0014848709106445312, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0848, | |
| "reward": 0.5547876954078674, | |
| "reward_std": 0.7587804347276688, | |
| "rewards/cosine_scaled_reward": 0.17322717513889074, | |
| "rewards/format_reward": 0.20833333674818277, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2540.3611450195312, | |
| "epoch": 0.07285193914720377, | |
| "grad_norm": 0.178083136677742, | |
| "kl": 0.002269744873046875, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": -0.1125, | |
| "reward": -0.029575519263744354, | |
| "reward_std": 0.563959889113903, | |
| "rewards/cosine_scaled_reward": -0.05645443079993129, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2571.7361450195312, | |
| "epoch": 0.0737090207842297, | |
| "grad_norm": 0.1585242599248886, | |
| "kl": 0.0007100105285644531, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0231, | |
| "reward": 0.3958965800702572, | |
| "reward_std": 0.8284079432487488, | |
| "rewards/cosine_scaled_reward": 0.12155939312651753, | |
| "rewards/format_reward": 0.15277778077870607, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2367.3333740234375, | |
| "epoch": 0.07456610242125562, | |
| "grad_norm": 0.32309991121292114, | |
| "kl": 0.0017614364624023438, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.1136, | |
| "reward": 0.7444435358047485, | |
| "reward_std": 0.7368911355733871, | |
| "rewards/cosine_scaled_reward": 0.28888843953609467, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2257.9583129882812, | |
| "epoch": 0.07542318405828155, | |
| "grad_norm": 0.22389107942581177, | |
| "kl": 0.0009946823120117188, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": -0.0003, | |
| "reward": 0.7726655453443527, | |
| "reward_std": 0.7912376075983047, | |
| "rewards/cosine_scaled_reward": 0.3099438678473234, | |
| "rewards/format_reward": 0.15277777891606092, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2160.1944580078125, | |
| "epoch": 0.07628026569530748, | |
| "grad_norm": 0.22130398452281952, | |
| "kl": 0.005789756774902344, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0076, | |
| "reward": 0.47880110889673233, | |
| "reward_std": 0.8113372325897217, | |
| "rewards/cosine_scaled_reward": 0.19773388467729092, | |
| "rewards/format_reward": 0.08333333488553762, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2317.486114501953, | |
| "epoch": 0.0771373473323334, | |
| "grad_norm": 0.25060081481933594, | |
| "kl": 0.002841949462890625, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": -0.0693, | |
| "reward": 0.17479625344276428, | |
| "reward_std": 0.7589741870760918, | |
| "rewards/cosine_scaled_reward": 0.011009246576577425, | |
| "rewards/format_reward": 0.15277778171002865, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2067.597198486328, | |
| "epoch": 0.07799442896935933, | |
| "grad_norm": 0.6746274828910828, | |
| "kl": 0.0038204193115234375, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.093, | |
| "reward": 0.5547708794474602, | |
| "reward_std": 0.7499261125922203, | |
| "rewards/cosine_scaled_reward": 0.2148854248225689, | |
| "rewards/format_reward": 0.12500000093132257, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2108.736114501953, | |
| "epoch": 0.07885151060638526, | |
| "grad_norm": 0.2025013267993927, | |
| "kl": 0.0023174285888671875, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": -0.0409, | |
| "reward": 0.5829055476933718, | |
| "reward_std": 0.8402788192033768, | |
| "rewards/cosine_scaled_reward": 0.21506388299167156, | |
| "rewards/format_reward": 0.15277778077870607, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2121.388885498047, | |
| "epoch": 0.07970859224341119, | |
| "grad_norm": 0.20141884684562683, | |
| "kl": 0.0013599395751953125, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.1446, | |
| "reward": 0.4946107156574726, | |
| "reward_std": 0.7894033789634705, | |
| "rewards/cosine_scaled_reward": 0.16397201921790838, | |
| "rewards/format_reward": 0.16666667070239782, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1983.5138854980469, | |
| "epoch": 0.08056567388043712, | |
| "grad_norm": 0.6650830507278442, | |
| "kl": 0.0022716522216796875, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.2632, | |
| "reward": 0.7651779092848301, | |
| "reward_std": 0.8691636770963669, | |
| "rewards/cosine_scaled_reward": 0.25758895510807633, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1863.0833435058594, | |
| "epoch": 0.08142275551746304, | |
| "grad_norm": 0.16325955092906952, | |
| "kl": 0.0027484893798828125, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": -0.0665, | |
| "reward": 0.6303411349654198, | |
| "reward_std": 0.8782893866300583, | |
| "rewards/cosine_scaled_reward": 0.2179483361542225, | |
| "rewards/format_reward": 0.19444444961845875, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1843.5833435058594, | |
| "epoch": 0.08227983715448897, | |
| "grad_norm": 0.29014191031455994, | |
| "kl": 0.0071086883544921875, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": -0.1035, | |
| "reward": 0.8213257193565369, | |
| "reward_std": 0.7900742888450623, | |
| "rewards/cosine_scaled_reward": 0.3551072867412586, | |
| "rewards/format_reward": 0.11111111287027597, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2286.7361450195312, | |
| "epoch": 0.08313691879151489, | |
| "grad_norm": 0.1988731324672699, | |
| "kl": 0.0035772323608398438, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": -0.0885, | |
| "reward": -0.018717994913458824, | |
| "reward_std": 0.6726399958133698, | |
| "rewards/cosine_scaled_reward": -0.07185898721218109, | |
| "rewards/format_reward": 0.12500000465661287, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1786.0972595214844, | |
| "epoch": 0.08399400042854081, | |
| "grad_norm": 0.49428296089172363, | |
| "kl": 0.008466720581054688, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.166, | |
| "reward": 0.6488583460450172, | |
| "reward_std": 0.7391397655010223, | |
| "rewards/cosine_scaled_reward": 0.2966513857245445, | |
| "rewards/format_reward": 0.055555556900799274, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2219.3055725097656, | |
| "epoch": 0.08485108206556674, | |
| "grad_norm": 0.30233073234558105, | |
| "kl": 0.003276824951171875, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": -0.0317, | |
| "reward": 0.06553506385535002, | |
| "reward_std": 0.608645610511303, | |
| "rewards/cosine_scaled_reward": -0.029732469469308853, | |
| "rewards/format_reward": 0.12500000279396772, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2252.916717529297, | |
| "epoch": 0.08570816370259267, | |
| "grad_norm": 0.16652679443359375, | |
| "kl": 0.007282257080078125, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": -0.0111, | |
| "reward": 1.0242944061756134, | |
| "reward_std": 0.8552243709564209, | |
| "rewards/cosine_scaled_reward": 0.4079805314540863, | |
| "rewards/format_reward": 0.20833334140479565, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2132.361114501953, | |
| "epoch": 0.0865652453396186, | |
| "grad_norm": 0.3907296061515808, | |
| "kl": 0.006062507629394531, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.2013, | |
| "reward": 0.8774868845939636, | |
| "reward_std": 0.8741513937711716, | |
| "rewards/cosine_scaled_reward": 0.3623545281589031, | |
| "rewards/format_reward": 0.15277778077870607, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1591.3611450195312, | |
| "epoch": 0.08742232697664452, | |
| "grad_norm": 0.20045924186706543, | |
| "kl": 0.020427703857421875, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.067, | |
| "reward": 0.6824215389788151, | |
| "reward_std": 0.8814049959182739, | |
| "rewards/cosine_scaled_reward": 0.27871076576411724, | |
| "rewards/format_reward": 0.12500000093132257, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2065.1944885253906, | |
| "epoch": 0.08827940861367045, | |
| "grad_norm": 0.39964577555656433, | |
| "kl": 0.00489044189453125, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0617, | |
| "reward": 0.3749300390481949, | |
| "reward_std": 0.8525811061263084, | |
| "rewards/cosine_scaled_reward": 0.11802058666944504, | |
| "rewards/format_reward": 0.13888889364898205, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1969.375, | |
| "epoch": 0.08913649025069638, | |
| "grad_norm": 0.4705803394317627, | |
| "kl": 0.0055637359619140625, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.1758, | |
| "reward": 1.0127343982458115, | |
| "reward_std": 0.6061429902911186, | |
| "rewards/cosine_scaled_reward": 0.3744227262213826, | |
| "rewards/format_reward": 0.2638888955116272, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2124.2916564941406, | |
| "epoch": 0.0899935718877223, | |
| "grad_norm": 0.19006462395191193, | |
| "kl": 0.0029754638671875, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0454, | |
| "reward": 0.9216031394898891, | |
| "reward_std": 0.9004587382078171, | |
| "rewards/cosine_scaled_reward": 0.3635793221183121, | |
| "rewards/format_reward": 0.19444444868713617, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1721.5694885253906, | |
| "epoch": 0.09085065352474823, | |
| "grad_norm": 0.18808282911777496, | |
| "kl": 0.008443832397460938, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0263, | |
| "reward": 0.6266486272215843, | |
| "reward_std": 0.6180888190865517, | |
| "rewards/cosine_scaled_reward": 0.2994354497641325, | |
| "rewards/format_reward": 0.02777777798473835, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2028.7083740234375, | |
| "epoch": 0.09170773516177416, | |
| "grad_norm": 0.24120453000068665, | |
| "kl": 0.01983642578125, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.1064, | |
| "reward": 0.5810213461518288, | |
| "reward_std": 0.5696274787187576, | |
| "rewards/cosine_scaled_reward": 0.2002328964881599, | |
| "rewards/format_reward": 0.18055556248873472, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2217.90283203125, | |
| "epoch": 0.09256481679880009, | |
| "grad_norm": 0.3850785493850708, | |
| "kl": 0.0053253173828125, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": -0.0305, | |
| "reward": 0.6920746862888336, | |
| "reward_std": 0.9560717344284058, | |
| "rewards/cosine_scaled_reward": 0.25575956143438816, | |
| "rewards/format_reward": 0.18055556248873472, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2110.7222595214844, | |
| "epoch": 0.09342189843582602, | |
| "grad_norm": 0.167547345161438, | |
| "kl": 0.01023101806640625, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0664, | |
| "reward": 0.9005825072526932, | |
| "reward_std": 0.7161072492599487, | |
| "rewards/cosine_scaled_reward": 0.36001347936689854, | |
| "rewards/format_reward": 0.18055556155741215, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1947.2500305175781, | |
| "epoch": 0.09427898007285195, | |
| "grad_norm": 0.14026661217212677, | |
| "kl": 0.00750732421875, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": -0.02, | |
| "reward": 0.6598109304904938, | |
| "reward_std": 0.7547452449798584, | |
| "rewards/cosine_scaled_reward": 0.2396276891231537, | |
| "rewards/format_reward": 0.18055555876344442, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1930.9027709960938, | |
| "epoch": 0.09513606170987787, | |
| "grad_norm": 0.21193362772464752, | |
| "kl": 0.007018566131591797, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.1196, | |
| "reward": 0.5302619338035583, | |
| "reward_std": 0.6282015666365623, | |
| "rewards/cosine_scaled_reward": 0.18179763481020927, | |
| "rewards/format_reward": 0.16666666977107525, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2134.0694885253906, | |
| "epoch": 0.09599314334690379, | |
| "grad_norm": 0.24564455449581146, | |
| "kl": 0.00876617431640625, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0425, | |
| "reward": 0.7763140201568604, | |
| "reward_std": 0.937856912612915, | |
| "rewards/cosine_scaled_reward": 0.2909347750246525, | |
| "rewards/format_reward": 0.19444444868713617, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2006.2083129882812, | |
| "epoch": 0.09685022498392971, | |
| "grad_norm": 0.2176864892244339, | |
| "kl": 0.011257171630859375, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.2242, | |
| "reward": 1.0729680806398392, | |
| "reward_std": 0.8336671739816666, | |
| "rewards/cosine_scaled_reward": 0.39065071195364, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1895.02783203125, | |
| "epoch": 0.09770730662095564, | |
| "grad_norm": 0.15649937093257904, | |
| "kl": 0.00569915771484375, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0528, | |
| "reward": 0.438002310693264, | |
| "reward_std": 0.9526876509189606, | |
| "rewards/cosine_scaled_reward": 0.09400115348398685, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1993.6528015136719, | |
| "epoch": 0.09856438825798157, | |
| "grad_norm": 0.4257446527481079, | |
| "kl": 0.01016998291015625, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0147, | |
| "reward": 0.6780023947358131, | |
| "reward_std": 0.9333401471376419, | |
| "rewards/cosine_scaled_reward": 0.2626123018562794, | |
| "rewards/format_reward": 0.15277778077870607, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1788.4444885253906, | |
| "epoch": 0.0994214698950075, | |
| "grad_norm": 0.15009744465351105, | |
| "kl": 0.01854705810546875, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.1687, | |
| "reward": 0.8848052807152271, | |
| "reward_std": 0.9200158715248108, | |
| "rewards/cosine_scaled_reward": 0.3590692952275276, | |
| "rewards/format_reward": 0.16666666977107525, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1550.0833129882812, | |
| "epoch": 0.10027855153203342, | |
| "grad_norm": 0.2336883395910263, | |
| "kl": 0.0109710693359375, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0968, | |
| "reward": 0.721510112285614, | |
| "reward_std": 0.8123895823955536, | |
| "rewards/cosine_scaled_reward": 0.2635328520555049, | |
| "rewards/format_reward": 0.19444444961845875, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2167.1250610351562, | |
| "epoch": 0.10113563316905935, | |
| "grad_norm": 0.3328269422054291, | |
| "kl": 0.009395599365234375, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0206, | |
| "reward": -0.14523404464125633, | |
| "reward_std": 0.5727858245372772, | |
| "rewards/cosine_scaled_reward": -0.14206147193908691, | |
| "rewards/format_reward": 0.13888888992369175, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1597.2361145019531, | |
| "epoch": 0.10199271480608528, | |
| "grad_norm": 1.107146143913269, | |
| "kl": 0.0159759521484375, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.5165, | |
| "reward": 0.715252235531807, | |
| "reward_std": 0.834864467382431, | |
| "rewards/cosine_scaled_reward": 0.25345943216234446, | |
| "rewards/format_reward": 0.20833333488553762, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2161.027801513672, | |
| "epoch": 0.10284979644311121, | |
| "grad_norm": 0.15888024866580963, | |
| "kl": 0.007426261901855469, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0203, | |
| "reward": 0.2189617045223713, | |
| "reward_std": 0.7949748933315277, | |
| "rewards/cosine_scaled_reward": 0.04003641102463007, | |
| "rewards/format_reward": 0.13888889271765947, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2409.2083129882812, | |
| "epoch": 0.10370687808013714, | |
| "grad_norm": 0.21175938844680786, | |
| "kl": 0.008785247802734375, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": -0.0082, | |
| "reward": 0.8535979464650154, | |
| "reward_std": 0.8966440111398697, | |
| "rewards/cosine_scaled_reward": 0.3642989657819271, | |
| "rewards/format_reward": 0.12500000186264515, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1877.7222290039062, | |
| "epoch": 0.10456395971716306, | |
| "grad_norm": 0.350690633058548, | |
| "kl": 0.01152801513671875, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": -0.014, | |
| "reward": 0.47487088665366173, | |
| "reward_std": 0.7110822051763535, | |
| "rewards/cosine_scaled_reward": 0.13326877844519913, | |
| "rewards/format_reward": 0.20833333674818277, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1858.8750610351562, | |
| "epoch": 0.10542104135418899, | |
| "grad_norm": 0.2693331837654114, | |
| "kl": 0.016326904296875, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": -0.021, | |
| "reward": 0.7835421413183212, | |
| "reward_std": 0.7691494226455688, | |
| "rewards/cosine_scaled_reward": 0.2806599698960781, | |
| "rewards/format_reward": 0.22222222946584225, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1666.7222290039062, | |
| "epoch": 0.10627812299121492, | |
| "grad_norm": 0.21113620698451996, | |
| "kl": 0.018585205078125, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.224, | |
| "reward": 0.8200660422444344, | |
| "reward_std": 0.7651881277561188, | |
| "rewards/cosine_scaled_reward": 0.2780885882675648, | |
| "rewards/format_reward": 0.2638888992369175, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1892.8472290039062, | |
| "epoch": 0.10713520462824085, | |
| "grad_norm": 0.2539234161376953, | |
| "kl": 0.015058517456054688, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.1511, | |
| "reward": 0.6824524328112602, | |
| "reward_std": 0.6703763008117676, | |
| "rewards/cosine_scaled_reward": 0.2578928880393505, | |
| "rewards/format_reward": 0.16666667256504297, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2195.3472290039062, | |
| "epoch": 0.10799228626526676, | |
| "grad_norm": 0.1690240204334259, | |
| "kl": 0.011653900146484375, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0703, | |
| "reward": 0.618515363894403, | |
| "reward_std": 0.9210871905088425, | |
| "rewards/cosine_scaled_reward": 0.1842576777562499, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2141.375030517578, | |
| "epoch": 0.10884936790229269, | |
| "grad_norm": 0.19393949210643768, | |
| "kl": 0.01447296142578125, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.1103, | |
| "reward": 0.9065948352217674, | |
| "reward_std": 0.6801795363426208, | |
| "rewards/cosine_scaled_reward": 0.36301962845027447, | |
| "rewards/format_reward": 0.18055555783212185, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2105.375030517578, | |
| "epoch": 0.10970644953931862, | |
| "grad_norm": 0.21088111400604248, | |
| "kl": 0.01592254638671875, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0865, | |
| "reward": 0.822256007231772, | |
| "reward_std": 0.953468844294548, | |
| "rewards/cosine_scaled_reward": 0.2861280349898152, | |
| "rewards/format_reward": 0.2500000046566129, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2267.9583129882812, | |
| "epoch": 0.11056353117634454, | |
| "grad_norm": 0.2045803964138031, | |
| "kl": 0.01177978515625, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0814, | |
| "reward": 0.36659867502748966, | |
| "reward_std": 0.6582172811031342, | |
| "rewards/cosine_scaled_reward": 0.05135490372776985, | |
| "rewards/format_reward": 0.2638888955116272, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1816.2222290039062, | |
| "epoch": 0.11142061281337047, | |
| "grad_norm": 0.44615596532821655, | |
| "kl": 0.0237884521484375, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0822, | |
| "reward": 0.9266887735575438, | |
| "reward_std": 0.7460800111293793, | |
| "rewards/cosine_scaled_reward": 0.3313999269157648, | |
| "rewards/format_reward": 0.2638888955116272, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1904.1944885253906, | |
| "epoch": 0.1122776944503964, | |
| "grad_norm": 0.2686100900173187, | |
| "kl": 0.0119476318359375, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0847, | |
| "reward": 0.4160115160048008, | |
| "reward_std": 0.7496158927679062, | |
| "rewards/cosine_scaled_reward": 0.08995018899440765, | |
| "rewards/format_reward": 0.236111119389534, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1897.7083129882812, | |
| "epoch": 0.11313477608742233, | |
| "grad_norm": 0.2043069452047348, | |
| "kl": 0.01409912109375, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.1342, | |
| "reward": 0.4763021022081375, | |
| "reward_std": 0.810718834400177, | |
| "rewards/cosine_scaled_reward": 0.1339843887835741, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2475.1111450195312, | |
| "epoch": 0.11399185772444825, | |
| "grad_norm": 0.15878832340240479, | |
| "kl": 0.00994110107421875, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": -0.0354, | |
| "reward": 0.2221047766506672, | |
| "reward_std": 0.7571545913815498, | |
| "rewards/cosine_scaled_reward": 0.041607944294810295, | |
| "rewards/format_reward": 0.13888889364898205, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2260.750030517578, | |
| "epoch": 0.11484893936147418, | |
| "grad_norm": 0.1739797592163086, | |
| "kl": 0.00885772705078125, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0656, | |
| "reward": 0.7371436022222042, | |
| "reward_std": 0.7502488344907761, | |
| "rewards/cosine_scaled_reward": 0.2713496144860983, | |
| "rewards/format_reward": 0.19444444961845875, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1935.4583740234375, | |
| "epoch": 0.11570602099850011, | |
| "grad_norm": 0.5385437607765198, | |
| "kl": 0.0226593017578125, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.1112, | |
| "reward": 0.5696738436818123, | |
| "reward_std": 0.6989183947443962, | |
| "rewards/cosine_scaled_reward": 0.19455914944410324, | |
| "rewards/format_reward": 0.18055555876344442, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1787.6111145019531, | |
| "epoch": 0.11656310263552604, | |
| "grad_norm": 0.5149728059768677, | |
| "kl": 0.012420654296875, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.1226, | |
| "reward": 0.687653437256813, | |
| "reward_std": 0.9176287800073624, | |
| "rewards/cosine_scaled_reward": 0.1841045105829835, | |
| "rewards/format_reward": 0.3194444552063942, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1814.8194274902344, | |
| "epoch": 0.11742018427255196, | |
| "grad_norm": 0.2650432884693146, | |
| "kl": 0.0258636474609375, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": -0.0266, | |
| "reward": 1.1596794873476028, | |
| "reward_std": 0.9145576506853104, | |
| "rewards/cosine_scaled_reward": 0.4201175607740879, | |
| "rewards/format_reward": 0.3194444514811039, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1682.513916015625, | |
| "epoch": 0.11827726590957789, | |
| "grad_norm": 0.2149789184331894, | |
| "kl": 0.0146484375, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.1368, | |
| "reward": 0.49744264781475067, | |
| "reward_std": 0.819553479552269, | |
| "rewards/cosine_scaled_reward": 0.12372134439647198, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1823.75, | |
| "epoch": 0.11913434754660382, | |
| "grad_norm": 0.4041607677936554, | |
| "kl": 0.018646240234375, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.158, | |
| "reward": 0.5236843451857567, | |
| "reward_std": 0.6677617505192757, | |
| "rewards/cosine_scaled_reward": 0.10906438087113202, | |
| "rewards/format_reward": 0.305555559694767, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1966.875, | |
| "epoch": 0.11999142918362975, | |
| "grad_norm": 0.19172143936157227, | |
| "kl": 0.0146484375, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.12, | |
| "reward": 0.788971059024334, | |
| "reward_std": 0.7478453367948532, | |
| "rewards/cosine_scaled_reward": 0.2625410854816437, | |
| "rewards/format_reward": 0.2638888955116272, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2236.7361450195312, | |
| "epoch": 0.12084851082065566, | |
| "grad_norm": 0.20563149452209473, | |
| "kl": 0.011810302734375, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0257, | |
| "reward": 0.4495049864053726, | |
| "reward_std": 0.9425568133592606, | |
| "rewards/cosine_scaled_reward": 0.1275302767753601, | |
| "rewards/format_reward": 0.1944444514811039, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1477.8611145019531, | |
| "epoch": 0.12170559245768159, | |
| "grad_norm": 0.5766943693161011, | |
| "kl": 0.01958465576171875, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.2984, | |
| "reward": 0.9424830563366413, | |
| "reward_std": 0.763814777135849, | |
| "rewards/cosine_scaled_reward": 0.34624152863398194, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1456.9722595214844, | |
| "epoch": 0.12256267409470752, | |
| "grad_norm": 0.33502310514450073, | |
| "kl": 0.02838134765625, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.2811, | |
| "reward": 1.1146164610981941, | |
| "reward_std": 0.8421279340982437, | |
| "rewards/cosine_scaled_reward": 0.3698082063347101, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1582.8611450195312, | |
| "epoch": 0.12341975573173344, | |
| "grad_norm": 0.5452846884727478, | |
| "kl": 0.0584716796875, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.2054, | |
| "reward": 0.49096263851970434, | |
| "reward_std": 0.8086179941892624, | |
| "rewards/cosine_scaled_reward": 0.15520351566374302, | |
| "rewards/format_reward": 0.18055555876344442, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2001.9305725097656, | |
| "epoch": 0.12427683736875937, | |
| "grad_norm": 0.29051852226257324, | |
| "kl": 0.0219879150390625, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.1612, | |
| "reward": 0.858160063624382, | |
| "reward_std": 0.8109176307916641, | |
| "rewards/cosine_scaled_reward": 0.2832467071712017, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1551.8611145019531, | |
| "epoch": 0.1251339190057853, | |
| "grad_norm": 0.3925701975822449, | |
| "kl": 0.02143096923828125, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.301, | |
| "reward": 0.8699105493724346, | |
| "reward_std": 0.96321090310812, | |
| "rewards/cosine_scaled_reward": 0.28217751905322075, | |
| "rewards/format_reward": 0.3055555634200573, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1913.1111450195312, | |
| "epoch": 0.12599100064281124, | |
| "grad_norm": 0.39382514357566833, | |
| "kl": 0.0259246826171875, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.1395, | |
| "reward": 0.39645494148135185, | |
| "reward_std": 0.8219664841890335, | |
| "rewards/cosine_scaled_reward": 0.08017192035913467, | |
| "rewards/format_reward": 0.2361111156642437, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1661.125, | |
| "epoch": 0.12684808227983715, | |
| "grad_norm": 0.2739737927913666, | |
| "kl": 0.064727783203125, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0322, | |
| "reward": 0.9951315224170685, | |
| "reward_std": 0.8566233068704605, | |
| "rewards/cosine_scaled_reward": 0.39339908584952354, | |
| "rewards/format_reward": 0.20833334140479565, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2030.9027709960938, | |
| "epoch": 0.12770516391686307, | |
| "grad_norm": 0.1996004581451416, | |
| "kl": 0.01544189453125, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0669, | |
| "reward": 0.31733213737607, | |
| "reward_std": 0.5332914516329765, | |
| "rewards/cosine_scaled_reward": 0.02672163024544716, | |
| "rewards/format_reward": 0.2638888955116272, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1978.4722595214844, | |
| "epoch": 0.128562245553889, | |
| "grad_norm": 0.4115337133407593, | |
| "kl": 0.0281524658203125, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.012, | |
| "reward": 0.8926911260932684, | |
| "reward_std": 0.6474704742431641, | |
| "rewards/cosine_scaled_reward": 0.2935677766799927, | |
| "rewards/format_reward": 0.3055555634200573, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1758.2083740234375, | |
| "epoch": 0.12941932719091492, | |
| "grad_norm": 0.3856815695762634, | |
| "kl": 0.0369873046875, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.237, | |
| "reward": 0.8293609768152237, | |
| "reward_std": 0.9914700090885162, | |
| "rewards/cosine_scaled_reward": 0.2827360359951854, | |
| "rewards/format_reward": 0.26388889364898205, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1611.4861450195312, | |
| "epoch": 0.13027640882794086, | |
| "grad_norm": 0.44330915808677673, | |
| "kl": 0.0380401611328125, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": -0.0427, | |
| "reward": 0.6217841571196914, | |
| "reward_std": 0.6576393991708755, | |
| "rewards/cosine_scaled_reward": 0.19283651188015938, | |
| "rewards/format_reward": 0.236111119389534, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1863.4305725097656, | |
| "epoch": 0.13113349046496678, | |
| "grad_norm": 0.23155762255191803, | |
| "kl": 0.029293060302734375, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": -0.0151, | |
| "reward": 0.6762807443737984, | |
| "reward_std": 0.9942405819892883, | |
| "rewards/cosine_scaled_reward": 0.19925148598849773, | |
| "rewards/format_reward": 0.2777777835726738, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2036.3055725097656, | |
| "epoch": 0.13199057210199272, | |
| "grad_norm": 6.47697114944458, | |
| "kl": 0.0350189208984375, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0039, | |
| "reward": 0.2652840279042721, | |
| "reward_std": 0.5364516898989677, | |
| "rewards/cosine_scaled_reward": -0.0131913423538208, | |
| "rewards/format_reward": 0.2916666688397527, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1694.7361450195312, | |
| "epoch": 0.13284765373901863, | |
| "grad_norm": 0.6100454330444336, | |
| "kl": 0.057952880859375, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.2837, | |
| "reward": 0.949450820684433, | |
| "reward_std": 0.8621908873319626, | |
| "rewards/cosine_scaled_reward": 0.3705587573349476, | |
| "rewards/format_reward": 0.2083333395421505, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1715.5972442626953, | |
| "epoch": 0.13370473537604458, | |
| "grad_norm": 0.36461231112480164, | |
| "kl": 0.034576416015625, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.1019, | |
| "reward": 0.9654277712106705, | |
| "reward_std": 0.8116246461868286, | |
| "rewards/cosine_scaled_reward": 0.31604722142219543, | |
| "rewards/format_reward": 0.33333333767950535, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1362.9583435058594, | |
| "epoch": 0.1345618170130705, | |
| "grad_norm": 0.3907575309276581, | |
| "kl": 0.0545654296875, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.141, | |
| "reward": 0.8722702264785767, | |
| "reward_std": 0.6232585608959198, | |
| "rewards/cosine_scaled_reward": 0.29724621400237083, | |
| "rewards/format_reward": 0.27777778543531895, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1861.013916015625, | |
| "epoch": 0.13541889865009643, | |
| "grad_norm": 0.36566102504730225, | |
| "kl": 0.050750732421875, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.2151, | |
| "reward": 0.5994696915149689, | |
| "reward_std": 0.9274942576885223, | |
| "rewards/cosine_scaled_reward": 0.18167929956689477, | |
| "rewards/format_reward": 0.2361111156642437, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1819.916748046875, | |
| "epoch": 0.13627598028712234, | |
| "grad_norm": 0.31390905380249023, | |
| "kl": 0.05914306640625, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.3592, | |
| "reward": 0.8621488437056541, | |
| "reward_std": 0.8510329574346542, | |
| "rewards/cosine_scaled_reward": 0.29218554496765137, | |
| "rewards/format_reward": 0.2777777835726738, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1867.0277709960938, | |
| "epoch": 0.1371330619241483, | |
| "grad_norm": 0.23102979362010956, | |
| "kl": 0.0302734375, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0781, | |
| "reward": 0.5296455472707748, | |
| "reward_std": 0.7065431177616119, | |
| "rewards/cosine_scaled_reward": 0.11898945830762386, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1560.7916717529297, | |
| "epoch": 0.1379901435611742, | |
| "grad_norm": 0.35218510031700134, | |
| "kl": 0.05161285400390625, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": -0.046, | |
| "reward": 0.9299365878105164, | |
| "reward_std": 0.5807594284415245, | |
| "rewards/cosine_scaled_reward": 0.3260794151574373, | |
| "rewards/format_reward": 0.2777777835726738, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1482.4861450195312, | |
| "epoch": 0.13884722519820014, | |
| "grad_norm": 0.31520912051200867, | |
| "kl": 0.05499267578125, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.1936, | |
| "reward": 1.1436526030302048, | |
| "reward_std": 0.786539800465107, | |
| "rewards/cosine_scaled_reward": 0.4398818574845791, | |
| "rewards/format_reward": 0.2638888992369175, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1211.0694427490234, | |
| "epoch": 0.13970430683522606, | |
| "grad_norm": 0.7765697240829468, | |
| "kl": 0.0682373046875, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.403, | |
| "reward": 1.4794435054063797, | |
| "reward_std": 0.9502733200788498, | |
| "rewards/cosine_scaled_reward": 0.5383328944444656, | |
| "rewards/format_reward": 0.4027777798473835, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1754.9583435058594, | |
| "epoch": 0.14056138847225197, | |
| "grad_norm": 0.484488308429718, | |
| "kl": 0.046142578125, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0821, | |
| "reward": 0.8094066381454468, | |
| "reward_std": 0.9717631787061691, | |
| "rewards/cosine_scaled_reward": 0.2241477482020855, | |
| "rewards/format_reward": 0.3611111231148243, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1594.3472290039062, | |
| "epoch": 0.1414184701092779, | |
| "grad_norm": 0.40359950065612793, | |
| "kl": 0.063720703125, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.3409, | |
| "reward": 0.8673667535185814, | |
| "reward_std": 0.8468609303236008, | |
| "rewards/cosine_scaled_reward": 0.30868337862193584, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1543.9027404785156, | |
| "epoch": 0.14227555174630382, | |
| "grad_norm": 0.4297288656234741, | |
| "kl": 0.07159423828125, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.1784, | |
| "reward": 1.0218196213245392, | |
| "reward_std": 0.8119381964206696, | |
| "rewards/cosine_scaled_reward": 0.3650764860212803, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1453.4722290039062, | |
| "epoch": 0.14313263338332977, | |
| "grad_norm": 0.6794213652610779, | |
| "kl": 0.08380126953125, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.304, | |
| "reward": 1.2251211404800415, | |
| "reward_std": 0.7566522508859634, | |
| "rewards/cosine_scaled_reward": 0.43200499936938286, | |
| "rewards/format_reward": 0.3611111231148243, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1612.1527862548828, | |
| "epoch": 0.14398971502035568, | |
| "grad_norm": 0.6980922222137451, | |
| "kl": 0.05419921875, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.1675, | |
| "reward": 0.8015574552118778, | |
| "reward_std": 0.9579913914203644, | |
| "rewards/cosine_scaled_reward": 0.24800091050565243, | |
| "rewards/format_reward": 0.30555556528270245, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1692.9028015136719, | |
| "epoch": 0.14484679665738162, | |
| "grad_norm": 0.8601597547531128, | |
| "kl": 0.095458984375, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.1693, | |
| "reward": 0.3856995478272438, | |
| "reward_std": 0.7862947285175323, | |
| "rewards/cosine_scaled_reward": 0.012294212356209755, | |
| "rewards/format_reward": 0.361111119389534, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1859.0416870117188, | |
| "epoch": 0.14570387829440754, | |
| "grad_norm": 0.4334951937198639, | |
| "kl": 0.05572509765625, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.1955, | |
| "reward": 0.8502315804362297, | |
| "reward_std": 0.9405944645404816, | |
| "rewards/cosine_scaled_reward": 0.25844913721084595, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1359.763916015625, | |
| "epoch": 0.14656095993143348, | |
| "grad_norm": 1.0471651554107666, | |
| "kl": 0.1002197265625, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.4846, | |
| "reward": 0.9384964210912585, | |
| "reward_std": 0.8249912112951279, | |
| "rewards/cosine_scaled_reward": 0.274803776293993, | |
| "rewards/format_reward": 0.3888888955116272, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1483.1250305175781, | |
| "epoch": 0.1474180415684594, | |
| "grad_norm": 0.46483904123306274, | |
| "kl": 0.07452392578125, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.1276, | |
| "reward": 1.1911370605230331, | |
| "reward_std": 0.7849500328302383, | |
| "rewards/cosine_scaled_reward": 0.4289018586277962, | |
| "rewards/format_reward": 0.33333333767950535, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1719.3750305175781, | |
| "epoch": 0.14827512320548533, | |
| "grad_norm": 0.6636127233505249, | |
| "kl": 0.08563232421875, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.3273, | |
| "reward": 0.640670370310545, | |
| "reward_std": 0.7593775987625122, | |
| "rewards/cosine_scaled_reward": 0.13977964222431183, | |
| "rewards/format_reward": 0.3611111231148243, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1732.0000305175781, | |
| "epoch": 0.14913220484251125, | |
| "grad_norm": 1.571647047996521, | |
| "kl": 0.093109130859375, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.1424, | |
| "reward": 0.7180789969861507, | |
| "reward_std": 0.7864874973893166, | |
| "rewards/cosine_scaled_reward": 0.18542836606502533, | |
| "rewards/format_reward": 0.3472222276031971, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1991.75, | |
| "epoch": 0.1499892864795372, | |
| "grad_norm": 0.5590563416481018, | |
| "kl": 0.0755615234375, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.1139, | |
| "reward": 0.528855599462986, | |
| "reward_std": 0.9861510694026947, | |
| "rewards/cosine_scaled_reward": 0.09081667335703969, | |
| "rewards/format_reward": 0.3472222276031971, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1684.8750305175781, | |
| "epoch": 0.1508463681165631, | |
| "grad_norm": 0.4577758014202118, | |
| "kl": 0.0635986328125, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.2379, | |
| "reward": 0.7251470182090998, | |
| "reward_std": 0.8135327100753784, | |
| "rewards/cosine_scaled_reward": 0.1959068402647972, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1634.75, | |
| "epoch": 0.15170344975358904, | |
| "grad_norm": 1.0375832319259644, | |
| "kl": 0.09271240234375, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.24, | |
| "reward": 0.805720079690218, | |
| "reward_std": 0.8198679685592651, | |
| "rewards/cosine_scaled_reward": 0.1806377861648798, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1334.4583740234375, | |
| "epoch": 0.15256053139061496, | |
| "grad_norm": 1.7115483283996582, | |
| "kl": 0.14361572265625, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": -0.0013, | |
| "reward": 0.6213032007217407, | |
| "reward_std": 0.733954668045044, | |
| "rewards/cosine_scaled_reward": 0.19954046607017517, | |
| "rewards/format_reward": 0.2222222276031971, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1860.3056030273438, | |
| "epoch": 0.15341761302764087, | |
| "grad_norm": 0.5624024271965027, | |
| "kl": 0.071868896484375, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.074, | |
| "reward": 0.5974816232919693, | |
| "reward_std": 0.7972533106803894, | |
| "rewards/cosine_scaled_reward": 0.14596302818972617, | |
| "rewards/format_reward": 0.305555559694767, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1709.4583435058594, | |
| "epoch": 0.1542746946646668, | |
| "grad_norm": 1.1149603128433228, | |
| "kl": 0.0904541015625, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.1984, | |
| "reward": 0.7268609385937452, | |
| "reward_std": 0.8206999450922012, | |
| "rewards/cosine_scaled_reward": 0.21065270341932774, | |
| "rewards/format_reward": 0.3055555634200573, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1504.4861145019531, | |
| "epoch": 0.15513177630169273, | |
| "grad_norm": 0.8208626508712769, | |
| "kl": 0.10107421875, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.2409, | |
| "reward": 0.8135174959897995, | |
| "reward_std": 0.8603949248790741, | |
| "rewards/cosine_scaled_reward": 0.2192587312310934, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1832.2221984863281, | |
| "epoch": 0.15598885793871867, | |
| "grad_norm": 0.7162370681762695, | |
| "kl": 0.138916015625, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0797, | |
| "reward": 0.7859554402530193, | |
| "reward_std": 0.8516587615013123, | |
| "rewards/cosine_scaled_reward": 0.24019994214177132, | |
| "rewards/format_reward": 0.3055555634200573, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1286.0000305175781, | |
| "epoch": 0.15684593957574458, | |
| "grad_norm": 2.3148720264434814, | |
| "kl": 0.1636962890625, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.2503, | |
| "reward": 0.9098443686962128, | |
| "reward_std": 0.9491814821958542, | |
| "rewards/cosine_scaled_reward": 0.2604777254164219, | |
| "rewards/format_reward": 0.3888888955116272, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1429.7361450195312, | |
| "epoch": 0.15770302121277052, | |
| "grad_norm": 0.5743198990821838, | |
| "kl": 0.084716796875, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": -0.0243, | |
| "reward": 0.716302827000618, | |
| "reward_std": 0.9275215268135071, | |
| "rewards/cosine_scaled_reward": 0.17759587243199348, | |
| "rewards/format_reward": 0.361111119389534, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1653.9444885253906, | |
| "epoch": 0.15856010284979644, | |
| "grad_norm": 1.1362574100494385, | |
| "kl": 0.10540771484375, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.152, | |
| "reward": 0.4382926889229566, | |
| "reward_std": 0.8508107364177704, | |
| "rewards/cosine_scaled_reward": 0.017757446970790625, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 927.8750152587891, | |
| "epoch": 0.15941718448682238, | |
| "grad_norm": 2.538083553314209, | |
| "kl": 0.1431884765625, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.1705, | |
| "reward": 1.315717488527298, | |
| "reward_std": 0.922233521938324, | |
| "rewards/cosine_scaled_reward": 0.44258103519678116, | |
| "rewards/format_reward": 0.4305555671453476, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1182.6666564941406, | |
| "epoch": 0.1602742661238483, | |
| "grad_norm": 1.7073755264282227, | |
| "kl": 0.180908203125, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.2743, | |
| "reward": 1.043900977820158, | |
| "reward_std": 0.8111362755298615, | |
| "rewards/cosine_scaled_reward": 0.36917273700237274, | |
| "rewards/format_reward": 0.305555559694767, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 922.5416641235352, | |
| "epoch": 0.16113134776087423, | |
| "grad_norm": 1.0958812236785889, | |
| "kl": 0.1793212890625, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.4721, | |
| "reward": 0.8457982540130615, | |
| "reward_std": 1.025609239935875, | |
| "rewards/cosine_scaled_reward": 0.23539912048727274, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1749.4583740234375, | |
| "epoch": 0.16198842939790015, | |
| "grad_norm": 0.6942291259765625, | |
| "kl": 0.0941162109375, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0101, | |
| "reward": 0.9162072837352753, | |
| "reward_std": 0.9123063534498215, | |
| "rewards/cosine_scaled_reward": 0.24282585456967354, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1193.9861297607422, | |
| "epoch": 0.1628455110349261, | |
| "grad_norm": 1.193941354751587, | |
| "kl": 0.1387939453125, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.5036, | |
| "reward": 1.0086410311050713, | |
| "reward_std": 0.8937728404998779, | |
| "rewards/cosine_scaled_reward": 0.3237649239599705, | |
| "rewards/format_reward": 0.361111119389534, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1352.2638854980469, | |
| "epoch": 0.163702592671952, | |
| "grad_norm": 0.7374529242515564, | |
| "kl": 0.163818359375, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.2226, | |
| "reward": 0.4831864982843399, | |
| "reward_std": 0.8398203700780869, | |
| "rewards/cosine_scaled_reward": 0.08881546184420586, | |
| "rewards/format_reward": 0.3055555634200573, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1205.0278015136719, | |
| "epoch": 0.16455967430897794, | |
| "grad_norm": 0.6933061480522156, | |
| "kl": 0.205322265625, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.1316, | |
| "reward": 1.1680071130394936, | |
| "reward_std": 0.7614214420318604, | |
| "rewards/cosine_scaled_reward": 0.3687257831916213, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 979.9166564941406, | |
| "epoch": 0.16541675594600386, | |
| "grad_norm": 1.3442184925079346, | |
| "kl": 0.16552734375, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.4192, | |
| "reward": 1.7566750347614288, | |
| "reward_std": 0.8575289100408554, | |
| "rewards/cosine_scaled_reward": 0.6700042113661766, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 877.2916870117188, | |
| "epoch": 0.16627383758302977, | |
| "grad_norm": 1.4500232934951782, | |
| "kl": 0.2979736328125, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.3993, | |
| "reward": 0.9797601252794266, | |
| "reward_std": 0.8100396543741226, | |
| "rewards/cosine_scaled_reward": 0.2954356314148754, | |
| "rewards/format_reward": 0.3888889029622078, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1305.9028015136719, | |
| "epoch": 0.1671309192200557, | |
| "grad_norm": 2.036522150039673, | |
| "kl": 0.154541015625, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.4356, | |
| "reward": 0.4684947496280074, | |
| "reward_std": 0.6030187755823135, | |
| "rewards/cosine_scaled_reward": 0.025914038997143507, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1123.9027709960938, | |
| "epoch": 0.16798800085708163, | |
| "grad_norm": 2.62882137298584, | |
| "kl": 0.313232421875, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.1435, | |
| "reward": 0.6905184164643288, | |
| "reward_std": 0.7359469905495644, | |
| "rewards/cosine_scaled_reward": 0.1508147695567459, | |
| "rewards/format_reward": 0.3888888992369175, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 882.1805572509766, | |
| "epoch": 0.16884508249410757, | |
| "grad_norm": 1.288465976715088, | |
| "kl": 0.25537109375, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.4079, | |
| "reward": 1.1543247550725937, | |
| "reward_std": 0.9211147129535675, | |
| "rewards/cosine_scaled_reward": 0.41744012013077736, | |
| "rewards/format_reward": 0.31944445334374905, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1125.5555877685547, | |
| "epoch": 0.16970216413113348, | |
| "grad_norm": 1.6123173236846924, | |
| "kl": 0.3720703125, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.4226, | |
| "reward": 0.7712806100025773, | |
| "reward_std": 0.8221293687820435, | |
| "rewards/cosine_scaled_reward": 0.17036251351237297, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1190.1666717529297, | |
| "epoch": 0.17055924576815942, | |
| "grad_norm": 1.612281084060669, | |
| "kl": 0.4462890625, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.2878, | |
| "reward": 0.7899716692045331, | |
| "reward_std": 0.8678261786699295, | |
| "rewards/cosine_scaled_reward": 0.23526360094547272, | |
| "rewards/format_reward": 0.3194444514811039, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 962.3888854980469, | |
| "epoch": 0.17141632740518534, | |
| "grad_norm": 1.62675940990448, | |
| "kl": 0.43115234375, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.2341, | |
| "reward": 0.8529508542269468, | |
| "reward_std": 0.890992283821106, | |
| "rewards/cosine_scaled_reward": 0.25286430679261684, | |
| "rewards/format_reward": 0.3472222313284874, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 556.6250190734863, | |
| "epoch": 0.17227340904221128, | |
| "grad_norm": 1.815177083015442, | |
| "kl": 0.45654296875, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.4604, | |
| "reward": 1.0454039722681046, | |
| "reward_std": 0.7526903375983238, | |
| "rewards/cosine_scaled_reward": 0.3560353182256222, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1524.1944580078125, | |
| "epoch": 0.1731304906792372, | |
| "grad_norm": 1.6689302921295166, | |
| "kl": 0.484375, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.1883, | |
| "reward": 0.7762234956026077, | |
| "reward_std": 0.7869797348976135, | |
| "rewards/cosine_scaled_reward": 0.21450063399970531, | |
| "rewards/format_reward": 0.3472222276031971, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 933.7361145019531, | |
| "epoch": 0.17398757231626313, | |
| "grad_norm": 1.4884265661239624, | |
| "kl": 0.51171875, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.5092, | |
| "reward": 1.1106317043304443, | |
| "reward_std": 0.9267386496067047, | |
| "rewards/cosine_scaled_reward": 0.34698252752423286, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1146.9583282470703, | |
| "epoch": 0.17484465395328905, | |
| "grad_norm": 2.8435497283935547, | |
| "kl": 0.5849609375, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.3038, | |
| "reward": 0.4783020354807377, | |
| "reward_std": 0.7846653908491135, | |
| "rewards/cosine_scaled_reward": 0.12803990789689124, | |
| "rewards/format_reward": 0.22222222946584225, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 911.7639007568359, | |
| "epoch": 0.175701735590315, | |
| "grad_norm": 2.8843140602111816, | |
| "kl": 0.41357421875, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.2368, | |
| "reward": 0.8713492751121521, | |
| "reward_std": 0.9335136562585831, | |
| "rewards/cosine_scaled_reward": 0.23428576067090034, | |
| "rewards/format_reward": 0.4027777872979641, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 700.7361068725586, | |
| "epoch": 0.1765588172273409, | |
| "grad_norm": 2.3919317722320557, | |
| "kl": 0.548828125, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.1555, | |
| "reward": 0.5804239325225353, | |
| "reward_std": 0.8797028362751007, | |
| "rewards/cosine_scaled_reward": 0.13048974284902215, | |
| "rewards/format_reward": 0.3194444514811039, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1057.2916870117188, | |
| "epoch": 0.17741589886436684, | |
| "grad_norm": 2.6160635948181152, | |
| "kl": 0.60546875, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.3724, | |
| "reward": 0.591970931738615, | |
| "reward_std": 0.928965374827385, | |
| "rewards/cosine_scaled_reward": 0.13626324571669102, | |
| "rewards/format_reward": 0.3194444552063942, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1054.2222442626953, | |
| "epoch": 0.17827298050139276, | |
| "grad_norm": 4.055714130401611, | |
| "kl": 0.6064453125, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.1023, | |
| "reward": 0.5747384652495384, | |
| "reward_std": 0.835850402712822, | |
| "rewards/cosine_scaled_reward": 0.13459146209061146, | |
| "rewards/format_reward": 0.3055555634200573, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 672.6527862548828, | |
| "epoch": 0.17913006213841867, | |
| "grad_norm": 2.89412260055542, | |
| "kl": 0.77734375, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.2746, | |
| "reward": 0.5216602731961757, | |
| "reward_std": 0.8375790268182755, | |
| "rewards/cosine_scaled_reward": 0.1080523431301117, | |
| "rewards/format_reward": 0.3055555671453476, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1029.6527709960938, | |
| "epoch": 0.1799871437754446, | |
| "grad_norm": 2.079103469848633, | |
| "kl": 0.5517578125, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.3516, | |
| "reward": 0.3450094065628946, | |
| "reward_std": 0.6973802000284195, | |
| "rewards/cosine_scaled_reward": 0.03361581452190876, | |
| "rewards/format_reward": 0.2777777835726738, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 815.6805725097656, | |
| "epoch": 0.18084422541247053, | |
| "grad_norm": 2.8681375980377197, | |
| "kl": 0.53173828125, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.4467, | |
| "reward": 0.2037402605637908, | |
| "reward_std": 0.7096255868673325, | |
| "rewards/cosine_scaled_reward": -0.050907641649246216, | |
| "rewards/format_reward": 0.30555556155741215, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1077.4166793823242, | |
| "epoch": 0.18170130704949647, | |
| "grad_norm": 1.8894522190093994, | |
| "kl": 0.513671875, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.1559, | |
| "reward": 0.2127110045403242, | |
| "reward_std": 0.7653373330831528, | |
| "rewards/cosine_scaled_reward": -0.018644492141902447, | |
| "rewards/format_reward": 0.2500000046566129, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 760.8888702392578, | |
| "epoch": 0.18255838868652238, | |
| "grad_norm": 2.848653793334961, | |
| "kl": 0.6552734375, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.257, | |
| "reward": 0.4657673854380846, | |
| "reward_std": 0.7699891328811646, | |
| "rewards/cosine_scaled_reward": 0.09399479907006025, | |
| "rewards/format_reward": 0.2777777872979641, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1346.2916870117188, | |
| "epoch": 0.18341547032354832, | |
| "grad_norm": 3.975214958190918, | |
| "kl": 0.53173828125, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.2101, | |
| "reward": 0.010750308400020003, | |
| "reward_std": 0.7175936102867126, | |
| "rewards/cosine_scaled_reward": -0.1265692890738137, | |
| "rewards/format_reward": 0.2638888955116272, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 903.7500152587891, | |
| "epoch": 0.18427255196057424, | |
| "grad_norm": 1.7518130540847778, | |
| "kl": 0.478515625, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.4712, | |
| "reward": 0.6189166195690632, | |
| "reward_std": 0.7413264513015747, | |
| "rewards/cosine_scaled_reward": 0.18445828184485435, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 569.4166717529297, | |
| "epoch": 0.18512963359760018, | |
| "grad_norm": 2.782475709915161, | |
| "kl": 0.609375, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.1972, | |
| "reward": 0.7244105041027069, | |
| "reward_std": 0.7233957052230835, | |
| "rewards/cosine_scaled_reward": 0.12609414962935261, | |
| "rewards/format_reward": 0.4722222238779068, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 753.7361145019531, | |
| "epoch": 0.1859867152346261, | |
| "grad_norm": 3.62007212638855, | |
| "kl": 0.5966796875, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.1095, | |
| "reward": 0.7907614503055811, | |
| "reward_std": 0.9498309046030045, | |
| "rewards/cosine_scaled_reward": 0.2564918287098408, | |
| "rewards/format_reward": 0.2777777872979641, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 846.0694427490234, | |
| "epoch": 0.18684379687165203, | |
| "grad_norm": 6.74003791809082, | |
| "kl": 0.56884765625, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.369, | |
| "reward": 0.3798181489109993, | |
| "reward_std": 0.7759093195199966, | |
| "rewards/cosine_scaled_reward": 0.002409084467217326, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 657.1527709960938, | |
| "epoch": 0.18770087850867795, | |
| "grad_norm": 8.006071090698242, | |
| "kl": 0.64794921875, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.39, | |
| "reward": 0.6195143777877092, | |
| "reward_std": 0.9361841827630997, | |
| "rewards/cosine_scaled_reward": 0.13614607648923993, | |
| "rewards/format_reward": 0.3472222313284874, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 488.31944274902344, | |
| "epoch": 0.1885579601457039, | |
| "grad_norm": 6.052534580230713, | |
| "kl": 0.662109375, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.4044, | |
| "reward": 0.591563917696476, | |
| "reward_std": 0.9269620776176453, | |
| "rewards/cosine_scaled_reward": 0.13605972938239574, | |
| "rewards/format_reward": 0.3194444552063942, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 640.6805648803711, | |
| "epoch": 0.1894150417827298, | |
| "grad_norm": 3.168754816055298, | |
| "kl": 0.8017578125, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.3799, | |
| "reward": 0.4200323410332203, | |
| "reward_std": 0.8377434760332108, | |
| "rewards/cosine_scaled_reward": 0.09890506649389863, | |
| "rewards/format_reward": 0.2222222276031971, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 715.4305572509766, | |
| "epoch": 0.19027212341975575, | |
| "grad_norm": 3.2562437057495117, | |
| "kl": 0.841796875, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.1672, | |
| "reward": 0.4836801737546921, | |
| "reward_std": 0.7787017673254013, | |
| "rewards/cosine_scaled_reward": 0.09600674454122782, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 603.6527786254883, | |
| "epoch": 0.19112920505678166, | |
| "grad_norm": 4.294973373413086, | |
| "kl": 0.8857421875, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.3874, | |
| "reward": 0.41634054901078343, | |
| "reward_std": 0.7685736864805222, | |
| "rewards/cosine_scaled_reward": 0.06233693804824725, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 615.4027786254883, | |
| "epoch": 0.19198628669380757, | |
| "grad_norm": 5.623297691345215, | |
| "kl": 0.9609375, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.2779, | |
| "reward": 0.8603571616113186, | |
| "reward_std": 0.9389389455318451, | |
| "rewards/cosine_scaled_reward": 0.2426785994321108, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 590.1666870117188, | |
| "epoch": 0.19284336833083351, | |
| "grad_norm": 9.251201629638672, | |
| "kl": 0.970703125, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.3814, | |
| "reward": 0.4910267172381282, | |
| "reward_std": 0.7892083153128624, | |
| "rewards/cosine_scaled_reward": 0.09968002699315548, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 364.7916679382324, | |
| "epoch": 0.19370044996785943, | |
| "grad_norm": 3.1022725105285645, | |
| "kl": 0.87890625, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.2221, | |
| "reward": 0.5126173943281174, | |
| "reward_std": 0.6831100434064865, | |
| "rewards/cosine_scaled_reward": 0.12436424475163221, | |
| "rewards/format_reward": 0.2638888992369175, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 616.5555572509766, | |
| "epoch": 0.19455753160488537, | |
| "grad_norm": 2.3377246856689453, | |
| "kl": 0.9384765625, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.3176, | |
| "reward": 0.5720387771725655, | |
| "reward_std": 0.6217157021164894, | |
| "rewards/cosine_scaled_reward": 0.1054638409987092, | |
| "rewards/format_reward": 0.361111119389534, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 540.0972061157227, | |
| "epoch": 0.19541461324191128, | |
| "grad_norm": 6.154067039489746, | |
| "kl": 0.8564453125, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.4069, | |
| "reward": 0.43204435613006353, | |
| "reward_std": 0.7807599157094955, | |
| "rewards/cosine_scaled_reward": 0.08407774195075035, | |
| "rewards/format_reward": 0.2638888955116272, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.2916717529297, | |
| "epoch": 0.19627169487893723, | |
| "grad_norm": 2.6342246532440186, | |
| "kl": 0.923828125, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.2841, | |
| "reward": 0.6314438227564096, | |
| "reward_std": 0.8878332078456879, | |
| "rewards/cosine_scaled_reward": 0.13516635773703456, | |
| "rewards/format_reward": 0.361111119389534, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 488.7638854980469, | |
| "epoch": 0.19712877651596314, | |
| "grad_norm": 2.928675651550293, | |
| "kl": 1.046875, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.2236, | |
| "reward": 0.36511653289198875, | |
| "reward_std": 0.7729461342096329, | |
| "rewards/cosine_scaled_reward": 0.029780485958326608, | |
| "rewards/format_reward": 0.3055555634200573, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 419.2638931274414, | |
| "epoch": 0.19798585815298908, | |
| "grad_norm": 3.3741118907928467, | |
| "kl": 0.9599609375, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.2546, | |
| "reward": 0.7524889260530472, | |
| "reward_std": 0.970270186662674, | |
| "rewards/cosine_scaled_reward": 0.20957778953015804, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 457.93055725097656, | |
| "epoch": 0.198842939790015, | |
| "grad_norm": 3.406334638595581, | |
| "kl": 1.1494140625, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.3246, | |
| "reward": 0.9599852412939072, | |
| "reward_std": 0.8014604300260544, | |
| "rewards/cosine_scaled_reward": 0.29943707399070263, | |
| "rewards/format_reward": 0.3611111156642437, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 593.0833358764648, | |
| "epoch": 0.19970002142704094, | |
| "grad_norm": 2.4271864891052246, | |
| "kl": 1.279296875, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.3557, | |
| "reward": 0.4210415966808796, | |
| "reward_std": 0.6882978901267052, | |
| "rewards/cosine_scaled_reward": 0.10635412717238069, | |
| "rewards/format_reward": 0.20833333861082792, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 488.9583435058594, | |
| "epoch": 0.20055710306406685, | |
| "grad_norm": 2.523871660232544, | |
| "kl": 1.107421875, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.4558, | |
| "reward": 0.8532587476074696, | |
| "reward_std": 0.8488901779055595, | |
| "rewards/cosine_scaled_reward": 0.22524047270417213, | |
| "rewards/format_reward": 0.4027777872979641, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 436.6805725097656, | |
| "epoch": 0.2014141847010928, | |
| "grad_norm": 4.82633113861084, | |
| "kl": 0.9912109375, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.2315, | |
| "reward": 0.7889588698744774, | |
| "reward_std": 0.942963719367981, | |
| "rewards/cosine_scaled_reward": 0.2069794237613678, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 454.0416793823242, | |
| "epoch": 0.2022712663381187, | |
| "grad_norm": 7.737008094787598, | |
| "kl": 1.0390625, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.4408, | |
| "reward": 0.2776918327435851, | |
| "reward_std": 0.6808639168739319, | |
| "rewards/cosine_scaled_reward": -4.296889528632164e-05, | |
| "rewards/format_reward": 0.2777777835726738, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 353.5138931274414, | |
| "epoch": 0.20312834797514465, | |
| "grad_norm": 10.61968994140625, | |
| "kl": 0.990234375, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.3745, | |
| "reward": 0.9284175038337708, | |
| "reward_std": 0.9358415603637695, | |
| "rewards/cosine_scaled_reward": 0.3322643097490072, | |
| "rewards/format_reward": 0.26388889364898205, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 489.4722213745117, | |
| "epoch": 0.20398542961217056, | |
| "grad_norm": 5.041754245758057, | |
| "kl": 1.13671875, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.3936, | |
| "reward": 0.8772722482681274, | |
| "reward_std": 1.1123964041471481, | |
| "rewards/cosine_scaled_reward": 0.2719694413244724, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 384.00000762939453, | |
| "epoch": 0.20484251124919647, | |
| "grad_norm": 3.059415817260742, | |
| "kl": 1.005859375, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.3146, | |
| "reward": 0.7366920709609985, | |
| "reward_std": 0.9259907156229019, | |
| "rewards/cosine_scaled_reward": 0.22945713996887207, | |
| "rewards/format_reward": 0.2777777872979641, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 419.1388931274414, | |
| "epoch": 0.20569959288622242, | |
| "grad_norm": 10.234456062316895, | |
| "kl": 1.1953125, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.3922, | |
| "reward": 0.6590520106256008, | |
| "reward_std": 0.7455599829554558, | |
| "rewards/cosine_scaled_reward": 0.14202599972486496, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 401.2638854980469, | |
| "epoch": 0.20655667452324833, | |
| "grad_norm": 2.2838265895843506, | |
| "kl": 1.1591796875, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.2743, | |
| "reward": 0.7352767586708069, | |
| "reward_std": 0.8139046281576157, | |
| "rewards/cosine_scaled_reward": 0.13847169885411859, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 407.9166793823242, | |
| "epoch": 0.20741375616027427, | |
| "grad_norm": 4.2408246994018555, | |
| "kl": 0.982421875, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.2713, | |
| "reward": 0.9523800164461136, | |
| "reward_std": 1.001899242401123, | |
| "rewards/cosine_scaled_reward": 0.28869001008570194, | |
| "rewards/format_reward": 0.375, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 439.652774810791, | |
| "epoch": 0.20827083779730018, | |
| "grad_norm": 5.787041664123535, | |
| "kl": 1.49609375, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.1675, | |
| "reward": 0.5723136551678181, | |
| "reward_std": 0.7788431346416473, | |
| "rewards/cosine_scaled_reward": 0.15421238262206316, | |
| "rewards/format_reward": 0.2638888955116272, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 398.08333587646484, | |
| "epoch": 0.20912791943432613, | |
| "grad_norm": 2.507175922393799, | |
| "kl": 1.6328125, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.3818, | |
| "reward": 1.0151595324277878, | |
| "reward_std": 1.0486897379159927, | |
| "rewards/cosine_scaled_reward": 0.285357553511858, | |
| "rewards/format_reward": 0.4444444514811039, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 364.277774810791, | |
| "epoch": 0.20998500107135204, | |
| "grad_norm": 8.139939308166504, | |
| "kl": 1.345703125, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.1818, | |
| "reward": 0.7046700529754162, | |
| "reward_std": 0.7369572669267654, | |
| "rewards/cosine_scaled_reward": 0.16483502835035324, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 460.98612213134766, | |
| "epoch": 0.21084208270837798, | |
| "grad_norm": 2.476030111312866, | |
| "kl": 1.345703125, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.3358, | |
| "reward": 0.342040394898504, | |
| "reward_std": 0.691289097070694, | |
| "rewards/cosine_scaled_reward": 0.07379795983433723, | |
| "rewards/format_reward": 0.19444444961845875, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 246.83333587646484, | |
| "epoch": 0.2116991643454039, | |
| "grad_norm": 6.393570899963379, | |
| "kl": 1.162109375, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.2145, | |
| "reward": 1.1625754237174988, | |
| "reward_std": 0.9642776250839233, | |
| "rewards/cosine_scaled_reward": 0.3937877155840397, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 441.25000762939453, | |
| "epoch": 0.21255624598242984, | |
| "grad_norm": 2.3917956352233887, | |
| "kl": 1.556640625, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.3667, | |
| "reward": 0.23520513158291578, | |
| "reward_std": 0.5816171392798424, | |
| "rewards/cosine_scaled_reward": 0.00649144034832716, | |
| "rewards/format_reward": 0.2222222313284874, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 406.94444274902344, | |
| "epoch": 0.21341332761945575, | |
| "grad_norm": 2.021603584289551, | |
| "kl": 1.32421875, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.3835, | |
| "reward": 0.6145300641655922, | |
| "reward_std": 0.8172438591718674, | |
| "rewards/cosine_scaled_reward": 0.0919872522354126, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 545.9027786254883, | |
| "epoch": 0.2142704092564817, | |
| "grad_norm": 5.071481227874756, | |
| "kl": 1.462890625, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.3667, | |
| "reward": 0.5654324060305953, | |
| "reward_std": 0.7506130635738373, | |
| "rewards/cosine_scaled_reward": 0.12299397867172956, | |
| "rewards/format_reward": 0.3194444514811039, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 295.27778244018555, | |
| "epoch": 0.2151274908935076, | |
| "grad_norm": 6.028750896453857, | |
| "kl": 1.396484375, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.2881, | |
| "reward": 0.6139978468418121, | |
| "reward_std": 0.8909667134284973, | |
| "rewards/cosine_scaled_reward": 0.09866558946669102, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 455.0138854980469, | |
| "epoch": 0.21598457253053352, | |
| "grad_norm": 5.143624305725098, | |
| "kl": 1.3984375, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.2564, | |
| "reward": 0.8803071463480592, | |
| "reward_std": 0.8127338886260986, | |
| "rewards/cosine_scaled_reward": 0.23182022757828236, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 319.5833396911621, | |
| "epoch": 0.21684165416755946, | |
| "grad_norm": 2.218837261199951, | |
| "kl": 1.318359375, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.2489, | |
| "reward": 0.7824475020170212, | |
| "reward_std": 0.8558803498744965, | |
| "rewards/cosine_scaled_reward": 0.22455710358917713, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 351.58333587646484, | |
| "epoch": 0.21769873580458537, | |
| "grad_norm": 2.8312885761260986, | |
| "kl": 1.45703125, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.329, | |
| "reward": 0.6108426973223686, | |
| "reward_std": 0.7599765211343765, | |
| "rewards/cosine_scaled_reward": 0.14569912757724524, | |
| "rewards/format_reward": 0.3194444477558136, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 357.66666412353516, | |
| "epoch": 0.21855581744161132, | |
| "grad_norm": 3.159221649169922, | |
| "kl": 1.310546875, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.3418, | |
| "reward": 0.8543988540768623, | |
| "reward_std": 0.8301258683204651, | |
| "rewards/cosine_scaled_reward": 0.22581054456532001, | |
| "rewards/format_reward": 0.4027777798473835, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 434.9166679382324, | |
| "epoch": 0.21941289907863723, | |
| "grad_norm": 2.6879823207855225, | |
| "kl": 1.517578125, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.2982, | |
| "reward": 0.6386940572410822, | |
| "reward_std": 0.7044311463832855, | |
| "rewards/cosine_scaled_reward": 0.13879146426916122, | |
| "rewards/format_reward": 0.3611111231148243, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 361.11112213134766, | |
| "epoch": 0.22026998071566317, | |
| "grad_norm": 3.5288994312286377, | |
| "kl": 1.27734375, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.2277, | |
| "reward": 0.813978984951973, | |
| "reward_std": 0.7525499165058136, | |
| "rewards/cosine_scaled_reward": 0.26810058392584324, | |
| "rewards/format_reward": 0.2777777835726738, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 364.84722900390625, | |
| "epoch": 0.22112706235268909, | |
| "grad_norm": 2.3899548053741455, | |
| "kl": 1.326171875, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.2684, | |
| "reward": 0.7509399205446243, | |
| "reward_std": 0.7508059442043304, | |
| "rewards/cosine_scaled_reward": 0.20880330353975296, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 361.50000762939453, | |
| "epoch": 0.22198414398971503, | |
| "grad_norm": 5.087327480316162, | |
| "kl": 1.62109375, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.3756, | |
| "reward": 0.9518274813890457, | |
| "reward_std": 0.8851035535335541, | |
| "rewards/cosine_scaled_reward": 0.29535816609859467, | |
| "rewards/format_reward": 0.36111112125217915, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 335.4027786254883, | |
| "epoch": 0.22284122562674094, | |
| "grad_norm": 2.6951775550842285, | |
| "kl": 1.43359375, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.2947, | |
| "reward": 0.5025412552058697, | |
| "reward_std": 0.7434158027172089, | |
| "rewards/cosine_scaled_reward": 0.09154839906841516, | |
| "rewards/format_reward": 0.3194444552063942, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 342.4861068725586, | |
| "epoch": 0.22369830726376688, | |
| "grad_norm": 3.2796826362609863, | |
| "kl": 1.1669921875, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.3505, | |
| "reward": 0.5631570406258106, | |
| "reward_std": 0.8579424917697906, | |
| "rewards/cosine_scaled_reward": 0.09407851565629244, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 319.04166412353516, | |
| "epoch": 0.2245553889007928, | |
| "grad_norm": 5.055459499359131, | |
| "kl": 1.193359375, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.2438, | |
| "reward": 0.5839212201535702, | |
| "reward_std": 0.797115832567215, | |
| "rewards/cosine_scaled_reward": 0.111405044561252, | |
| "rewards/format_reward": 0.3611111156642437, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 287.4166679382324, | |
| "epoch": 0.22541247053781874, | |
| "grad_norm": 4.351484775543213, | |
| "kl": 1.130859375, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.2364, | |
| "reward": 0.8294338285923004, | |
| "reward_std": 0.9452664703130722, | |
| "rewards/cosine_scaled_reward": 0.29666137136518955, | |
| "rewards/format_reward": 0.2361111156642437, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 344.6111030578613, | |
| "epoch": 0.22626955217484465, | |
| "grad_norm": 3.6436305046081543, | |
| "kl": 1.2177734375, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.2119, | |
| "reward": 0.9100049883127213, | |
| "reward_std": 0.9092362821102142, | |
| "rewards/cosine_scaled_reward": 0.2744469365570694, | |
| "rewards/format_reward": 0.3611111156642437, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 397.94445037841797, | |
| "epoch": 0.2271266338118706, | |
| "grad_norm": 4.111161231994629, | |
| "kl": 1.228515625, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.3083, | |
| "reward": 0.4675387665629387, | |
| "reward_std": 0.7649587690830231, | |
| "rewards/cosine_scaled_reward": 0.025436056777834892, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 321.83333587646484, | |
| "epoch": 0.2279837154488965, | |
| "grad_norm": 4.648217678070068, | |
| "kl": 1.2587890625, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.2023, | |
| "reward": 0.773270171135664, | |
| "reward_std": 0.7938476204872131, | |
| "rewards/cosine_scaled_reward": 0.18524618819355965, | |
| "rewards/format_reward": 0.4027777798473835, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 437.41666412353516, | |
| "epoch": 0.22884079708592242, | |
| "grad_norm": 5.6152472496032715, | |
| "kl": 1.37109375, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.2184, | |
| "reward": 0.5861554071307182, | |
| "reward_std": 0.7004242539405823, | |
| "rewards/cosine_scaled_reward": 0.09168882109224796, | |
| "rewards/format_reward": 0.4027777910232544, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 274.86111068725586, | |
| "epoch": 0.22969787872294836, | |
| "grad_norm": 3.5720877647399902, | |
| "kl": 1.1787109375, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.1723, | |
| "reward": 0.8904364705085754, | |
| "reward_std": 0.8545982241630554, | |
| "rewards/cosine_scaled_reward": 0.22299600392580032, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 296.2638931274414, | |
| "epoch": 0.23055496035997428, | |
| "grad_norm": 6.410920143127441, | |
| "kl": 1.08203125, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.1769, | |
| "reward": 0.7688810527324677, | |
| "reward_std": 1.0249820053577423, | |
| "rewards/cosine_scaled_reward": 0.17610719101503491, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 297.98611068725586, | |
| "epoch": 0.23141204199700022, | |
| "grad_norm": 2.672879695892334, | |
| "kl": 1.0966796875, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.2069, | |
| "reward": 1.3364269733428955, | |
| "reward_std": 0.8959543257951736, | |
| "rewards/cosine_scaled_reward": 0.43210237100720406, | |
| "rewards/format_reward": 0.4722222313284874, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 306.81945037841797, | |
| "epoch": 0.23226912363402613, | |
| "grad_norm": 4.001352787017822, | |
| "kl": 1.1513671875, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.1775, | |
| "reward": 0.9076458215713501, | |
| "reward_std": 0.803716853260994, | |
| "rewards/cosine_scaled_reward": 0.25243401899933815, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 319.7638931274414, | |
| "epoch": 0.23312620527105207, | |
| "grad_norm": 10.995331764221191, | |
| "kl": 1.0615234375, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.3224, | |
| "reward": 0.5522180162370205, | |
| "reward_std": 0.7163975164294243, | |
| "rewards/cosine_scaled_reward": 0.10944235138595104, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 363.5277786254883, | |
| "epoch": 0.233983286908078, | |
| "grad_norm": 2.6429786682128906, | |
| "kl": 1.224609375, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.2456, | |
| "reward": 0.4188144411891699, | |
| "reward_std": 0.6072976887226105, | |
| "rewards/cosine_scaled_reward": 0.07746277935802937, | |
| "rewards/format_reward": 0.2638888955116272, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 360.7361145019531, | |
| "epoch": 0.23484036854510393, | |
| "grad_norm": 2.7255380153656006, | |
| "kl": 1.240234375, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.1997, | |
| "reward": 1.1216635033488274, | |
| "reward_std": 0.7341399192810059, | |
| "rewards/cosine_scaled_reward": 0.3524984158575535, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 351.9027862548828, | |
| "epoch": 0.23569745018212984, | |
| "grad_norm": 3.7585530281066895, | |
| "kl": 1.185546875, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.2149, | |
| "reward": 0.8286983985453844, | |
| "reward_std": 0.8109488189220428, | |
| "rewards/cosine_scaled_reward": 0.21990476548671722, | |
| "rewards/format_reward": 0.3888888955116272, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 362.55555725097656, | |
| "epoch": 0.23655453181915578, | |
| "grad_norm": 5.316573143005371, | |
| "kl": 1.419921875, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.2454, | |
| "reward": 0.7834634706377983, | |
| "reward_std": 0.8923482447862625, | |
| "rewards/cosine_scaled_reward": 0.19728727941401303, | |
| "rewards/format_reward": 0.3888888917863369, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 474.31945037841797, | |
| "epoch": 0.2374116134561817, | |
| "grad_norm": 7.202195167541504, | |
| "kl": 1.765625, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.3221, | |
| "reward": 0.7273948639631271, | |
| "reward_std": 0.7511462718248367, | |
| "rewards/cosine_scaled_reward": 0.11369742685928941, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 371.7638931274414, | |
| "epoch": 0.23826869509320764, | |
| "grad_norm": 7.068774700164795, | |
| "kl": 1.662109375, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.1665, | |
| "reward": 1.2386417984962463, | |
| "reward_std": 0.9174100756645203, | |
| "rewards/cosine_scaled_reward": 0.3901542127132416, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 349.6527862548828, | |
| "epoch": 0.23912577673023355, | |
| "grad_norm": 11.34634017944336, | |
| "kl": 1.658203125, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.1887, | |
| "reward": 0.492940915748477, | |
| "reward_std": 0.721496045589447, | |
| "rewards/cosine_scaled_reward": 0.10063712205737829, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 480.2222366333008, | |
| "epoch": 0.2399828583672595, | |
| "grad_norm": 3.518911838531494, | |
| "kl": 1.3828125, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.3564, | |
| "reward": 0.39445267990231514, | |
| "reward_std": 0.733232319355011, | |
| "rewards/cosine_scaled_reward": 0.0027818959206342697, | |
| "rewards/format_reward": 0.3888888955116272, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 372.7361068725586, | |
| "epoch": 0.2408399400042854, | |
| "grad_norm": 5.535303115844727, | |
| "kl": 1.59765625, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.1378, | |
| "reward": 0.5733058899641037, | |
| "reward_std": 0.8472346812486649, | |
| "rewards/cosine_scaled_reward": 0.140819625928998, | |
| "rewards/format_reward": 0.2916666669771075, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 415.26390075683594, | |
| "epoch": 0.24169702164131132, | |
| "grad_norm": 5.782970428466797, | |
| "kl": 1.326171875, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.1165, | |
| "reward": 0.8404653370380402, | |
| "reward_std": 0.8565979599952698, | |
| "rewards/cosine_scaled_reward": 0.19801045581698418, | |
| "rewards/format_reward": 0.4444444552063942, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 427.9861068725586, | |
| "epoch": 0.24255410327833726, | |
| "grad_norm": 6.241755962371826, | |
| "kl": 1.23046875, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.1899, | |
| "reward": 1.1734114736318588, | |
| "reward_std": 1.0097443908452988, | |
| "rewards/cosine_scaled_reward": 0.3575390987098217, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 457.9027786254883, | |
| "epoch": 0.24341118491536318, | |
| "grad_norm": 12.400771141052246, | |
| "kl": 1.1044921875, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.2043, | |
| "reward": 0.6685996502637863, | |
| "reward_std": 0.896918535232544, | |
| "rewards/cosine_scaled_reward": 0.15374425239861012, | |
| "rewards/format_reward": 0.361111119389534, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 403.19444274902344, | |
| "epoch": 0.24426826655238912, | |
| "grad_norm": 5.228241443634033, | |
| "kl": 1.513671875, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.2219, | |
| "reward": 0.6613360345363617, | |
| "reward_std": 0.8527265787124634, | |
| "rewards/cosine_scaled_reward": 0.14316802099347115, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 465.8333435058594, | |
| "epoch": 0.24512534818941503, | |
| "grad_norm": 9.96542739868164, | |
| "kl": 1.0654296875, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.1583, | |
| "reward": 0.8348981812596321, | |
| "reward_std": 0.9325527995824814, | |
| "rewards/cosine_scaled_reward": 0.21606018114835024, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 372.5972366333008, | |
| "epoch": 0.24598242982644097, | |
| "grad_norm": 12.640274047851562, | |
| "kl": 1.1533203125, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.122, | |
| "reward": 1.0775522887706757, | |
| "reward_std": 0.8368659615516663, | |
| "rewards/cosine_scaled_reward": 0.30266502872109413, | |
| "rewards/format_reward": 0.4722222313284874, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 478.50000762939453, | |
| "epoch": 0.2468395114634669, | |
| "grad_norm": 101.3699951171875, | |
| "kl": 1.58984375, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.1448, | |
| "reward": 0.5251022726297379, | |
| "reward_std": 0.7163691967725754, | |
| "rewards/cosine_scaled_reward": 0.04032891429960728, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 425.20833587646484, | |
| "epoch": 0.24769659310049283, | |
| "grad_norm": 14.3350830078125, | |
| "kl": 1.1904296875, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.1997, | |
| "reward": 1.171989917755127, | |
| "reward_std": 0.9945340603590012, | |
| "rewards/cosine_scaled_reward": 0.3637727275490761, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 374.87500762939453, | |
| "epoch": 0.24855367473751874, | |
| "grad_norm": 7.5819315910339355, | |
| "kl": 1.0908203125, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.1346, | |
| "reward": 1.154215730726719, | |
| "reward_std": 0.894601583480835, | |
| "rewards/cosine_scaled_reward": 0.3965523011283949, | |
| "rewards/format_reward": 0.3611111231148243, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 406.93055725097656, | |
| "epoch": 0.24941075637454468, | |
| "grad_norm": 14.51109504699707, | |
| "kl": 1.333984375, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.3856, | |
| "reward": 1.0358281284570694, | |
| "reward_std": 0.996618315577507, | |
| "rewards/cosine_scaled_reward": 0.25402514450252056, | |
| "rewards/format_reward": 0.527777798473835, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 410.00000762939453, | |
| "epoch": 0.2502678380115706, | |
| "grad_norm": 12.098337173461914, | |
| "kl": 1.232421875, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.1032, | |
| "reward": 0.9677926301956177, | |
| "reward_std": 0.873188391327858, | |
| "rewards/cosine_scaled_reward": 0.26861853525042534, | |
| "rewards/format_reward": 0.4305555671453476, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 440.4722213745117, | |
| "epoch": 0.2511249196485965, | |
| "grad_norm": 682.6365966796875, | |
| "kl": 2.2578125, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.3027, | |
| "reward": 0.7292786613106728, | |
| "reward_std": 0.7885087877511978, | |
| "rewards/cosine_scaled_reward": 0.23963932693004608, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 483.08333587646484, | |
| "epoch": 0.2519820012856225, | |
| "grad_norm": 5.502435684204102, | |
| "kl": 1.58203125, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.2492, | |
| "reward": 0.9282772243022919, | |
| "reward_std": 1.0097183585166931, | |
| "rewards/cosine_scaled_reward": 0.24886082112789154, | |
| "rewards/format_reward": 0.4305555634200573, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 363.9305648803711, | |
| "epoch": 0.2528390829226484, | |
| "grad_norm": 12.606801986694336, | |
| "kl": 1.716796875, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.2035, | |
| "reward": 0.6285464763641357, | |
| "reward_std": 0.8090884387493134, | |
| "rewards/cosine_scaled_reward": 0.1406621327623725, | |
| "rewards/format_reward": 0.3472222276031971, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 444.0555648803711, | |
| "epoch": 0.2536961645596743, | |
| "grad_norm": 8.756891250610352, | |
| "kl": 1.45703125, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.2407, | |
| "reward": 0.6327312793582678, | |
| "reward_std": 0.776206910610199, | |
| "rewards/cosine_scaled_reward": 0.10803230293095112, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 396.19445037841797, | |
| "epoch": 0.2545532461967002, | |
| "grad_norm": 4.2702484130859375, | |
| "kl": 1.763671875, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.2519, | |
| "reward": 0.718172661960125, | |
| "reward_std": 0.9500904381275177, | |
| "rewards/cosine_scaled_reward": 0.16464189253747463, | |
| "rewards/format_reward": 0.3888888917863369, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 400.20833587646484, | |
| "epoch": 0.25541032783372614, | |
| "grad_norm": 16.376901626586914, | |
| "kl": 1.56640625, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.1925, | |
| "reward": 0.9078701715916395, | |
| "reward_std": 0.7346006631851196, | |
| "rewards/cosine_scaled_reward": 0.26643507555127144, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 424.01390075683594, | |
| "epoch": 0.2562674094707521, | |
| "grad_norm": 41.33639907836914, | |
| "kl": 1.951171875, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.2305, | |
| "reward": 0.9393416047096252, | |
| "reward_std": 1.0011892914772034, | |
| "rewards/cosine_scaled_reward": 0.24050412327051163, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 414.84722900390625, | |
| "epoch": 0.257124491107778, | |
| "grad_norm": 6.579893589019775, | |
| "kl": 1.59375, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.2066, | |
| "reward": 0.9773926436901093, | |
| "reward_std": 0.8585023283958435, | |
| "rewards/cosine_scaled_reward": 0.3150852136313915, | |
| "rewards/format_reward": 0.3472222350537777, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 328.0416717529297, | |
| "epoch": 0.25798157274480393, | |
| "grad_norm": 7.188145160675049, | |
| "kl": 2.0, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.263, | |
| "reward": 0.7739622257649899, | |
| "reward_std": 0.8753332197666168, | |
| "rewards/cosine_scaled_reward": 0.1855922369286418, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 413.6527862548828, | |
| "epoch": 0.25883865438182985, | |
| "grad_norm": 20.148056030273438, | |
| "kl": 1.46484375, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.2093, | |
| "reward": 0.8456357046961784, | |
| "reward_std": 0.8055497854948044, | |
| "rewards/cosine_scaled_reward": 0.23531784676015377, | |
| "rewards/format_reward": 0.3750000046566129, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 435.4027786254883, | |
| "epoch": 0.2596957360188558, | |
| "grad_norm": 6.98222541809082, | |
| "kl": 1.439453125, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.1817, | |
| "reward": 0.7380593828856945, | |
| "reward_std": 0.6477810889482498, | |
| "rewards/cosine_scaled_reward": 0.20236299559473991, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 403.20833587646484, | |
| "epoch": 0.26055281765588173, | |
| "grad_norm": 10.506485939025879, | |
| "kl": 1.654296875, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.2068, | |
| "reward": 0.6883874237537384, | |
| "reward_std": 0.8970037549734116, | |
| "rewards/cosine_scaled_reward": 0.1566937081515789, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 393.63890075683594, | |
| "epoch": 0.26140989929290764, | |
| "grad_norm": 31.882427215576172, | |
| "kl": 1.412109375, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.1912, | |
| "reward": 0.833147831261158, | |
| "reward_std": 0.7371143400669098, | |
| "rewards/cosine_scaled_reward": 0.19435168150812387, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 413.1111145019531, | |
| "epoch": 0.26226698092993356, | |
| "grad_norm": 12.78693962097168, | |
| "kl": 1.515625, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0471, | |
| "reward": 0.6375277414917946, | |
| "reward_std": 0.733474999666214, | |
| "rewards/cosine_scaled_reward": 0.09654165129177272, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 447.50001525878906, | |
| "epoch": 0.2631240625669595, | |
| "grad_norm": 117.86434936523438, | |
| "kl": 2.7265625, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.2281, | |
| "reward": 0.7174494117498398, | |
| "reward_std": 0.8096088320016861, | |
| "rewards/cosine_scaled_reward": 0.13650248385965824, | |
| "rewards/format_reward": 0.4444444589316845, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 452.8333435058594, | |
| "epoch": 0.26398114420398544, | |
| "grad_norm": 10.973830223083496, | |
| "kl": 1.271484375, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.2238, | |
| "reward": 0.7689935564994812, | |
| "reward_std": 0.8074923604726791, | |
| "rewards/cosine_scaled_reward": 0.1761634573340416, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 439.8472213745117, | |
| "epoch": 0.26483822584101135, | |
| "grad_norm": 6.3766188621521, | |
| "kl": 1.54296875, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.209, | |
| "reward": 0.5929550379514694, | |
| "reward_std": 0.6037983000278473, | |
| "rewards/cosine_scaled_reward": 0.07425528764724731, | |
| "rewards/format_reward": 0.4444444440305233, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 418.65277099609375, | |
| "epoch": 0.26569530747803727, | |
| "grad_norm": 21.236345291137695, | |
| "kl": 1.384765625, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.1979, | |
| "reward": 0.9790968149900436, | |
| "reward_std": 0.7783628851175308, | |
| "rewards/cosine_scaled_reward": 0.2881595455110073, | |
| "rewards/format_reward": 0.4027777872979641, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 450.2222213745117, | |
| "epoch": 0.26655238911506324, | |
| "grad_norm": 14.473074913024902, | |
| "kl": 1.33203125, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.2879, | |
| "reward": 1.0430985651910305, | |
| "reward_std": 0.8672375828027725, | |
| "rewards/cosine_scaled_reward": 0.31321592442691326, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 471.1111145019531, | |
| "epoch": 0.26740947075208915, | |
| "grad_norm": 9.259844779968262, | |
| "kl": 1.51171875, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.2339, | |
| "reward": 0.9496155381202698, | |
| "reward_std": 0.910922110080719, | |
| "rewards/cosine_scaled_reward": 0.2803633138537407, | |
| "rewards/format_reward": 0.3888888917863369, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 445.06945037841797, | |
| "epoch": 0.26826655238911506, | |
| "grad_norm": 4.26533317565918, | |
| "kl": 1.58203125, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.2456, | |
| "reward": 1.2665941417217255, | |
| "reward_std": 0.9569890201091766, | |
| "rewards/cosine_scaled_reward": 0.43190818652510643, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 392.7777862548828, | |
| "epoch": 0.269123634026141, | |
| "grad_norm": 7.616832256317139, | |
| "kl": 1.375, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.1271, | |
| "reward": 1.1350777596235275, | |
| "reward_std": 0.9450895041227341, | |
| "rewards/cosine_scaled_reward": 0.35920554026961327, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 440.08333587646484, | |
| "epoch": 0.2699807156631669, | |
| "grad_norm": 12.737751007080078, | |
| "kl": 1.46484375, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.2746, | |
| "reward": 0.6033434271812439, | |
| "reward_std": 0.7901871353387833, | |
| "rewards/cosine_scaled_reward": 0.1072272639721632, | |
| "rewards/format_reward": 0.3888889029622078, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 423.3611145019531, | |
| "epoch": 0.27083779730019286, | |
| "grad_norm": 34.6724853515625, | |
| "kl": 1.4873046875, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.3012, | |
| "reward": 0.8819021135568619, | |
| "reward_std": 0.6502636596560478, | |
| "rewards/cosine_scaled_reward": 0.30206217616796494, | |
| "rewards/format_reward": 0.2777777872979641, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 409.56944274902344, | |
| "epoch": 0.2716948789372188, | |
| "grad_norm": 8.627520561218262, | |
| "kl": 1.564453125, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.1536, | |
| "reward": 0.9121913909912109, | |
| "reward_std": 0.8483704626560211, | |
| "rewards/cosine_scaled_reward": 0.2894290406256914, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 413.6111145019531, | |
| "epoch": 0.2725519605742447, | |
| "grad_norm": 2.8996214866638184, | |
| "kl": 1.509765625, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.1049, | |
| "reward": 1.1796189993619919, | |
| "reward_std": 0.8427684605121613, | |
| "rewards/cosine_scaled_reward": 0.31897614523768425, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 416.55555725097656, | |
| "epoch": 0.2734090422112706, | |
| "grad_norm": 10.151158332824707, | |
| "kl": 1.458984375, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.2491, | |
| "reward": 0.6920264512300491, | |
| "reward_std": 0.6206417083740234, | |
| "rewards/cosine_scaled_reward": 0.13767989072948694, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 384.05555725097656, | |
| "epoch": 0.2742661238482966, | |
| "grad_norm": 5.853416442871094, | |
| "kl": 1.333984375, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.1025, | |
| "reward": 1.3231835961341858, | |
| "reward_std": 0.8811145946383476, | |
| "rewards/cosine_scaled_reward": 0.4463140070438385, | |
| "rewards/format_reward": 0.4305555559694767, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 438.59722900390625, | |
| "epoch": 0.2751232054853225, | |
| "grad_norm": 24.13959503173828, | |
| "kl": 1.798828125, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.1857, | |
| "reward": 0.7677492424845695, | |
| "reward_std": 0.8653182983398438, | |
| "rewards/cosine_scaled_reward": 0.16859683208167553, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 419.22222900390625, | |
| "epoch": 0.2759802871223484, | |
| "grad_norm": 25.10494041442871, | |
| "kl": 1.732421875, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.2985, | |
| "reward": 0.5208378061652184, | |
| "reward_std": 0.7815151214599609, | |
| "rewards/cosine_scaled_reward": 0.08680777484551072, | |
| "rewards/format_reward": 0.3472222350537777, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 458.25, | |
| "epoch": 0.2768373687593743, | |
| "grad_norm": 6.803804874420166, | |
| "kl": 1.67578125, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.1707, | |
| "reward": 0.8683362007141113, | |
| "reward_std": 0.8737305179238319, | |
| "rewards/cosine_scaled_reward": 0.13555700704455376, | |
| "rewards/format_reward": 0.5972222238779068, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 390.7361145019531, | |
| "epoch": 0.2776944503964003, | |
| "grad_norm": 6.555212020874023, | |
| "kl": 1.5546875, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0673, | |
| "reward": 0.8696636259555817, | |
| "reward_std": 0.8899157643318176, | |
| "rewards/cosine_scaled_reward": 0.19872068613767624, | |
| "rewards/format_reward": 0.4722222238779068, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 506.33333587646484, | |
| "epoch": 0.2785515320334262, | |
| "grad_norm": 10.4780912399292, | |
| "kl": 1.701171875, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.1414, | |
| "reward": 0.8947531227022409, | |
| "reward_std": 1.0458511114120483, | |
| "rewards/cosine_scaled_reward": 0.24598768074065447, | |
| "rewards/format_reward": 0.4027777872979641, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 389.875, | |
| "epoch": 0.2794086136704521, | |
| "grad_norm": 8.659296989440918, | |
| "kl": 1.732421875, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.2063, | |
| "reward": 0.7398964213207364, | |
| "reward_std": 0.8089132308959961, | |
| "rewards/cosine_scaled_reward": 0.13383711129426956, | |
| "rewards/format_reward": 0.4722222238779068, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 448.20833587646484, | |
| "epoch": 0.280265695307478, | |
| "grad_norm": 19.596874237060547, | |
| "kl": 1.962890625, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.1572, | |
| "reward": 0.8671465888619423, | |
| "reward_std": 0.915204182267189, | |
| "rewards/cosine_scaled_reward": 0.21135106589645147, | |
| "rewards/format_reward": 0.4444444552063942, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 391.08333587646484, | |
| "epoch": 0.28112277694450394, | |
| "grad_norm": 32.9877815246582, | |
| "kl": 1.431640625, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0696, | |
| "reward": 1.0604142509400845, | |
| "reward_std": 0.8675804287195206, | |
| "rewards/cosine_scaled_reward": 0.30104043427854776, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 465.2083435058594, | |
| "epoch": 0.2819798585815299, | |
| "grad_norm": 13.628067016601562, | |
| "kl": 1.689453125, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.1891, | |
| "reward": 0.7241683751344681, | |
| "reward_std": 0.7928906679153442, | |
| "rewards/cosine_scaled_reward": 0.13291750941425562, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 462.87500762939453, | |
| "epoch": 0.2828369402185558, | |
| "grad_norm": 6.838570594787598, | |
| "kl": 1.494140625, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.1692, | |
| "reward": 0.9220460206270218, | |
| "reward_std": 0.6238923817873001, | |
| "rewards/cosine_scaled_reward": 0.25268966890871525, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 473.2777862548828, | |
| "epoch": 0.28369402185558174, | |
| "grad_norm": 8.527922630310059, | |
| "kl": 1.451171875, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.2612, | |
| "reward": 0.7746013253927231, | |
| "reward_std": 0.6590248346328735, | |
| "rewards/cosine_scaled_reward": 0.21368957962840796, | |
| "rewards/format_reward": 0.3472222276031971, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 401.19444274902344, | |
| "epoch": 0.28455110349260765, | |
| "grad_norm": 18.457897186279297, | |
| "kl": 1.564453125, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.2324, | |
| "reward": 1.1048437356948853, | |
| "reward_std": 0.8566301316022873, | |
| "rewards/cosine_scaled_reward": 0.25381074473261833, | |
| "rewards/format_reward": 0.5972222238779068, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 496.69446563720703, | |
| "epoch": 0.2854081851296336, | |
| "grad_norm": 10.751416206359863, | |
| "kl": 1.69921875, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.1604, | |
| "reward": 0.9123432487249374, | |
| "reward_std": 0.9888466447591782, | |
| "rewards/cosine_scaled_reward": 0.22700495785102248, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 370.75000762939453, | |
| "epoch": 0.28626526676665953, | |
| "grad_norm": 4.947657108306885, | |
| "kl": 1.51171875, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.1946, | |
| "reward": 0.3965581804513931, | |
| "reward_std": 0.7018196731805801, | |
| "rewards/cosine_scaled_reward": 0.010779092612210661, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 417.25, | |
| "epoch": 0.28712234840368545, | |
| "grad_norm": 11.954354286193848, | |
| "kl": 1.294921875, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0114, | |
| "reward": 1.05374076962471, | |
| "reward_std": 0.8027269691228867, | |
| "rewards/cosine_scaled_reward": 0.24909262219443917, | |
| "rewards/format_reward": 0.5555555671453476, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 504.69445037841797, | |
| "epoch": 0.28797943004071136, | |
| "grad_norm": 7.7731170654296875, | |
| "kl": 1.669921875, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.2419, | |
| "reward": 0.8747316524386406, | |
| "reward_std": 0.8644589632749557, | |
| "rewards/cosine_scaled_reward": 0.1943102532532066, | |
| "rewards/format_reward": 0.486111119389534, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 363.2777862548828, | |
| "epoch": 0.28883651167773733, | |
| "grad_norm": 21.512271881103516, | |
| "kl": 1.5693359375, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.2896, | |
| "reward": 1.566197782754898, | |
| "reward_std": 0.9783513993024826, | |
| "rewards/cosine_scaled_reward": 0.49837667867541313, | |
| "rewards/format_reward": 0.5694444552063942, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 384.47222900390625, | |
| "epoch": 0.28969359331476324, | |
| "grad_norm": 10.973138809204102, | |
| "kl": 1.48828125, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.1953, | |
| "reward": 0.8962609972804785, | |
| "reward_std": 0.9198465496301651, | |
| "rewards/cosine_scaled_reward": 0.19118605181574821, | |
| "rewards/format_reward": 0.5138888880610466, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 423.45833587646484, | |
| "epoch": 0.29055067495178916, | |
| "grad_norm": 22.09733772277832, | |
| "kl": 1.876953125, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.347, | |
| "reward": 0.811115313321352, | |
| "reward_std": 0.8457043170928955, | |
| "rewards/cosine_scaled_reward": 0.1625020916108042, | |
| "rewards/format_reward": 0.4861111231148243, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 423.72222900390625, | |
| "epoch": 0.29140775658881507, | |
| "grad_norm": 22.564533233642578, | |
| "kl": 1.546875, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.2976, | |
| "reward": 0.8002185821533203, | |
| "reward_std": 0.8067903742194176, | |
| "rewards/cosine_scaled_reward": 0.15705375373363495, | |
| "rewards/format_reward": 0.4861111119389534, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 365.91666412353516, | |
| "epoch": 0.292264838225841, | |
| "grad_norm": 8.0919828414917, | |
| "kl": 1.380859375, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.2234, | |
| "reward": 1.0582543164491653, | |
| "reward_std": 0.700420930981636, | |
| "rewards/cosine_scaled_reward": 0.2721826871857047, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 396.5555648803711, | |
| "epoch": 0.29312191986286695, | |
| "grad_norm": 13.28870964050293, | |
| "kl": 1.693359375, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.2345, | |
| "reward": 0.9433440640568733, | |
| "reward_std": 0.8704208433628082, | |
| "rewards/cosine_scaled_reward": 0.2772275973111391, | |
| "rewards/format_reward": 0.3888888955116272, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 425.625, | |
| "epoch": 0.29397900149989287, | |
| "grad_norm": 102.71472930908203, | |
| "kl": 2.208984375, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.2262, | |
| "reward": 0.8235329911112785, | |
| "reward_std": 0.5788537338376045, | |
| "rewards/cosine_scaled_reward": 0.09926649276167154, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 365.58333587646484, | |
| "epoch": 0.2948360831369188, | |
| "grad_norm": 15.007022857666016, | |
| "kl": 1.810546875, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.108, | |
| "reward": 0.8345845490694046, | |
| "reward_std": 0.8136800527572632, | |
| "rewards/cosine_scaled_reward": 0.1950700655579567, | |
| "rewards/format_reward": 0.4444444552063942, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 390.6527862548828, | |
| "epoch": 0.2956931647739447, | |
| "grad_norm": 17.396751403808594, | |
| "kl": 1.453125, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.1139, | |
| "reward": 1.2658544778823853, | |
| "reward_std": 0.8507587239146233, | |
| "rewards/cosine_scaled_reward": 0.36903833597898483, | |
| "rewards/format_reward": 0.5277777910232544, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 380.2777786254883, | |
| "epoch": 0.29655024641097066, | |
| "grad_norm": 6.361865043640137, | |
| "kl": 1.896484375, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.2842, | |
| "reward": 0.7403211258351803, | |
| "reward_std": 0.8092114925384521, | |
| "rewards/cosine_scaled_reward": 0.2034938931465149, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 401.58333587646484, | |
| "epoch": 0.2974073280479966, | |
| "grad_norm": 4.304242134094238, | |
| "kl": 1.55078125, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.1731, | |
| "reward": 1.124197095632553, | |
| "reward_std": 0.6386073157191277, | |
| "rewards/cosine_scaled_reward": 0.31209855526685715, | |
| "rewards/format_reward": 0.5000000223517418, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 389.22222900390625, | |
| "epoch": 0.2982644096850225, | |
| "grad_norm": 5.652775764465332, | |
| "kl": 2.107421875, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.3405, | |
| "reward": 0.4380467850714922, | |
| "reward_std": 0.7566511631011963, | |
| "rewards/cosine_scaled_reward": -0.003198828548192978, | |
| "rewards/format_reward": 0.4444444477558136, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 382.8611145019531, | |
| "epoch": 0.2991214913220484, | |
| "grad_norm": 139.029052734375, | |
| "kl": 2.10546875, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.2329, | |
| "reward": 0.886528730392456, | |
| "reward_std": 0.9427484571933746, | |
| "rewards/cosine_scaled_reward": 0.22798660211265087, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 378.08333587646484, | |
| "epoch": 0.2999785729590744, | |
| "grad_norm": 273.4222412109375, | |
| "kl": 2.13671875, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.223, | |
| "reward": 1.0541678816080093, | |
| "reward_std": 0.9911145269870758, | |
| "rewards/cosine_scaled_reward": 0.3118061521090567, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 435.625, | |
| "epoch": 0.3008356545961003, | |
| "grad_norm": 5.59974479675293, | |
| "kl": 1.93359375, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.2827, | |
| "reward": 0.46341075748205185, | |
| "reward_std": 0.7887972742319107, | |
| "rewards/cosine_scaled_reward": 0.030316500924527645, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 499.12500762939453, | |
| "epoch": 0.3016927362331262, | |
| "grad_norm": 8.170894622802734, | |
| "kl": 1.8984375, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.246, | |
| "reward": 0.7659785971045494, | |
| "reward_std": 0.865535780787468, | |
| "rewards/cosine_scaled_reward": 0.1607670597732067, | |
| "rewards/format_reward": 0.4444444626569748, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 412.75001525878906, | |
| "epoch": 0.3025498178701521, | |
| "grad_norm": 4.896013259887695, | |
| "kl": 1.9609375, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.2717, | |
| "reward": 0.4919071840122342, | |
| "reward_std": 0.5877289474010468, | |
| "rewards/cosine_scaled_reward": 0.10012026876211166, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 444.3055648803711, | |
| "epoch": 0.3034068995071781, | |
| "grad_norm": 843.45703125, | |
| "kl": 7.69921875, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.3878, | |
| "reward": 0.47469986602663994, | |
| "reward_std": 0.6482968628406525, | |
| "rewards/cosine_scaled_reward": 0.06373882107436657, | |
| "rewards/format_reward": 0.3472222313284874, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 508.8055648803711, | |
| "epoch": 0.304263981144204, | |
| "grad_norm": 610.3280639648438, | |
| "kl": 2.517578125, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.3122, | |
| "reward": 0.9253015741705894, | |
| "reward_std": 0.9372780025005341, | |
| "rewards/cosine_scaled_reward": 0.19876189157366753, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 451.4861145019531, | |
| "epoch": 0.3051210627812299, | |
| "grad_norm": 6.685305595397949, | |
| "kl": 1.939453125, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.2488, | |
| "reward": 1.0591753125190735, | |
| "reward_std": 0.9159562736749649, | |
| "rewards/cosine_scaled_reward": 0.27958764508366585, | |
| "rewards/format_reward": 0.5, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 386.027774810791, | |
| "epoch": 0.3059781444182558, | |
| "grad_norm": 75.1882095336914, | |
| "kl": 3.021484375, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.158, | |
| "reward": 0.9487558901309967, | |
| "reward_std": 0.8189297467470169, | |
| "rewards/cosine_scaled_reward": 0.2591001633554697, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 420.33333587646484, | |
| "epoch": 0.30683522605528174, | |
| "grad_norm": 588.209228515625, | |
| "kl": 2.5234375, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.196, | |
| "reward": 0.9485956132411957, | |
| "reward_std": 0.8210533708333969, | |
| "rewards/cosine_scaled_reward": 0.21735336817801, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 446.76390075683594, | |
| "epoch": 0.3076923076923077, | |
| "grad_norm": 128.001220703125, | |
| "kl": 2.671875, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.1816, | |
| "reward": 1.0116847660392523, | |
| "reward_std": 0.8659616261720657, | |
| "rewards/cosine_scaled_reward": 0.24889790453016758, | |
| "rewards/format_reward": 0.5138888992369175, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 423.2777862548828, | |
| "epoch": 0.3085493893293336, | |
| "grad_norm": 13.23314380645752, | |
| "kl": 2.044921875, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.1871, | |
| "reward": 1.3461374938488007, | |
| "reward_std": 0.9391425997018814, | |
| "rewards/cosine_scaled_reward": 0.4022354434709996, | |
| "rewards/format_reward": 0.5416666641831398, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 433.63890075683594, | |
| "epoch": 0.30940647096635954, | |
| "grad_norm": 232.4365692138672, | |
| "kl": 2.478515625, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.2607, | |
| "reward": 1.047914907336235, | |
| "reward_std": 0.9106916189193726, | |
| "rewards/cosine_scaled_reward": 0.23229077784344554, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 414.6527862548828, | |
| "epoch": 0.31026355260338545, | |
| "grad_norm": 125.23755645751953, | |
| "kl": 2.080078125, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.149, | |
| "reward": 0.834898516535759, | |
| "reward_std": 0.7312210351228714, | |
| "rewards/cosine_scaled_reward": 0.1605048067867756, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 435.12499237060547, | |
| "epoch": 0.3111206342404114, | |
| "grad_norm": 6.104694366455078, | |
| "kl": 1.779296875, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.2289, | |
| "reward": 1.3437991440296173, | |
| "reward_std": 0.8975027948617935, | |
| "rewards/cosine_scaled_reward": 0.38023288547992706, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 409.8472213745117, | |
| "epoch": 0.31197771587743733, | |
| "grad_norm": 24.235681533813477, | |
| "kl": 2.080078125, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.1818, | |
| "reward": 0.9619172001257539, | |
| "reward_std": 0.8415810465812683, | |
| "rewards/cosine_scaled_reward": 0.27956968918442726, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 398.5138854980469, | |
| "epoch": 0.31283479751446325, | |
| "grad_norm": 11.025618553161621, | |
| "kl": 2.248046875, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.1973, | |
| "reward": 0.7603890486061573, | |
| "reward_std": 0.8526364490389824, | |
| "rewards/cosine_scaled_reward": 0.1857500895857811, | |
| "rewards/format_reward": 0.3888888955116272, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 498.2777862548828, | |
| "epoch": 0.31369187915148916, | |
| "grad_norm": 65.14573669433594, | |
| "kl": 2.064453125, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.1461, | |
| "reward": 1.086868055164814, | |
| "reward_std": 0.9657749831676483, | |
| "rewards/cosine_scaled_reward": 0.32815628172829747, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 387.7777786254883, | |
| "epoch": 0.31454896078851513, | |
| "grad_norm": 50.92959976196289, | |
| "kl": 2.1015625, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.1503, | |
| "reward": 0.6913352087140083, | |
| "reward_std": 0.7219003140926361, | |
| "rewards/cosine_scaled_reward": 0.12344538388424553, | |
| "rewards/format_reward": 0.4444444440305233, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 415.75000762939453, | |
| "epoch": 0.31540604242554104, | |
| "grad_norm": 23.864198684692383, | |
| "kl": 1.697265625, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.2941, | |
| "reward": 0.803162232041359, | |
| "reward_std": 0.8274150788784027, | |
| "rewards/cosine_scaled_reward": 0.1376922446070239, | |
| "rewards/format_reward": 0.5277777910232544, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 404.37500762939453, | |
| "epoch": 0.31626312406256696, | |
| "grad_norm": 80.24060821533203, | |
| "kl": 1.919921875, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.1397, | |
| "reward": 0.9380350708961487, | |
| "reward_std": 0.868830531835556, | |
| "rewards/cosine_scaled_reward": 0.23290642350912094, | |
| "rewards/format_reward": 0.4722222238779068, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 493.6666717529297, | |
| "epoch": 0.31712020569959287, | |
| "grad_norm": 16.302597045898438, | |
| "kl": 1.53515625, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0377, | |
| "reward": 1.3772657215595245, | |
| "reward_std": 0.8640294969081879, | |
| "rewards/cosine_scaled_reward": 0.40391062945127487, | |
| "rewards/format_reward": 0.5694444552063942, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 444.6666717529297, | |
| "epoch": 0.3179772873366188, | |
| "grad_norm": 74.72409057617188, | |
| "kl": 2.32421875, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.2057, | |
| "reward": 0.564115053974092, | |
| "reward_std": 0.7745189592242241, | |
| "rewards/cosine_scaled_reward": 0.08066862914711237, | |
| "rewards/format_reward": 0.4027777798473835, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 466.0277862548828, | |
| "epoch": 0.31883436897364476, | |
| "grad_norm": 23.615995407104492, | |
| "kl": 2.126953125, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.1857, | |
| "reward": 0.9031898975372314, | |
| "reward_std": 0.8619142323732376, | |
| "rewards/cosine_scaled_reward": 0.236317184753716, | |
| "rewards/format_reward": 0.4305555671453476, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 438.3888854980469, | |
| "epoch": 0.31969145061067067, | |
| "grad_norm": 10.200531005859375, | |
| "kl": 1.654296875, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.1653, | |
| "reward": 0.8666345775127411, | |
| "reward_std": 0.8320091515779495, | |
| "rewards/cosine_scaled_reward": 0.19720618752762675, | |
| "rewards/format_reward": 0.4722222238779068, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 409.47222900390625, | |
| "epoch": 0.3205485322476966, | |
| "grad_norm": 40.82743453979492, | |
| "kl": 2.0703125, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.1313, | |
| "reward": 1.2135090231895447, | |
| "reward_std": 1.0808076113462448, | |
| "rewards/cosine_scaled_reward": 0.36369897052645683, | |
| "rewards/format_reward": 0.486111119389534, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.1389083862305, | |
| "epoch": 0.3214056138847225, | |
| "grad_norm": 20.503686904907227, | |
| "kl": 2.048828125, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.1784, | |
| "reward": 1.1541820913553238, | |
| "reward_std": 1.0066500753164291, | |
| "rewards/cosine_scaled_reward": 0.3826466426253319, | |
| "rewards/format_reward": 0.3888888992369175, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 446.73612213134766, | |
| "epoch": 0.32226269552174847, | |
| "grad_norm": 36.28611373901367, | |
| "kl": 1.435546875, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.141, | |
| "reward": 1.155954971909523, | |
| "reward_std": 0.9807834774255753, | |
| "rewards/cosine_scaled_reward": 0.32103302888572216, | |
| "rewards/format_reward": 0.5138889029622078, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 369.1388931274414, | |
| "epoch": 0.3231197771587744, | |
| "grad_norm": 21.888423919677734, | |
| "kl": 2.341796875, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.158, | |
| "reward": 0.8885826840996742, | |
| "reward_std": 0.8598978072404861, | |
| "rewards/cosine_scaled_reward": 0.19429135276004672, | |
| "rewards/format_reward": 0.5, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 517.125, | |
| "epoch": 0.3239768587958003, | |
| "grad_norm": 19.566164016723633, | |
| "kl": 1.60546875, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.1273, | |
| "reward": 1.0485451221466064, | |
| "reward_std": 1.0302338749170303, | |
| "rewards/cosine_scaled_reward": 0.27427253872156143, | |
| "rewards/format_reward": 0.5, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 408.81945037841797, | |
| "epoch": 0.3248339404328262, | |
| "grad_norm": 51.357086181640625, | |
| "kl": 1.625, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.0661, | |
| "reward": 1.2394737899303436, | |
| "reward_std": 0.8347266316413879, | |
| "rewards/cosine_scaled_reward": 0.3280702382326126, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 449.50000762939453, | |
| "epoch": 0.3256910220698522, | |
| "grad_norm": 13.72800350189209, | |
| "kl": 1.7578125, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.2142, | |
| "reward": 1.0208731442689896, | |
| "reward_std": 0.7508396059274673, | |
| "rewards/cosine_scaled_reward": 0.2812698809430003, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 406.4861145019531, | |
| "epoch": 0.3265481037068781, | |
| "grad_norm": 5.486995220184326, | |
| "kl": 1.76953125, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.2384, | |
| "reward": 0.863591693341732, | |
| "reward_std": 0.80591781437397, | |
| "rewards/cosine_scaled_reward": 0.13318472169339657, | |
| "rewards/format_reward": 0.5972222313284874, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 413.5972213745117, | |
| "epoch": 0.327405185343904, | |
| "grad_norm": 12.793088912963867, | |
| "kl": 1.849609375, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0601, | |
| "reward": 1.3274620473384857, | |
| "reward_std": 0.9712315052747726, | |
| "rewards/cosine_scaled_reward": 0.40678660944104195, | |
| "rewards/format_reward": 0.5138888880610466, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 462.05555725097656, | |
| "epoch": 0.3282622669809299, | |
| "grad_norm": 25.417646408081055, | |
| "kl": 1.63671875, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": -0.0034, | |
| "reward": 0.7435576766729355, | |
| "reward_std": 0.7562405988574028, | |
| "rewards/cosine_scaled_reward": 0.17733439663425088, | |
| "rewards/format_reward": 0.3888889029622078, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 393.62500762939453, | |
| "epoch": 0.3291193486179559, | |
| "grad_norm": 11.36495590209961, | |
| "kl": 2.20703125, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.1751, | |
| "reward": 0.8897156268358231, | |
| "reward_std": 0.8956611603498459, | |
| "rewards/cosine_scaled_reward": 0.20180223789066076, | |
| "rewards/format_reward": 0.4861111119389534, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 415.2777862548828, | |
| "epoch": 0.3299764302549818, | |
| "grad_norm": 28.817380905151367, | |
| "kl": 1.892578125, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.1319, | |
| "reward": 1.1349451541900635, | |
| "reward_std": 1.0257329195737839, | |
| "rewards/cosine_scaled_reward": 0.3244170341640711, | |
| "rewards/format_reward": 0.486111119389534, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 427.20833587646484, | |
| "epoch": 0.3308335118920077, | |
| "grad_norm": 62.827362060546875, | |
| "kl": 1.46875, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0615, | |
| "reward": 1.3169071674346924, | |
| "reward_std": 0.8778738602995872, | |
| "rewards/cosine_scaled_reward": 0.3667869158089161, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 385.94444274902344, | |
| "epoch": 0.33169059352903363, | |
| "grad_norm": 29.661022186279297, | |
| "kl": 1.708984375, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0891, | |
| "reward": 0.7852656096220016, | |
| "reward_std": 0.7992514222860336, | |
| "rewards/cosine_scaled_reward": 0.13568835996557027, | |
| "rewards/format_reward": 0.5138888880610466, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 453.55555725097656, | |
| "epoch": 0.33254767516605954, | |
| "grad_norm": 5.4821882247924805, | |
| "kl": 1.947265625, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.175, | |
| "reward": 0.6292361579835415, | |
| "reward_std": 0.8566596806049347, | |
| "rewards/cosine_scaled_reward": 0.0993402823805809, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 467.68055725097656, | |
| "epoch": 0.3334047568030855, | |
| "grad_norm": 14.547080993652344, | |
| "kl": 2.296875, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.2111, | |
| "reward": 0.729412317276001, | |
| "reward_std": 0.8567145764827728, | |
| "rewards/cosine_scaled_reward": 0.1355394944548607, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 424.0277786254883, | |
| "epoch": 0.3342618384401114, | |
| "grad_norm": 34.17932891845703, | |
| "kl": 1.759765625, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.249, | |
| "reward": 0.6682014372199774, | |
| "reward_std": 0.7891437709331512, | |
| "rewards/cosine_scaled_reward": 0.13271182030439377, | |
| "rewards/format_reward": 0.4027777872979641, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 466.0416793823242, | |
| "epoch": 0.33511892007713734, | |
| "grad_norm": 79.28044891357422, | |
| "kl": 2.046875, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0428, | |
| "reward": 0.4808058775961399, | |
| "reward_std": 0.7752177119255066, | |
| "rewards/cosine_scaled_reward": 0.04595848359167576, | |
| "rewards/format_reward": 0.3888888955116272, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 482.4166793823242, | |
| "epoch": 0.33597600171416325, | |
| "grad_norm": 7.091101169586182, | |
| "kl": 1.888671875, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.142, | |
| "reward": 0.8153834193944931, | |
| "reward_std": 0.9166710078716278, | |
| "rewards/cosine_scaled_reward": 0.22019170981366187, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 444.97222900390625, | |
| "epoch": 0.3368330833511892, | |
| "grad_norm": 5.4899396896362305, | |
| "kl": 1.81640625, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.1499, | |
| "reward": 0.9435096383094788, | |
| "reward_std": 0.6671365574002266, | |
| "rewards/cosine_scaled_reward": 0.1870325729250908, | |
| "rewards/format_reward": 0.5694444477558136, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 462.69445037841797, | |
| "epoch": 0.33769016498821514, | |
| "grad_norm": 6.30225944519043, | |
| "kl": 1.921875, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.2415, | |
| "reward": 1.1110005229711533, | |
| "reward_std": 0.8034192770719528, | |
| "rewards/cosine_scaled_reward": 0.2777224676683545, | |
| "rewards/format_reward": 0.555555559694767, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 449.52777099609375, | |
| "epoch": 0.33854724662524105, | |
| "grad_norm": 12.88353443145752, | |
| "kl": 1.69921875, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.1616, | |
| "reward": 1.2834501564502716, | |
| "reward_std": 1.0129185914993286, | |
| "rewards/cosine_scaled_reward": 0.3639473095536232, | |
| "rewards/format_reward": 0.5555555522441864, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 424.6527786254883, | |
| "epoch": 0.33940432826226696, | |
| "grad_norm": 8.04071044921875, | |
| "kl": 1.734375, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0449, | |
| "reward": 1.0254860520362854, | |
| "reward_std": 0.8995675295591354, | |
| "rewards/cosine_scaled_reward": 0.25579858385026455, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 419.04166412353516, | |
| "epoch": 0.34026140989929293, | |
| "grad_norm": 13.297782897949219, | |
| "kl": 1.74609375, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.1566, | |
| "reward": 1.252098884433508, | |
| "reward_std": 0.8234367519617081, | |
| "rewards/cosine_scaled_reward": 0.3482716903090477, | |
| "rewards/format_reward": 0.5555555671453476, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 425.8472213745117, | |
| "epoch": 0.34111849153631885, | |
| "grad_norm": 57.158538818359375, | |
| "kl": 1.884765625, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0898, | |
| "reward": 1.0553182810544968, | |
| "reward_std": 0.8401579111814499, | |
| "rewards/cosine_scaled_reward": 0.22210356313735247, | |
| "rewards/format_reward": 0.611111119389534, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 431.81944274902344, | |
| "epoch": 0.34197557317334476, | |
| "grad_norm": 10.068607330322266, | |
| "kl": 2.22265625, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.1113, | |
| "reward": 0.957691490650177, | |
| "reward_std": 0.9538578987121582, | |
| "rewards/cosine_scaled_reward": 0.2219013087451458, | |
| "rewards/format_reward": 0.5138889029622078, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 472.25001525878906, | |
| "epoch": 0.3428326548103707, | |
| "grad_norm": 4.390395641326904, | |
| "kl": 1.947265625, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.2373, | |
| "reward": 0.8023122465237975, | |
| "reward_std": 0.8230636864900589, | |
| "rewards/cosine_scaled_reward": 0.19976721669081599, | |
| "rewards/format_reward": 0.4027777872979641, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 455.05555725097656, | |
| "epoch": 0.3436897364473966, | |
| "grad_norm": 28.075910568237305, | |
| "kl": 1.68359375, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.201, | |
| "reward": 1.1237227618694305, | |
| "reward_std": 0.9717634171247482, | |
| "rewards/cosine_scaled_reward": 0.28408361971378326, | |
| "rewards/format_reward": 0.5555555522441864, | |
| "step": 401 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 474.09722900390625, | |
| "epoch": 0.34454681808442256, | |
| "grad_norm": 34.623069763183594, | |
| "kl": 1.5625, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.2037, | |
| "reward": 0.760801451979205, | |
| "reward_std": 0.9681618064641953, | |
| "rewards/cosine_scaled_reward": 0.1859562654281035, | |
| "rewards/format_reward": 0.3888888955116272, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 540.7361145019531, | |
| "epoch": 0.34540389972144847, | |
| "grad_norm": 7.207082271575928, | |
| "kl": 1.66015625, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.1982, | |
| "reward": 0.8205151949077845, | |
| "reward_std": 0.904526948928833, | |
| "rewards/cosine_scaled_reward": 0.2088687140494585, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 403 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 435.2361145019531, | |
| "epoch": 0.3462609813584744, | |
| "grad_norm": 16.900236129760742, | |
| "kl": 1.931640625, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.1385, | |
| "reward": 0.6418208181858063, | |
| "reward_std": 0.9299771934747696, | |
| "rewards/cosine_scaled_reward": 0.07091040024533868, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 429.41666412353516, | |
| "epoch": 0.3471180629955003, | |
| "grad_norm": 13.073430061340332, | |
| "kl": 2.048828125, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.2188, | |
| "reward": 0.8325824737548828, | |
| "reward_std": 0.7748439311981201, | |
| "rewards/cosine_scaled_reward": 0.1593467751517892, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 524.9027862548828, | |
| "epoch": 0.34797514463252627, | |
| "grad_norm": 19.661832809448242, | |
| "kl": 1.90625, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.2202, | |
| "reward": 0.5801484230905771, | |
| "reward_std": 0.7412025928497314, | |
| "rewards/cosine_scaled_reward": 0.053963107988238335, | |
| "rewards/format_reward": 0.4722222238779068, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 389.1111145019531, | |
| "epoch": 0.3488322262695522, | |
| "grad_norm": 14.274138450622559, | |
| "kl": 1.583984375, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.124, | |
| "reward": 0.9547043144702911, | |
| "reward_std": 0.8225141167640686, | |
| "rewards/cosine_scaled_reward": 0.18568546572350897, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 407 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 445.6805648803711, | |
| "epoch": 0.3496893079065781, | |
| "grad_norm": 15.514725685119629, | |
| "kl": 2.017578125, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0997, | |
| "reward": 1.0453148484230042, | |
| "reward_std": 1.1434958428144455, | |
| "rewards/cosine_scaled_reward": 0.32126854080706835, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 479.95833587646484, | |
| "epoch": 0.350546389543604, | |
| "grad_norm": 10.456230163574219, | |
| "kl": 1.640625, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.1688, | |
| "reward": 0.7981878519058228, | |
| "reward_std": 0.8995783925056458, | |
| "rewards/cosine_scaled_reward": 0.14214947074651718, | |
| "rewards/format_reward": 0.5138888880610466, | |
| "step": 409 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 439.2361145019531, | |
| "epoch": 0.35140347118063, | |
| "grad_norm": 9.658367156982422, | |
| "kl": 1.91796875, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.1396, | |
| "reward": 0.7300833091139793, | |
| "reward_std": 0.7368223965167999, | |
| "rewards/cosine_scaled_reward": 0.12198610045015812, | |
| "rewards/format_reward": 0.486111119389534, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 420.94445037841797, | |
| "epoch": 0.3522605528176559, | |
| "grad_norm": 27.74929428100586, | |
| "kl": 1.73046875, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.2249, | |
| "reward": 0.6258293315768242, | |
| "reward_std": 0.8346365168690681, | |
| "rewards/cosine_scaled_reward": 0.11152577586472034, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 411 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 447.70833587646484, | |
| "epoch": 0.3531176344546818, | |
| "grad_norm": 8.646500587463379, | |
| "kl": 1.943359375, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.1584, | |
| "reward": 0.9124602228403091, | |
| "reward_std": 0.9870292246341705, | |
| "rewards/cosine_scaled_reward": 0.22011900879442692, | |
| "rewards/format_reward": 0.4722222313284874, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 416.6527862548828, | |
| "epoch": 0.3539747160917077, | |
| "grad_norm": 21.185155868530273, | |
| "kl": 1.671875, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.1789, | |
| "reward": 1.2152755111455917, | |
| "reward_std": 0.8829309791326523, | |
| "rewards/cosine_scaled_reward": 0.357637744396925, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 413 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 387.56945037841797, | |
| "epoch": 0.3548317977287337, | |
| "grad_norm": 7.426985740661621, | |
| "kl": 1.640625, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.2077, | |
| "reward": 0.7159564755856991, | |
| "reward_std": 0.6246453821659088, | |
| "rewards/cosine_scaled_reward": 0.15658933855593204, | |
| "rewards/format_reward": 0.4027777872979641, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 422.47222900390625, | |
| "epoch": 0.3556888793657596, | |
| "grad_norm": 9.64609146118164, | |
| "kl": 1.505859375, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0921, | |
| "reward": 0.9304980635643005, | |
| "reward_std": 0.908847376704216, | |
| "rewards/cosine_scaled_reward": 0.2291379189118743, | |
| "rewards/format_reward": 0.4722222238779068, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 430.3055648803711, | |
| "epoch": 0.3565459610027855, | |
| "grad_norm": 46.99845886230469, | |
| "kl": 1.486328125, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.1551, | |
| "reward": 0.926967169623822, | |
| "reward_std": 0.8230779618024826, | |
| "rewards/cosine_scaled_reward": 0.23431690875440836, | |
| "rewards/format_reward": 0.45833333022892475, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 455.1527786254883, | |
| "epoch": 0.35740304263981143, | |
| "grad_norm": 11.308908462524414, | |
| "kl": 1.482421875, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.2065, | |
| "reward": 1.206569030880928, | |
| "reward_std": 0.852574422955513, | |
| "rewards/cosine_scaled_reward": 0.27689564414322376, | |
| "rewards/format_reward": 0.6527777910232544, | |
| "step": 417 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 449.0, | |
| "epoch": 0.35826012427683734, | |
| "grad_norm": 94.13972473144531, | |
| "kl": 1.634765625, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.1649, | |
| "reward": 1.0392098128795624, | |
| "reward_std": 0.7440510094165802, | |
| "rewards/cosine_scaled_reward": 0.2418271228671074, | |
| "rewards/format_reward": 0.5555555671453476, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 364.3472213745117, | |
| "epoch": 0.3591172059138633, | |
| "grad_norm": 13.36479663848877, | |
| "kl": 1.822265625, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0298, | |
| "reward": 1.1378347873687744, | |
| "reward_std": 0.9185468256473541, | |
| "rewards/cosine_scaled_reward": 0.26336181070655584, | |
| "rewards/format_reward": 0.6111111268401146, | |
| "step": 419 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 406.4027786254883, | |
| "epoch": 0.3599742875508892, | |
| "grad_norm": 8.439767837524414, | |
| "kl": 1.845703125, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.1749, | |
| "reward": 1.267815724015236, | |
| "reward_std": 0.7989730685949326, | |
| "rewards/cosine_scaled_reward": 0.38390786573290825, | |
| "rewards/format_reward": 0.5, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 461.3472213745117, | |
| "epoch": 0.36083136918791514, | |
| "grad_norm": 22.24334716796875, | |
| "kl": 1.705078125, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.137, | |
| "reward": 0.7026118133217096, | |
| "reward_std": 0.6709327548742294, | |
| "rewards/cosine_scaled_reward": 0.10825034603476524, | |
| "rewards/format_reward": 0.486111119389534, | |
| "step": 421 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 429.2916717529297, | |
| "epoch": 0.36168845082494105, | |
| "grad_norm": 17.320945739746094, | |
| "kl": 1.8046875, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.3511, | |
| "reward": 1.3339325338602066, | |
| "reward_std": 0.9927608072757721, | |
| "rewards/cosine_scaled_reward": 0.3891884870827198, | |
| "rewards/format_reward": 0.555555559694767, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 391.02779388427734, | |
| "epoch": 0.362545532461967, | |
| "grad_norm": 13.228276252746582, | |
| "kl": 1.7890625, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.1998, | |
| "reward": 0.7798476368188858, | |
| "reward_std": 0.8442924916744232, | |
| "rewards/cosine_scaled_reward": 0.1815904900431633, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 423 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 419.55555725097656, | |
| "epoch": 0.36340261409899294, | |
| "grad_norm": 4.867981910705566, | |
| "kl": 1.88671875, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.141, | |
| "reward": 0.38255439326167107, | |
| "reward_std": 0.6649321764707565, | |
| "rewards/cosine_scaled_reward": 0.003777193371206522, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 444.80554962158203, | |
| "epoch": 0.36425969573601885, | |
| "grad_norm": 19.474985122680664, | |
| "kl": 1.388671875, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.2453, | |
| "reward": 0.8091739304363728, | |
| "reward_std": 0.6519715338945389, | |
| "rewards/cosine_scaled_reward": 0.1476425053551793, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 364.4861068725586, | |
| "epoch": 0.36511677737304477, | |
| "grad_norm": 5.00600004196167, | |
| "kl": 1.5703125, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0966, | |
| "reward": 1.0419821739196777, | |
| "reward_std": 0.8779765665531158, | |
| "rewards/cosine_scaled_reward": 0.24321329407393932, | |
| "rewards/format_reward": 0.5555555745959282, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 403.34722900390625, | |
| "epoch": 0.36597385901007073, | |
| "grad_norm": 17.935285568237305, | |
| "kl": 2.138671875, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.1828, | |
| "reward": 0.46462448686361313, | |
| "reward_std": 0.658824697136879, | |
| "rewards/cosine_scaled_reward": 0.01703446265310049, | |
| "rewards/format_reward": 0.430555559694767, | |
| "step": 427 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 457.01390075683594, | |
| "epoch": 0.36683094064709665, | |
| "grad_norm": 24.413970947265625, | |
| "kl": 1.9921875, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.2397, | |
| "reward": 0.9884657636284828, | |
| "reward_std": 0.9193740636110306, | |
| "rewards/cosine_scaled_reward": 0.2928439930547029, | |
| "rewards/format_reward": 0.4027777835726738, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 411.9027786254883, | |
| "epoch": 0.36768802228412256, | |
| "grad_norm": 6.04814338684082, | |
| "kl": 2.146484375, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.2913, | |
| "reward": 0.7584970518946648, | |
| "reward_std": 0.8831272125244141, | |
| "rewards/cosine_scaled_reward": 0.10841521085239947, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 429 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 378.62500762939453, | |
| "epoch": 0.3685451039211485, | |
| "grad_norm": 16.954452514648438, | |
| "kl": 1.873046875, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.2931, | |
| "reward": 0.9981023781001568, | |
| "reward_std": 0.7928592413663864, | |
| "rewards/cosine_scaled_reward": 0.26988449646160007, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 461.6527862548828, | |
| "epoch": 0.3694021855581744, | |
| "grad_norm": 16.10793113708496, | |
| "kl": 1.787109375, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.2419, | |
| "reward": 0.9780825227499008, | |
| "reward_std": 0.897609755396843, | |
| "rewards/cosine_scaled_reward": 0.22515234909951687, | |
| "rewards/format_reward": 0.5277777910232544, | |
| "step": 431 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 428.72222900390625, | |
| "epoch": 0.37025926719520036, | |
| "grad_norm": 6.1311869621276855, | |
| "kl": 1.884765625, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.2316, | |
| "reward": 1.065040536224842, | |
| "reward_std": 0.827082633972168, | |
| "rewards/cosine_scaled_reward": 0.24779804050922394, | |
| "rewards/format_reward": 0.5694444626569748, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 391.5, | |
| "epoch": 0.3711163488322263, | |
| "grad_norm": 19.106285095214844, | |
| "kl": 1.501953125, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0608, | |
| "reward": 1.2501190304756165, | |
| "reward_std": 0.6760459691286087, | |
| "rewards/cosine_scaled_reward": 0.36811505258083344, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 433 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 399.8472213745117, | |
| "epoch": 0.3719734304692522, | |
| "grad_norm": 17.120777130126953, | |
| "kl": 1.87890625, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.2051, | |
| "reward": 0.7955693230032921, | |
| "reward_std": 0.6731881201267242, | |
| "rewards/cosine_scaled_reward": 0.11306244693696499, | |
| "rewards/format_reward": 0.5694444626569748, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 464.7083435058594, | |
| "epoch": 0.3728305121062781, | |
| "grad_norm": 17.019750595092773, | |
| "kl": 1.630859375, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.2255, | |
| "reward": 0.9147238731384277, | |
| "reward_std": 0.8846637308597565, | |
| "rewards/cosine_scaled_reward": 0.16569526493549347, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 374.4861145019531, | |
| "epoch": 0.37368759374330407, | |
| "grad_norm": 13.612043380737305, | |
| "kl": 2.029296875, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.2119, | |
| "reward": 0.624715980142355, | |
| "reward_std": 0.8500286191701889, | |
| "rewards/cosine_scaled_reward": 0.06930242432281375, | |
| "rewards/format_reward": 0.4861111119389534, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 380.12500381469727, | |
| "epoch": 0.37454467538033, | |
| "grad_norm": 19.181344985961914, | |
| "kl": 1.3828125, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0972, | |
| "reward": 1.1469180285930634, | |
| "reward_std": 0.8286690264940262, | |
| "rewards/cosine_scaled_reward": 0.2609590096399188, | |
| "rewards/format_reward": 0.625, | |
| "step": 437 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 437.75000762939453, | |
| "epoch": 0.3754017570173559, | |
| "grad_norm": 32.06253433227539, | |
| "kl": 1.892578125, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.1621, | |
| "reward": 0.9302653223276138, | |
| "reward_std": 0.9272859841585159, | |
| "rewards/cosine_scaled_reward": 0.22902152687311172, | |
| "rewards/format_reward": 0.4722222313284874, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 427.75, | |
| "epoch": 0.3762588386543818, | |
| "grad_norm": 12.294611930847168, | |
| "kl": 2.0078125, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.2887, | |
| "reward": 0.9489690512418747, | |
| "reward_std": 0.91136734187603, | |
| "rewards/cosine_scaled_reward": 0.16892896872013807, | |
| "rewards/format_reward": 0.6111111119389534, | |
| "step": 439 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 392.9027862548828, | |
| "epoch": 0.3771159202914078, | |
| "grad_norm": 11.586893081665039, | |
| "kl": 1.779296875, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.1511, | |
| "reward": 1.3165553212165833, | |
| "reward_std": 1.0282287746667862, | |
| "rewards/cosine_scaled_reward": 0.40827762335538864, | |
| "rewards/format_reward": 0.5, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 378.2777786254883, | |
| "epoch": 0.3779730019284337, | |
| "grad_norm": 7.445328235626221, | |
| "kl": 1.876953125, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.2252, | |
| "reward": 0.4708889238536358, | |
| "reward_std": 0.5369373112916946, | |
| "rewards/cosine_scaled_reward": 0.01322223711758852, | |
| "rewards/format_reward": 0.4444444552063942, | |
| "step": 441 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 366.52777099609375, | |
| "epoch": 0.3788300835654596, | |
| "grad_norm": 37.63726043701172, | |
| "kl": 1.6171875, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.2202, | |
| "reward": 0.9833096265792847, | |
| "reward_std": 0.6800422966480255, | |
| "rewards/cosine_scaled_reward": 0.19998812582343817, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 433.93055725097656, | |
| "epoch": 0.3796871652024855, | |
| "grad_norm": 10.03159236907959, | |
| "kl": 1.92578125, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.1216, | |
| "reward": 0.8934760093688965, | |
| "reward_std": 0.8315132707357407, | |
| "rewards/cosine_scaled_reward": 0.23840466793626547, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 443 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 408.19444274902344, | |
| "epoch": 0.3805442468395115, | |
| "grad_norm": 10.382827758789062, | |
| "kl": 1.80859375, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.1845, | |
| "reward": 0.8439731672406197, | |
| "reward_std": 0.8578460216522217, | |
| "rewards/cosine_scaled_reward": 0.19976436160504818, | |
| "rewards/format_reward": 0.4444444552063942, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 397.7777786254883, | |
| "epoch": 0.3814013284765374, | |
| "grad_norm": 25.172704696655273, | |
| "kl": 1.984375, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.2339, | |
| "reward": 0.9671718925237656, | |
| "reward_std": 0.8240036368370056, | |
| "rewards/cosine_scaled_reward": 0.2613637112081051, | |
| "rewards/format_reward": 0.4444444552063942, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 406.43055725097656, | |
| "epoch": 0.3822584101135633, | |
| "grad_norm": 5.355282783508301, | |
| "kl": 1.91015625, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.1568, | |
| "reward": 0.802594855427742, | |
| "reward_std": 0.7209294140338898, | |
| "rewards/cosine_scaled_reward": 0.13740853779017925, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 387.69444274902344, | |
| "epoch": 0.38311549175058923, | |
| "grad_norm": 7.822175979614258, | |
| "kl": 1.595703125, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.1097, | |
| "reward": 0.7201132848858833, | |
| "reward_std": 0.9133000522851944, | |
| "rewards/cosine_scaled_reward": 0.1447788504883647, | |
| "rewards/format_reward": 0.4305555671453476, | |
| "step": 447 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 412.4722213745117, | |
| "epoch": 0.38397257338761515, | |
| "grad_norm": 7.698829650878906, | |
| "kl": 1.88671875, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.1968, | |
| "reward": 0.8293040692806244, | |
| "reward_std": 0.9215101897716522, | |
| "rewards/cosine_scaled_reward": 0.1715964898467064, | |
| "rewards/format_reward": 0.486111119389534, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 393.54166412353516, | |
| "epoch": 0.3848296550246411, | |
| "grad_norm": 9.48220157623291, | |
| "kl": 1.6025390625, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.149, | |
| "reward": 0.9041518270969391, | |
| "reward_std": 0.6833358332514763, | |
| "rewards/cosine_scaled_reward": 0.18124257400631905, | |
| "rewards/format_reward": 0.5416666641831398, | |
| "step": 449 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 418.66666412353516, | |
| "epoch": 0.38568673666166703, | |
| "grad_norm": 23.154109954833984, | |
| "kl": 1.86328125, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.1058, | |
| "reward": 0.7730568274855614, | |
| "reward_std": 0.8151258826255798, | |
| "rewards/cosine_scaled_reward": 0.17819508351385593, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 412.0, | |
| "epoch": 0.38654381829869294, | |
| "grad_norm": 11.1907377243042, | |
| "kl": 1.623046875, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.1147, | |
| "reward": 0.7459932379424572, | |
| "reward_std": 0.6991625279188156, | |
| "rewards/cosine_scaled_reward": 0.09521883772686124, | |
| "rewards/format_reward": 0.5555555522441864, | |
| "step": 451 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 379.6388931274414, | |
| "epoch": 0.38740089993571886, | |
| "grad_norm": 38.63614273071289, | |
| "kl": 1.52734375, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0696, | |
| "reward": 0.8107917159795761, | |
| "reward_std": 0.8233655989170074, | |
| "rewards/cosine_scaled_reward": 0.11372919054701924, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 452 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 400.44444274902344, | |
| "epoch": 0.3882579815727448, | |
| "grad_norm": 7.2680439949035645, | |
| "kl": 1.779296875, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.2399, | |
| "reward": 0.8938721343874931, | |
| "reward_std": 0.8410957902669907, | |
| "rewards/cosine_scaled_reward": 0.16915827617049217, | |
| "rewards/format_reward": 0.5555555671453476, | |
| "step": 453 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 418.44445037841797, | |
| "epoch": 0.38911506320977074, | |
| "grad_norm": 8.975519180297852, | |
| "kl": 1.748046875, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.1852, | |
| "reward": 1.3426352962851524, | |
| "reward_std": 0.8928624093532562, | |
| "rewards/cosine_scaled_reward": 0.35881765000522137, | |
| "rewards/format_reward": 0.625, | |
| "step": 454 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 399.25000762939453, | |
| "epoch": 0.38997214484679665, | |
| "grad_norm": 8.324163436889648, | |
| "kl": 1.79296875, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.305, | |
| "reward": 0.5967597924172878, | |
| "reward_std": 0.6817344427108765, | |
| "rewards/cosine_scaled_reward": 0.06921320641413331, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 366.3888931274414, | |
| "epoch": 0.39082922648382257, | |
| "grad_norm": 6.22599458694458, | |
| "kl": 1.716796875, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.195, | |
| "reward": 1.2024425864219666, | |
| "reward_std": 0.9370259791612625, | |
| "rewards/cosine_scaled_reward": 0.30955461598932743, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 456 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 379.9861145019531, | |
| "epoch": 0.39168630812084854, | |
| "grad_norm": 10.595834732055664, | |
| "kl": 2.078125, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.1005, | |
| "reward": 1.4162960648536682, | |
| "reward_std": 0.9950994998216629, | |
| "rewards/cosine_scaled_reward": 0.3887035697698593, | |
| "rewards/format_reward": 0.6388889104127884, | |
| "step": 457 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 398.97222900390625, | |
| "epoch": 0.39254338975787445, | |
| "grad_norm": 9.582780838012695, | |
| "kl": 1.748046875, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.1435, | |
| "reward": 0.8297743499279022, | |
| "reward_std": 0.6032012775540352, | |
| "rewards/cosine_scaled_reward": 0.1301649445667863, | |
| "rewards/format_reward": 0.569444440305233, | |
| "step": 458 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 416.4166793823242, | |
| "epoch": 0.39340047139490036, | |
| "grad_norm": 13.351261138916016, | |
| "kl": 1.697265625, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.1826, | |
| "reward": 1.3233585357666016, | |
| "reward_std": 0.9728284627199173, | |
| "rewards/cosine_scaled_reward": 0.3561236932873726, | |
| "rewards/format_reward": 0.6111111268401146, | |
| "step": 459 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 389.50000762939453, | |
| "epoch": 0.3942575530319263, | |
| "grad_norm": 7.549479007720947, | |
| "kl": 1.9921875, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.2677, | |
| "reward": 1.1245174407958984, | |
| "reward_std": 0.7800677567720413, | |
| "rewards/cosine_scaled_reward": 0.2705920338630676, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 398.0277862548828, | |
| "epoch": 0.3951146346689522, | |
| "grad_norm": 9.143904685974121, | |
| "kl": 1.7109375, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.2179, | |
| "reward": 0.8851038962602615, | |
| "reward_std": 0.7892615795135498, | |
| "rewards/cosine_scaled_reward": 0.15782971866428852, | |
| "rewards/format_reward": 0.5694444552063942, | |
| "step": 461 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 373.0138931274414, | |
| "epoch": 0.39597171630597816, | |
| "grad_norm": 6.645680904388428, | |
| "kl": 1.9296875, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.2493, | |
| "reward": 1.0129882618784904, | |
| "reward_std": 0.790315642952919, | |
| "rewards/cosine_scaled_reward": 0.22177189541980624, | |
| "rewards/format_reward": 0.5694444626569748, | |
| "step": 462 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 394.30555725097656, | |
| "epoch": 0.3968287979430041, | |
| "grad_norm": 8.618224143981934, | |
| "kl": 1.84765625, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.1754, | |
| "reward": 1.06332229077816, | |
| "reward_std": 0.9512833207845688, | |
| "rewards/cosine_scaled_reward": 0.2677722591906786, | |
| "rewards/format_reward": 0.5277777910232544, | |
| "step": 463 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 327.58333587646484, | |
| "epoch": 0.39768587958003, | |
| "grad_norm": 6.552804470062256, | |
| "kl": 2.2421875, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.1814, | |
| "reward": 1.1924152821302414, | |
| "reward_std": 0.8387909829616547, | |
| "rewards/cosine_scaled_reward": 0.2975965216755867, | |
| "rewards/format_reward": 0.5972222313284874, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 381.45833587646484, | |
| "epoch": 0.3985429612170559, | |
| "grad_norm": 23.327707290649414, | |
| "kl": 1.900390625, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.2125, | |
| "reward": 0.9507962316274643, | |
| "reward_std": 0.8488497734069824, | |
| "rewards/cosine_scaled_reward": 0.21845365059562027, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 388.25000762939453, | |
| "epoch": 0.39940004285408187, | |
| "grad_norm": 16.77749252319336, | |
| "kl": 2.42578125, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.1887, | |
| "reward": 1.0224937349557877, | |
| "reward_std": 0.7094171047210693, | |
| "rewards/cosine_scaled_reward": 0.2543024020269513, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 466 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 395.4027786254883, | |
| "epoch": 0.4002571244911078, | |
| "grad_norm": 22.206117630004883, | |
| "kl": 1.734375, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.1153, | |
| "reward": 1.1953821629285812, | |
| "reward_std": 0.9889565110206604, | |
| "rewards/cosine_scaled_reward": 0.3338021747767925, | |
| "rewards/format_reward": 0.5277777835726738, | |
| "step": 467 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 443.1805648803711, | |
| "epoch": 0.4011142061281337, | |
| "grad_norm": 21.06090545654297, | |
| "kl": 2.35546875, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.2039, | |
| "reward": 0.6936846375465393, | |
| "reward_std": 0.7559118419885635, | |
| "rewards/cosine_scaled_reward": 0.15934233367443085, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 468 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 375.4027786254883, | |
| "epoch": 0.4019712877651596, | |
| "grad_norm": 11.80557632446289, | |
| "kl": 1.93359375, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.31, | |
| "reward": 1.1770465597510338, | |
| "reward_std": 0.9983221143484116, | |
| "rewards/cosine_scaled_reward": 0.3385232575237751, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 469 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 398.9722213745117, | |
| "epoch": 0.4028283694021856, | |
| "grad_norm": 3.4314489364624023, | |
| "kl": 2.005859375, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.3436, | |
| "reward": 0.823687631636858, | |
| "reward_std": 1.019385039806366, | |
| "rewards/cosine_scaled_reward": 0.21045495197176933, | |
| "rewards/format_reward": 0.4027777872979641, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 405.87500762939453, | |
| "epoch": 0.4036854510392115, | |
| "grad_norm": 28.387819290161133, | |
| "kl": 2.25390625, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.2678, | |
| "reward": 0.5487676113843918, | |
| "reward_std": 0.7938825041055679, | |
| "rewards/cosine_scaled_reward": 0.059106036089360714, | |
| "rewards/format_reward": 0.43055555410683155, | |
| "step": 471 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 405.44444274902344, | |
| "epoch": 0.4045425326762374, | |
| "grad_norm": 14.908821105957031, | |
| "kl": 1.94140625, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0599, | |
| "reward": 0.9214093834161758, | |
| "reward_std": 0.8387825936079025, | |
| "rewards/cosine_scaled_reward": 0.22459355555474758, | |
| "rewards/format_reward": 0.472222238779068, | |
| "step": 472 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 431.7777786254883, | |
| "epoch": 0.4053996143132633, | |
| "grad_norm": 31.563308715820312, | |
| "kl": 2.39453125, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.1599, | |
| "reward": 0.9639946967363358, | |
| "reward_std": 0.9418660998344421, | |
| "rewards/cosine_scaled_reward": 0.23199738003313541, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 473 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 345.43055725097656, | |
| "epoch": 0.4062566959502893, | |
| "grad_norm": 23.537080764770508, | |
| "kl": 2.203125, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.1802, | |
| "reward": 0.9301662147045135, | |
| "reward_std": 0.9208797365427017, | |
| "rewards/cosine_scaled_reward": 0.24286089045926929, | |
| "rewards/format_reward": 0.4444444552063942, | |
| "step": 474 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 407.0416717529297, | |
| "epoch": 0.4071137775873152, | |
| "grad_norm": 113.17594909667969, | |
| "kl": 2.009765625, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.1716, | |
| "reward": 0.6663865000009537, | |
| "reward_std": 0.6882055103778839, | |
| "rewards/cosine_scaled_reward": 0.12485991045832634, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 392.68055725097656, | |
| "epoch": 0.4079708592243411, | |
| "grad_norm": 7.956409454345703, | |
| "kl": 2.01171875, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.1263, | |
| "reward": 1.0667885690927505, | |
| "reward_std": 0.7681434005498886, | |
| "rewards/cosine_scaled_reward": 0.2347831572405994, | |
| "rewards/format_reward": 0.597222238779068, | |
| "step": 476 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 415.0416564941406, | |
| "epoch": 0.40882794086136703, | |
| "grad_norm": 9.29298210144043, | |
| "kl": 2.6171875, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.2517, | |
| "reward": 0.8619468212127686, | |
| "reward_std": 0.951014369726181, | |
| "rewards/cosine_scaled_reward": 0.17402894236147404, | |
| "rewards/format_reward": 0.5138888955116272, | |
| "step": 477 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 420.3611145019531, | |
| "epoch": 0.40968502249839295, | |
| "grad_norm": 11.378466606140137, | |
| "kl": 1.888671875, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.1412, | |
| "reward": 0.7714032009243965, | |
| "reward_std": 0.8700239658355713, | |
| "rewards/cosine_scaled_reward": 0.15653494279831648, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 478 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 368.3333435058594, | |
| "epoch": 0.4105421041354189, | |
| "grad_norm": 14.92309284210205, | |
| "kl": 2.076171875, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.1466, | |
| "reward": 1.3654054403305054, | |
| "reward_std": 0.9904145002365112, | |
| "rewards/cosine_scaled_reward": 0.4327027127146721, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 479 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 401.55555725097656, | |
| "epoch": 0.41139918577244483, | |
| "grad_norm": 18.368589401245117, | |
| "kl": 2.294921875, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.1576, | |
| "reward": 1.0128132551908493, | |
| "reward_std": 0.808688297867775, | |
| "rewards/cosine_scaled_reward": 0.24251772370189428, | |
| "rewards/format_reward": 0.527777798473835, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 394.93054962158203, | |
| "epoch": 0.41225626740947074, | |
| "grad_norm": 15.127246856689453, | |
| "kl": 2.04296875, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.1467, | |
| "reward": 0.712806798517704, | |
| "reward_std": 1.0079237669706345, | |
| "rewards/cosine_scaled_reward": 0.15501452051103115, | |
| "rewards/format_reward": 0.4027777910232544, | |
| "step": 481 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 508.3333282470703, | |
| "epoch": 0.41311334904649666, | |
| "grad_norm": 14.581649780273438, | |
| "kl": 1.87890625, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0734, | |
| "reward": 1.3022873476147652, | |
| "reward_std": 0.9221579432487488, | |
| "rewards/cosine_scaled_reward": 0.36642143689095974, | |
| "rewards/format_reward": 0.5694444477558136, | |
| "step": 482 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 448.06945037841797, | |
| "epoch": 0.4139704306835226, | |
| "grad_norm": 16.139543533325195, | |
| "kl": 1.8984375, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.1623, | |
| "reward": 0.8377330377697945, | |
| "reward_std": 0.822199173271656, | |
| "rewards/cosine_scaled_reward": 0.17581098433583975, | |
| "rewards/format_reward": 0.4861111119389534, | |
| "step": 483 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 371.7222213745117, | |
| "epoch": 0.41482751232054854, | |
| "grad_norm": 39.747249603271484, | |
| "kl": 2.271484375, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0188, | |
| "reward": 0.9065616875886917, | |
| "reward_std": 0.8971037119626999, | |
| "rewards/cosine_scaled_reward": 0.2032808493822813, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 484 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 411.3472213745117, | |
| "epoch": 0.41568459395757446, | |
| "grad_norm": 12.812687873840332, | |
| "kl": 2.158203125, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.1918, | |
| "reward": 0.7322559207677841, | |
| "reward_std": 0.8131649047136307, | |
| "rewards/cosine_scaled_reward": 0.08835018612444401, | |
| "rewards/format_reward": 0.555555559694767, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 362.08333587646484, | |
| "epoch": 0.41654167559460037, | |
| "grad_norm": 35.39934539794922, | |
| "kl": 1.7734375, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.1173, | |
| "reward": 0.9961818382143974, | |
| "reward_std": 0.6925189048051834, | |
| "rewards/cosine_scaled_reward": 0.22725759260356426, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 486 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 349.7777786254883, | |
| "epoch": 0.41739875723162634, | |
| "grad_norm": 35.59743881225586, | |
| "kl": 1.703125, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.1241, | |
| "reward": 0.8367552310228348, | |
| "reward_std": 0.638402059674263, | |
| "rewards/cosine_scaled_reward": 0.14059981796890497, | |
| "rewards/format_reward": 0.555555559694767, | |
| "step": 487 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 446.94444274902344, | |
| "epoch": 0.41825583886865225, | |
| "grad_norm": 9.523124694824219, | |
| "kl": 2.037109375, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.1536, | |
| "reward": 0.9173677563667297, | |
| "reward_std": 0.8871555328369141, | |
| "rewards/cosine_scaled_reward": 0.15312829986214638, | |
| "rewards/format_reward": 0.611111119389534, | |
| "step": 488 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 406.5416717529297, | |
| "epoch": 0.41911292050567817, | |
| "grad_norm": 18.345104217529297, | |
| "kl": 1.677734375, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.2052, | |
| "reward": 1.6547060012817383, | |
| "reward_std": 0.9650345891714096, | |
| "rewards/cosine_scaled_reward": 0.5217974632978439, | |
| "rewards/format_reward": 0.611111119389534, | |
| "step": 489 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 414.1527786254883, | |
| "epoch": 0.4199700021427041, | |
| "grad_norm": 31.926456451416016, | |
| "kl": 1.89453125, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.1587, | |
| "reward": 1.0247105360031128, | |
| "reward_std": 0.9507659077644348, | |
| "rewards/cosine_scaled_reward": 0.2692996822297573, | |
| "rewards/format_reward": 0.4861111268401146, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 367.19445037841797, | |
| "epoch": 0.42082708377973, | |
| "grad_norm": 128.7314453125, | |
| "kl": 1.83203125, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0835, | |
| "reward": 0.910501167178154, | |
| "reward_std": 0.8024759143590927, | |
| "rewards/cosine_scaled_reward": 0.17747278325259686, | |
| "rewards/format_reward": 0.5555555671453476, | |
| "step": 491 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 438.0416717529297, | |
| "epoch": 0.42168416541675596, | |
| "grad_norm": 6.026768684387207, | |
| "kl": 1.734375, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.1438, | |
| "reward": 1.2874046564102173, | |
| "reward_std": 0.817937821149826, | |
| "rewards/cosine_scaled_reward": 0.40064676851034164, | |
| "rewards/format_reward": 0.4861111268401146, | |
| "step": 492 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 419.15277099609375, | |
| "epoch": 0.4225412470537819, | |
| "grad_norm": 23.038436889648438, | |
| "kl": 1.763671875, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.2819, | |
| "reward": 1.25694739818573, | |
| "reward_std": 0.8772137686610222, | |
| "rewards/cosine_scaled_reward": 0.32291813008487225, | |
| "rewards/format_reward": 0.6111111119389534, | |
| "step": 493 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 414.87500762939453, | |
| "epoch": 0.4233983286908078, | |
| "grad_norm": 41.3348274230957, | |
| "kl": 1.759765625, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0636, | |
| "reward": 1.1973340883851051, | |
| "reward_std": 0.8486432880163193, | |
| "rewards/cosine_scaled_reward": 0.32088930322788656, | |
| "rewards/format_reward": 0.5555555745959282, | |
| "step": 494 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 472.7361068725586, | |
| "epoch": 0.4242554103278337, | |
| "grad_norm": 19.50507354736328, | |
| "kl": 1.740234375, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.1005, | |
| "reward": 1.1010611280798912, | |
| "reward_std": 0.7267665565013885, | |
| "rewards/cosine_scaled_reward": 0.238030556589365, | |
| "rewards/format_reward": 0.625, | |
| "step": 495 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 442.76390075683594, | |
| "epoch": 0.4251124919648597, | |
| "grad_norm": 17.064443588256836, | |
| "kl": 1.947265625, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.1467, | |
| "reward": 0.777456559240818, | |
| "reward_std": 0.8870294690132141, | |
| "rewards/cosine_scaled_reward": 0.12483939621597528, | |
| "rewards/format_reward": 0.5277777910232544, | |
| "step": 496 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 459.98609924316406, | |
| "epoch": 0.4259695736018856, | |
| "grad_norm": 26.50463104248047, | |
| "kl": 1.8984375, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.1413, | |
| "reward": 0.7809455767273903, | |
| "reward_std": 0.739835649728775, | |
| "rewards/cosine_scaled_reward": 0.11963944719173014, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 497 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 429.84722900390625, | |
| "epoch": 0.4268266552389115, | |
| "grad_norm": 7.440047740936279, | |
| "kl": 2.583984375, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.2265, | |
| "reward": 0.9033599346876144, | |
| "reward_std": 1.034580335021019, | |
| "rewards/cosine_scaled_reward": 0.20167998038232327, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 498 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 430.4166717529297, | |
| "epoch": 0.4276837368759374, | |
| "grad_norm": 9.689515113830566, | |
| "kl": 1.7890625, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.1179, | |
| "reward": 0.6143735200166702, | |
| "reward_std": 0.6945628225803375, | |
| "rewards/cosine_scaled_reward": 0.02246453333646059, | |
| "rewards/format_reward": 0.5694444477558136, | |
| "step": 499 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 378.05555725097656, | |
| "epoch": 0.4285408185129634, | |
| "grad_norm": 44.88062286376953, | |
| "kl": 1.921875, | |
| "learning_rate": 1e-07, | |
| "loss": 0.127, | |
| "reward": 1.3835089206695557, | |
| "reward_std": 1.1280009299516678, | |
| "rewards/cosine_scaled_reward": 0.4000878185033798, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4285408185129634, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.15917640645144274, | |
| "train_runtime": 31189.1196, | |
| "train_samples_per_second": 1.154, | |
| "train_steps_per_second": 0.016 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |