diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7042 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 1502.0476379394531, + "epoch": 0.004, + "grad_norm": 0.17851164937019348, + "kl": 0.0, + "learning_rate": 2e-08, + "loss": -0.0206, + "reward": -0.23460307717323303, + "reward_std": 0.13429159671068192, + "rewards/cosine_scaled_reward": -0.11730154044926167, + "rewards/format_reward": 0.0, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 1483.3928833007812, + "epoch": 0.008, + "grad_norm": 0.20364968478679657, + "kl": 0.0, + "learning_rate": 4e-08, + "loss": -0.0247, + "reward": -0.24404804036021233, + "reward_std": 0.15527689084410667, + "rewards/cosine_scaled_reward": -0.12202401272952557, + "rewards/format_reward": 0.0, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.7857360839844, + "epoch": 0.012, + "grad_norm": 0.2454664260149002, + "kl": -2.008676528930664e-05, + "learning_rate": 6e-08, + "loss": -0.024, + "reward": -0.2683428265154362, + "reward_std": 0.1479046531021595, + "rewards/cosine_scaled_reward": -0.13417141698300838, + "rewards/format_reward": 0.0, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 1531.5595397949219, + "epoch": 0.016, + "grad_norm": 0.26290684938430786, + "kl": -1.3932585716247559e-06, + "learning_rate": 8e-08, + "loss": -0.0063, + "reward": -0.22999461740255356, + "reward_std": 0.13789904117584229, + "rewards/cosine_scaled_reward": -0.11499731056392193, + "rewards/format_reward": 0.0, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 1496.1845703125, + "epoch": 0.02, + "grad_norm": 0.25596341490745544, + "kl": -5.0514936447143555e-06, + "learning_rate": 1e-07, + "loss": -0.0425, + "reward": -0.23508117347955704, + "reward_std": 0.15481781959533691, + "rewards/cosine_scaled_reward": -0.11754059046506882, + "rewards/format_reward": 0.0, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.8690795898438, + "epoch": 0.024, + "grad_norm": 0.24606633186340332, + "kl": -1.0028481483459473e-05, + "learning_rate": 1.2e-07, + "loss": -0.0007, + "reward": -0.25243763625621796, + "reward_std": 0.13587487116456032, + "rewards/cosine_scaled_reward": -0.12621882185339928, + "rewards/format_reward": 0.0, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 1527.5238037109375, + "epoch": 0.028, + "grad_norm": 0.30039116740226746, + "kl": 3.2372772693634033e-06, + "learning_rate": 1.4e-07, + "loss": 0.0092, + "reward": -0.221938356757164, + "reward_std": 0.13823154009878635, + "rewards/cosine_scaled_reward": -0.11096917279064655, + "rewards/format_reward": 0.0, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 1509.2143249511719, + "epoch": 0.032, + "grad_norm": 0.2639496624469757, + "kl": -3.641587682068348e-06, + "learning_rate": 1.6e-07, + "loss": -0.035, + "reward": -0.27191318944096565, + "reward_std": 0.15618360042572021, + "rewards/cosine_scaled_reward": -0.13595658540725708, + "rewards/format_reward": 0.0, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.0535888671875, + "epoch": 0.036, + "grad_norm": 0.21626578271389008, + "kl": 7.82310962677002e-08, + "learning_rate": 1.8e-07, + "loss": -0.0261, + "reward": -0.24237940087914467, + "reward_std": 0.15605646930634975, + "rewards/cosine_scaled_reward": -0.12118970789015293, + "rewards/format_reward": 0.0, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.0357360839844, + "epoch": 0.04, + "grad_norm": 0.26584091782569885, + "kl": -6.256159394979477e-06, + "learning_rate": 2e-07, + "loss": -0.0261, + "reward": -0.24382107332348824, + "reward_std": 0.147519638761878, + "rewards/cosine_scaled_reward": -0.12191054411232471, + "rewards/format_reward": 0.0, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.6190795898438, + "epoch": 0.044, + "grad_norm": 0.26533427834510803, + "kl": -5.21540641784668e-06, + "learning_rate": 2.1999999999999998e-07, + "loss": -0.0136, + "reward": -0.26315416768193245, + "reward_std": 0.14432235062122345, + "rewards/cosine_scaled_reward": -0.13157708384096622, + "rewards/format_reward": 0.0, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.6785888671875, + "epoch": 0.048, + "grad_norm": 0.23109059035778046, + "kl": 4.7460198402404785e-06, + "learning_rate": 2.4e-07, + "loss": -0.0225, + "reward": -0.2342548444867134, + "reward_std": 0.1461905539035797, + "rewards/cosine_scaled_reward": -0.117127425968647, + "rewards/format_reward": 0.0, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 1486.6726379394531, + "epoch": 0.052, + "grad_norm": 0.18697038292884827, + "kl": 3.1348317861557007e-06, + "learning_rate": 2.6e-07, + "loss": -0.0475, + "reward": -0.23411722108721733, + "reward_std": 0.16247618943452835, + "rewards/cosine_scaled_reward": -0.11705861054360867, + "rewards/format_reward": 0.0, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 1493.9166870117188, + "epoch": 0.056, + "grad_norm": 0.26729440689086914, + "kl": -6.791204214096069e-06, + "learning_rate": 2.8e-07, + "loss": -0.0147, + "reward": -0.2375863455235958, + "reward_std": 0.1451248899102211, + "rewards/cosine_scaled_reward": -0.1187931690365076, + "rewards/format_reward": 0.0, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 1493.3392944335938, + "epoch": 0.06, + "grad_norm": 0.24526046216487885, + "kl": 1.4476478099822998e-05, + "learning_rate": 3e-07, + "loss": -0.0512, + "reward": -0.2283475622534752, + "reward_std": 0.1646866388618946, + "rewards/cosine_scaled_reward": -0.11417377926409245, + "rewards/format_reward": 0.0, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 1495.8809814453125, + "epoch": 0.064, + "grad_norm": 0.21982906758785248, + "kl": 7.815659046173096e-06, + "learning_rate": 3.2e-07, + "loss": -0.033, + "reward": -0.229737039655447, + "reward_std": 0.1594039984047413, + "rewards/cosine_scaled_reward": -0.1148685198277235, + "rewards/format_reward": 0.0, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 1471.8869018554688, + "epoch": 0.068, + "grad_norm": 0.32592612504959106, + "kl": 9.292736649513245e-06, + "learning_rate": 3.4000000000000003e-07, + "loss": -0.0491, + "reward": -0.22187525033950806, + "reward_std": 0.15859584510326385, + "rewards/cosine_scaled_reward": -0.11093762516975403, + "rewards/format_reward": 0.0, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.4702453613281, + "epoch": 0.072, + "grad_norm": 0.27544769644737244, + "kl": 6.070360541343689e-06, + "learning_rate": 3.6e-07, + "loss": -0.0094, + "reward": -0.2488204501569271, + "reward_std": 0.14258970320224762, + "rewards/cosine_scaled_reward": -0.1244102232158184, + "rewards/format_reward": 0.0, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.1428833007812, + "epoch": 0.076, + "grad_norm": 0.2813272774219513, + "kl": 4.976987838745117e-06, + "learning_rate": 3.7999999999999996e-07, + "loss": -0.0174, + "reward": -0.20330505073070526, + "reward_std": 0.12616467103362083, + "rewards/cosine_scaled_reward": -0.10165252350270748, + "rewards/format_reward": 0.0, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 1512.0595397949219, + "epoch": 0.08, + "grad_norm": 0.24715571105480194, + "kl": 1.5944242477416992e-06, + "learning_rate": 4e-07, + "loss": -0.0147, + "reward": -0.22946816682815552, + "reward_std": 0.12867936864495277, + "rewards/cosine_scaled_reward": -0.11473408341407776, + "rewards/format_reward": 0.0, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.6428833007812, + "epoch": 0.084, + "grad_norm": 0.2901928424835205, + "kl": 6.8731606006622314e-06, + "learning_rate": 4.1999999999999995e-07, + "loss": -0.0034, + "reward": -0.25839560478925705, + "reward_std": 0.13489549793303013, + "rewards/cosine_scaled_reward": -0.12919779494404793, + "rewards/format_reward": 0.0, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 1488.4642944335938, + "epoch": 0.088, + "grad_norm": 0.2510276436805725, + "kl": 8.165836334228516e-06, + "learning_rate": 4.3999999999999997e-07, + "loss": -0.0203, + "reward": -0.26031654328107834, + "reward_std": 0.148418840020895, + "rewards/cosine_scaled_reward": -0.13015827164053917, + "rewards/format_reward": 0.0, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 1509.4761962890625, + "epoch": 0.092, + "grad_norm": 0.21568480134010315, + "kl": 3.085937350988388e-06, + "learning_rate": 4.6e-07, + "loss": -0.029, + "reward": -0.25878410786390305, + "reward_std": 0.1478017084300518, + "rewards/cosine_scaled_reward": -0.12939205765724182, + "rewards/format_reward": 0.0, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.6667175292969, + "epoch": 0.096, + "grad_norm": 0.20768144726753235, + "kl": 5.729496479034424e-06, + "learning_rate": 4.8e-07, + "loss": -0.0056, + "reward": -0.23294677585363388, + "reward_std": 0.12670473381876945, + "rewards/cosine_scaled_reward": -0.11647338047623634, + "rewards/format_reward": 0.0, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.5059814453125, + "epoch": 0.1, + "grad_norm": 0.20974963903427124, + "kl": 4.060566425323486e-06, + "learning_rate": 5e-07, + "loss": -0.0228, + "reward": -0.23598218336701393, + "reward_std": 0.13139526918530464, + "rewards/cosine_scaled_reward": -0.11799109354615211, + "rewards/format_reward": 0.0, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 1503.90478515625, + "epoch": 0.104, + "grad_norm": 0.31668928265571594, + "kl": 5.900859832763672e-06, + "learning_rate": 5.2e-07, + "loss": -0.0143, + "reward": -0.2467576116323471, + "reward_std": 0.15891429036855698, + "rewards/cosine_scaled_reward": -0.12337880209088326, + "rewards/format_reward": 0.0, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 1499.4166870117188, + "epoch": 0.108, + "grad_norm": 0.2661290764808655, + "kl": 1.9058585166931152e-05, + "learning_rate": 5.4e-07, + "loss": -0.0059, + "reward": -0.2198324091732502, + "reward_std": 0.13804786279797554, + "rewards/cosine_scaled_reward": -0.10991620272397995, + "rewards/format_reward": 0.0, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 1494.3929138183594, + "epoch": 0.112, + "grad_norm": 0.24837514758110046, + "kl": 1.190975308418274e-05, + "learning_rate": 5.6e-07, + "loss": -0.0358, + "reward": -0.2488894909620285, + "reward_std": 0.14470278844237328, + "rewards/cosine_scaled_reward": -0.12444474548101425, + "rewards/format_reward": 0.0, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 1495.5893249511719, + "epoch": 0.116, + "grad_norm": 0.2956860065460205, + "kl": 1.84476375579834e-05, + "learning_rate": 5.8e-07, + "loss": -0.0464, + "reward": -0.258271723985672, + "reward_std": 0.17091093584895134, + "rewards/cosine_scaled_reward": -0.129135861992836, + "rewards/format_reward": 0.0, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 1491.5833740234375, + "epoch": 0.12, + "grad_norm": 0.21843063831329346, + "kl": 3.403425216674805e-05, + "learning_rate": 6e-07, + "loss": -0.025, + "reward": -0.22956868633627892, + "reward_std": 0.13988509960472584, + "rewards/cosine_scaled_reward": -0.11478434316813946, + "rewards/format_reward": 0.0, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 1494.8691101074219, + "epoch": 0.124, + "grad_norm": 0.27265459299087524, + "kl": 3.37064266204834e-05, + "learning_rate": 6.2e-07, + "loss": -0.0028, + "reward": -0.22422148287296295, + "reward_std": 0.13563549891114235, + "rewards/cosine_scaled_reward": -0.11211073212325573, + "rewards/format_reward": 0.0, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 1500.4226379394531, + "epoch": 0.128, + "grad_norm": 0.2947193682193756, + "kl": 4.464387893676758e-05, + "learning_rate": 6.4e-07, + "loss": -0.0253, + "reward": -0.2335178479552269, + "reward_std": 0.15123932622373104, + "rewards/cosine_scaled_reward": -0.11675892770290375, + "rewards/format_reward": 0.0, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 1512.3988342285156, + "epoch": 0.132, + "grad_norm": 0.27640435099601746, + "kl": 3.385916352272034e-05, + "learning_rate": 6.6e-07, + "loss": 0.0004, + "reward": -0.22090798616409302, + "reward_std": 0.1239312905818224, + "rewards/cosine_scaled_reward": -0.11045399680733681, + "rewards/format_reward": 0.0, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.6190795898438, + "epoch": 0.136, + "grad_norm": 0.24311278760433197, + "kl": 2.8401613235473633e-05, + "learning_rate": 6.800000000000001e-07, + "loss": -0.0182, + "reward": -0.24944494664669037, + "reward_std": 0.1345935631543398, + "rewards/cosine_scaled_reward": -0.12472246773540974, + "rewards/format_reward": 0.0, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 1497.9107666015625, + "epoch": 0.14, + "grad_norm": 0.21990132331848145, + "kl": 4.646182060241699e-05, + "learning_rate": 7e-07, + "loss": -0.0132, + "reward": -0.22735398262739182, + "reward_std": 0.14252249151468277, + "rewards/cosine_scaled_reward": -0.11367699131369591, + "rewards/format_reward": 0.0, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.0357360839844, + "epoch": 0.144, + "grad_norm": 0.25016918778419495, + "kl": 5.340576171875e-05, + "learning_rate": 7.2e-07, + "loss": -0.0145, + "reward": -0.22054903954267502, + "reward_std": 0.14707811176776886, + "rewards/cosine_scaled_reward": -0.11027451977133751, + "rewards/format_reward": 0.0, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.1250305175781, + "epoch": 0.148, + "grad_norm": 0.27119770646095276, + "kl": 6.008148193359375e-05, + "learning_rate": 7.4e-07, + "loss": -0.0074, + "reward": -0.21688038110733032, + "reward_std": 0.14238713681697845, + "rewards/cosine_scaled_reward": -0.10844019241631031, + "rewards/format_reward": 0.0, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.2083740234375, + "epoch": 0.152, + "grad_norm": 0.22954951226711273, + "kl": 4.4226646423339844e-05, + "learning_rate": 7.599999999999999e-07, + "loss": -0.025, + "reward": -0.2533186711370945, + "reward_std": 0.13519956730306149, + "rewards/cosine_scaled_reward": -0.12665932811796665, + "rewards/format_reward": 0.0, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.2440490722656, + "epoch": 0.156, + "grad_norm": 0.20884250104427338, + "kl": 3.9130449295043945e-05, + "learning_rate": 7.799999999999999e-07, + "loss": -0.0186, + "reward": -0.2563895806670189, + "reward_std": 0.1503661349415779, + "rewards/cosine_scaled_reward": -0.12819479033350945, + "rewards/format_reward": 0.0, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 1518.8809509277344, + "epoch": 0.16, + "grad_norm": 0.31034034490585327, + "kl": 4.661083221435547e-05, + "learning_rate": 8e-07, + "loss": -0.014, + "reward": -0.24387329444289207, + "reward_std": 0.14256866462528706, + "rewards/cosine_scaled_reward": -0.12193664722144604, + "rewards/format_reward": 0.0, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 1482.1964416503906, + "epoch": 0.164, + "grad_norm": 0.2809925973415375, + "kl": 0.00013720989227294922, + "learning_rate": 8.199999999999999e-07, + "loss": -0.0377, + "reward": -0.23471787199378014, + "reward_std": 0.1464288830757141, + "rewards/cosine_scaled_reward": -0.11735892854630947, + "rewards/format_reward": 0.0, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.0416870117188, + "epoch": 0.168, + "grad_norm": 0.22887176275253296, + "kl": 0.00016939640045166016, + "learning_rate": 8.399999999999999e-07, + "loss": -0.0143, + "reward": -0.2401226907968521, + "reward_std": 0.12950069829821587, + "rewards/cosine_scaled_reward": -0.1200613472610712, + "rewards/format_reward": 0.0, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 1471.5536193847656, + "epoch": 0.172, + "grad_norm": 0.2556413412094116, + "kl": 0.0002295970916748047, + "learning_rate": 8.599999999999999e-07, + "loss": -0.0741, + "reward": -0.2366926297545433, + "reward_std": 0.17133169993758202, + "rewards/cosine_scaled_reward": -0.1183463241904974, + "rewards/format_reward": 0.0, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.7678527832031, + "epoch": 0.176, + "grad_norm": 0.2524208724498749, + "kl": 0.0002219676971435547, + "learning_rate": 8.799999999999999e-07, + "loss": -0.0275, + "reward": -0.25312257930636406, + "reward_std": 0.16574446111917496, + "rewards/cosine_scaled_reward": -0.12656129337847233, + "rewards/format_reward": 0.0, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 1494.7321472167969, + "epoch": 0.18, + "grad_norm": 0.28623661398887634, + "kl": 0.0002727508544921875, + "learning_rate": 9e-07, + "loss": -0.0223, + "reward": -0.24078572914004326, + "reward_std": 0.14357317984104156, + "rewards/cosine_scaled_reward": -0.12039286643266678, + "rewards/format_reward": 0.0, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 1493.5238342285156, + "epoch": 0.184, + "grad_norm": 0.27407318353652954, + "kl": 0.000263214111328125, + "learning_rate": 9.2e-07, + "loss": -0.0368, + "reward": -0.23090793937444687, + "reward_std": 0.15557732805609703, + "rewards/cosine_scaled_reward": -0.11545397154986858, + "rewards/format_reward": 0.0, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.1012268066406, + "epoch": 0.188, + "grad_norm": 0.26540830731391907, + "kl": 0.000286102294921875, + "learning_rate": 9.399999999999999e-07, + "loss": -0.0079, + "reward": -0.21494316309690475, + "reward_std": 0.14235420525074005, + "rewards/cosine_scaled_reward": -0.10747158527374268, + "rewards/format_reward": 0.0, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 1493.4285583496094, + "epoch": 0.192, + "grad_norm": 0.20362912118434906, + "kl": 0.0003046989440917969, + "learning_rate": 9.6e-07, + "loss": -0.0465, + "reward": -0.24233370646834373, + "reward_std": 0.16549209877848625, + "rewards/cosine_scaled_reward": -0.12116685323417187, + "rewards/format_reward": 0.0, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 1496.9107360839844, + "epoch": 0.196, + "grad_norm": 0.23884467780590057, + "kl": 0.00034809112548828125, + "learning_rate": 9.8e-07, + "loss": -0.0294, + "reward": -0.235232163220644, + "reward_std": 0.14046380668878555, + "rewards/cosine_scaled_reward": -0.1176160853356123, + "rewards/format_reward": 0.0, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 1512.2083740234375, + "epoch": 0.2, + "grad_norm": 0.2550745904445648, + "kl": 0.0003681182861328125, + "learning_rate": 1e-06, + "loss": -0.0009, + "reward": -0.14228077605366707, + "reward_std": 0.13498482666909695, + "rewards/cosine_scaled_reward": -0.07114038616418839, + "rewards/format_reward": 0.0, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.3154602050781, + "epoch": 0.204, + "grad_norm": 0.26381194591522217, + "kl": 0.0003437995910644531, + "learning_rate": 9.999890338174275e-07, + "loss": -0.0287, + "reward": -0.24466010928153992, + "reward_std": 0.16256770864129066, + "rewards/cosine_scaled_reward": -0.12233005836606026, + "rewards/format_reward": 0.0, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 1503.1488647460938, + "epoch": 0.208, + "grad_norm": 0.1665869802236557, + "kl": 0.00042724609375, + "learning_rate": 9.999561358041868e-07, + "loss": -0.0102, + "reward": -0.2257002554833889, + "reward_std": 0.1360796671360731, + "rewards/cosine_scaled_reward": -0.11285012774169445, + "rewards/format_reward": 0.0, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 1486.7381286621094, + "epoch": 0.212, + "grad_norm": 0.2398059219121933, + "kl": 0.0004673004150390625, + "learning_rate": 9.999013075636804e-07, + "loss": -0.0184, + "reward": -0.23267855867743492, + "reward_std": 0.14873512834310532, + "rewards/cosine_scaled_reward": -0.11633927933871746, + "rewards/format_reward": 0.0, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 1509.2083435058594, + "epoch": 0.216, + "grad_norm": 0.2826734781265259, + "kl": 0.0006093978881835938, + "learning_rate": 9.998245517681593e-07, + "loss": -0.0234, + "reward": -0.2084299884736538, + "reward_std": 0.1463002786040306, + "rewards/cosine_scaled_reward": -0.1042149942368269, + "rewards/format_reward": 0.0, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.0476379394531, + "epoch": 0.22, + "grad_norm": 0.29020005464553833, + "kl": 0.0004963874816894531, + "learning_rate": 9.997258721585931e-07, + "loss": -0.0355, + "reward": -0.2434540018439293, + "reward_std": 0.17717645689845085, + "rewards/cosine_scaled_reward": -0.12172700092196465, + "rewards/format_reward": 0.0, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.0833435058594, + "epoch": 0.224, + "grad_norm": 0.26805394887924194, + "kl": 0.0004773139953613281, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0009, + "reward": -0.2181985266506672, + "reward_std": 0.13354611210525036, + "rewards/cosine_scaled_reward": -0.10909926891326904, + "rewards/format_reward": 0.0, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.6488342285156, + "epoch": 0.228, + "grad_norm": 0.27495479583740234, + "kl": 0.0012969970703125, + "learning_rate": 9.994627618036452e-07, + "loss": -0.0226, + "reward": -0.2011367231607437, + "reward_std": 0.14377126656472683, + "rewards/cosine_scaled_reward": -0.10056836158037186, + "rewards/format_reward": 0.0, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.52978515625, + "epoch": 0.232, + "grad_norm": 0.22906683385372162, + "kl": 0.002330780029296875, + "learning_rate": 9.992983438818915e-07, + "loss": -0.0165, + "reward": -0.23109900206327438, + "reward_std": 0.13826126232743263, + "rewards/cosine_scaled_reward": -0.11554950661957264, + "rewards/format_reward": 0.0, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 1461.1904907226562, + "epoch": 0.236, + "grad_norm": 0.21256300806999207, + "kl": 0.004283905029296875, + "learning_rate": 9.991120277927223e-07, + "loss": -0.0729, + "reward": -0.19971829652786255, + "reward_std": 0.18633990362286568, + "rewards/cosine_scaled_reward": -0.09985914640128613, + "rewards/format_reward": 0.0, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 1518.7380981445312, + "epoch": 0.24, + "grad_norm": 0.2587541341781616, + "kl": 0.00316619873046875, + "learning_rate": 9.989038226169207e-07, + "loss": -0.0134, + "reward": -0.2089497372508049, + "reward_std": 0.14539672806859016, + "rewards/cosine_scaled_reward": -0.1044748667627573, + "rewards/format_reward": 0.0, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 1492.4881286621094, + "epoch": 0.244, + "grad_norm": 0.2516862154006958, + "kl": 0.0037384033203125, + "learning_rate": 9.98673738502114e-07, + "loss": -0.0497, + "reward": -0.22766747325658798, + "reward_std": 0.17161306738853455, + "rewards/cosine_scaled_reward": -0.11383373104035854, + "rewards/format_reward": 0.0, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 1479.232177734375, + "epoch": 0.248, + "grad_norm": 0.26911845803260803, + "kl": 0.004444122314453125, + "learning_rate": 9.98421786662277e-07, + "loss": -0.0545, + "reward": -0.22953158989548683, + "reward_std": 0.1646084077656269, + "rewards/cosine_scaled_reward": -0.11476579494774342, + "rewards/format_reward": 0.0, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 1500.9821472167969, + "epoch": 0.252, + "grad_norm": 0.30081605911254883, + "kl": 0.0038433074951171875, + "learning_rate": 9.981479793771866e-07, + "loss": -0.0306, + "reward": -0.2425815463066101, + "reward_std": 0.15784814581274986, + "rewards/cosine_scaled_reward": -0.1212907712906599, + "rewards/format_reward": 0.0, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 1474.2262268066406, + "epoch": 0.256, + "grad_norm": 0.11899983882904053, + "kl": 0.00433349609375, + "learning_rate": 9.97852329991824e-07, + "loss": -0.028, + "reward": -0.1484425999224186, + "reward_std": 0.13878681510686874, + "rewards/cosine_scaled_reward": -0.07422130089253187, + "rewards/format_reward": 0.0, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 1469.2440490722656, + "epoch": 0.26, + "grad_norm": 0.21076174080371857, + "kl": 0.0048980712890625, + "learning_rate": 9.975348529157229e-07, + "loss": -0.063, + "reward": -0.23214704915881157, + "reward_std": 0.15723764523863792, + "rewards/cosine_scaled_reward": -0.11607352085411549, + "rewards/format_reward": 0.0, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 1479.2738342285156, + "epoch": 0.264, + "grad_norm": 0.25172120332717896, + "kl": 0.0048065185546875, + "learning_rate": 9.971955636222684e-07, + "loss": -0.0575, + "reward": -0.24701237678527832, + "reward_std": 0.176135566085577, + "rewards/cosine_scaled_reward": -0.12350618466734886, + "rewards/format_reward": 0.0, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 1460.0178833007812, + "epoch": 0.268, + "grad_norm": 0.2059454619884491, + "kl": 0.006103515625, + "learning_rate": 9.968344786479415e-07, + "loss": -0.0581, + "reward": -0.21051475405693054, + "reward_std": 0.1539991032332182, + "rewards/cosine_scaled_reward": -0.10525736771523952, + "rewards/format_reward": 0.0, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.8035888671875, + "epoch": 0.272, + "grad_norm": 0.2547614276409149, + "kl": 0.0057525634765625, + "learning_rate": 9.964516155915151e-07, + "loss": -0.017, + "reward": -0.18334244936704636, + "reward_std": 0.151863232254982, + "rewards/cosine_scaled_reward": -0.09167122654616833, + "rewards/format_reward": 0.0, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 1481.8392944335938, + "epoch": 0.276, + "grad_norm": 0.17602519690990448, + "kl": 0.00487518310546875, + "learning_rate": 9.960469931131936e-07, + "loss": -0.0368, + "reward": -0.15826850943267345, + "reward_std": 0.1565675064921379, + "rewards/cosine_scaled_reward": -0.07913425378501415, + "rewards/format_reward": 0.0, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 1488.6726379394531, + "epoch": 0.28, + "grad_norm": 0.2315431833267212, + "kl": 0.00534820556640625, + "learning_rate": 9.956206309337066e-07, + "loss": -0.0383, + "reward": -0.21064428612589836, + "reward_std": 0.14298444241285324, + "rewards/cosine_scaled_reward": -0.10532214120030403, + "rewards/format_reward": 0.0, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 1521.3095397949219, + "epoch": 0.284, + "grad_norm": 0.25028038024902344, + "kl": 0.00449371337890625, + "learning_rate": 9.951725498333448e-07, + "loss": -0.018, + "reward": -0.2252160757780075, + "reward_std": 0.1335316188633442, + "rewards/cosine_scaled_reward": -0.11260804533958435, + "rewards/format_reward": 0.0, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 1487.1666870117188, + "epoch": 0.288, + "grad_norm": 0.3014598488807678, + "kl": 0.0053863525390625, + "learning_rate": 9.947027716509488e-07, + "loss": -0.0296, + "reward": -0.22651539742946625, + "reward_std": 0.15895461291074753, + "rewards/cosine_scaled_reward": -0.11325769871473312, + "rewards/format_reward": 0.0, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 1500.0417175292969, + "epoch": 0.292, + "grad_norm": 0.19289681315422058, + "kl": 0.00534820556640625, + "learning_rate": 9.942113192828444e-07, + "loss": -0.0204, + "reward": -0.22585123777389526, + "reward_std": 0.1563442163169384, + "rewards/cosine_scaled_reward": -0.11292561888694763, + "rewards/format_reward": 0.0, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 1481.1845703125, + "epoch": 0.296, + "grad_norm": 0.21433287858963013, + "kl": 0.00696563720703125, + "learning_rate": 9.93698216681727e-07, + "loss": -0.0248, + "reward": -0.21346117928624153, + "reward_std": 0.16779018752276897, + "rewards/cosine_scaled_reward": -0.10673058778047562, + "rewards/format_reward": 0.0, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 1492.0595397949219, + "epoch": 0.3, + "grad_norm": 0.20086415112018585, + "kl": 0.0060577392578125, + "learning_rate": 9.931634888554935e-07, + "loss": -0.0315, + "reward": -0.24520759657025337, + "reward_std": 0.16547074727714062, + "rewards/cosine_scaled_reward": -0.12260380387306213, + "rewards/format_reward": 0.0, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 1509.7440795898438, + "epoch": 0.304, + "grad_norm": 0.2499692440032959, + "kl": 0.00594329833984375, + "learning_rate": 9.926071618660237e-07, + "loss": -0.029, + "reward": -0.22075266018509865, + "reward_std": 0.1520039215683937, + "rewards/cosine_scaled_reward": -0.11037633195519447, + "rewards/format_reward": 0.0, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 1493.0655212402344, + "epoch": 0.308, + "grad_norm": 0.1303885579109192, + "kl": 0.0078887939453125, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0068, + "reward": -0.20533370971679688, + "reward_std": 0.1422851476818323, + "rewards/cosine_scaled_reward": -0.10266684927046299, + "rewards/format_reward": 0.0, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.3035888671875, + "epoch": 0.312, + "grad_norm": 0.23692218959331512, + "kl": 0.0076141357421875, + "learning_rate": 9.91429819907136e-07, + "loss": -0.0294, + "reward": -0.1995321549475193, + "reward_std": 0.15328680351376534, + "rewards/cosine_scaled_reward": -0.0997660793364048, + "rewards/format_reward": 0.0, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.3512268066406, + "epoch": 0.316, + "grad_norm": 0.2173466831445694, + "kl": 0.0062408447265625, + "learning_rate": 9.908088623197048e-07, + "loss": -0.0243, + "reward": -0.22904419153928757, + "reward_std": 0.1448185909539461, + "rewards/cosine_scaled_reward": -0.11452210135757923, + "rewards/format_reward": 0.0, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 1484.3869323730469, + "epoch": 0.32, + "grad_norm": 0.16200865805149078, + "kl": 0.00707244873046875, + "learning_rate": 9.901664203302124e-07, + "loss": -0.0239, + "reward": -0.22017718479037285, + "reward_std": 0.15285737439990044, + "rewards/cosine_scaled_reward": -0.11008859053254128, + "rewards/format_reward": 0.0, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 1484.7261962890625, + "epoch": 0.324, + "grad_norm": 0.1811896711587906, + "kl": 0.00650787353515625, + "learning_rate": 9.895025252503755e-07, + "loss": -0.0262, + "reward": -0.24382955580949783, + "reward_std": 0.1623360477387905, + "rewards/cosine_scaled_reward": -0.12191477790474892, + "rewards/format_reward": 0.0, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 1456.0357360839844, + "epoch": 0.328, + "grad_norm": 0.1585092842578888, + "kl": 0.010345458984375, + "learning_rate": 9.888172094375033e-07, + "loss": -0.066, + "reward": -0.23999103158712387, + "reward_std": 0.1869661882519722, + "rewards/cosine_scaled_reward": -0.11999551579356194, + "rewards/format_reward": 0.0, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 1485.02978515625, + "epoch": 0.332, + "grad_norm": 0.2301841825246811, + "kl": 0.0067596435546875, + "learning_rate": 9.881105062929221e-07, + "loss": -0.0321, + "reward": -0.2552091106772423, + "reward_std": 0.1685757040977478, + "rewards/cosine_scaled_reward": -0.12760455161333084, + "rewards/format_reward": 0.0, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 1486.7024230957031, + "epoch": 0.336, + "grad_norm": 0.24452205002307892, + "kl": 0.0095062255859375, + "learning_rate": 9.873824502603459e-07, + "loss": -0.0399, + "reward": -0.2177020013332367, + "reward_std": 0.1546536386013031, + "rewards/cosine_scaled_reward": -0.1088510025292635, + "rewards/format_reward": 0.0, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 1472.2083740234375, + "epoch": 0.34, + "grad_norm": 0.2162124067544937, + "kl": 0.00879669189453125, + "learning_rate": 9.866330768241983e-07, + "loss": -0.0363, + "reward": -0.2306637428700924, + "reward_std": 0.15445036813616753, + "rewards/cosine_scaled_reward": -0.11533187702298164, + "rewards/format_reward": 0.0, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.90478515625, + "epoch": 0.344, + "grad_norm": 0.29467546939849854, + "kl": 0.010467529296875, + "learning_rate": 9.85862422507884e-07, + "loss": -0.0225, + "reward": -0.21584435179829597, + "reward_std": 0.15913846716284752, + "rewards/cosine_scaled_reward": -0.10792217776179314, + "rewards/format_reward": 0.0, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 1493.9166870117188, + "epoch": 0.348, + "grad_norm": 0.2552238404750824, + "kl": 0.0130157470703125, + "learning_rate": 9.850705248720068e-07, + "loss": -0.0419, + "reward": -0.22282668948173523, + "reward_std": 0.1558297798037529, + "rewards/cosine_scaled_reward": -0.11141334660351276, + "rewards/format_reward": 0.0, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.4166870117188, + "epoch": 0.352, + "grad_norm": 0.18905065953731537, + "kl": 0.0161590576171875, + "learning_rate": 9.8425742251254e-07, + "loss": -0.0124, + "reward": -0.2435382977128029, + "reward_std": 0.1510351374745369, + "rewards/cosine_scaled_reward": -0.1217691469937563, + "rewards/format_reward": 0.0, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 1456.6131286621094, + "epoch": 0.356, + "grad_norm": 0.18983080983161926, + "kl": 0.0308837890625, + "learning_rate": 9.83423155058946e-07, + "loss": -0.0716, + "reward": -0.23182464018464088, + "reward_std": 0.17492059245705605, + "rewards/cosine_scaled_reward": -0.11591232009232044, + "rewards/format_reward": 0.0, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 1462.7916870117188, + "epoch": 0.36, + "grad_norm": 0.18697425723075867, + "kl": 0.036468505859375, + "learning_rate": 9.825677631722435e-07, + "loss": -0.0794, + "reward": -0.25727425515651703, + "reward_std": 0.18599339574575424, + "rewards/cosine_scaled_reward": -0.12863712757825851, + "rewards/format_reward": 0.0, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 1452.9166870117188, + "epoch": 0.364, + "grad_norm": 0.11306176334619522, + "kl": 0.05767822265625, + "learning_rate": 9.816912885430258e-07, + "loss": -0.074, + "reward": -0.2355377934873104, + "reward_std": 0.1886419989168644, + "rewards/cosine_scaled_reward": -0.11776889488101006, + "rewards/format_reward": 0.0, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 1423.2381286621094, + "epoch": 0.368, + "grad_norm": 0.12384199351072311, + "kl": 0.0667724609375, + "learning_rate": 9.807937738894303e-07, + "loss": -0.094, + "reward": -0.28002386912703514, + "reward_std": 0.22308171913027763, + "rewards/cosine_scaled_reward": -0.14001193456351757, + "rewards/format_reward": 0.0, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 1463.5298156738281, + "epoch": 0.372, + "grad_norm": 0.1097106859087944, + "kl": 0.0745849609375, + "learning_rate": 9.798752629550546e-07, + "loss": -0.0497, + "reward": -0.260006383061409, + "reward_std": 0.20789402723312378, + "rewards/cosine_scaled_reward": -0.1300031915307045, + "rewards/format_reward": 0.0, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 1423.0416870117188, + "epoch": 0.376, + "grad_norm": 0.13807816803455353, + "kl": 0.0921630859375, + "learning_rate": 9.78935800506826e-07, + "loss": -0.0664, + "reward": -0.2304685339331627, + "reward_std": 0.20511355623602867, + "rewards/cosine_scaled_reward": -0.11523427255451679, + "rewards/format_reward": 0.0, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 1439.827392578125, + "epoch": 0.38, + "grad_norm": 0.1847662776708603, + "kl": 0.080810546875, + "learning_rate": 9.779754323328192e-07, + "loss": -0.0574, + "reward": -0.22580492869019508, + "reward_std": 0.19689049944281578, + "rewards/cosine_scaled_reward": -0.11290246807038784, + "rewards/format_reward": 0.0, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 1450.6666870117188, + "epoch": 0.384, + "grad_norm": 0.14935868978500366, + "kl": 0.0830078125, + "learning_rate": 9.769942052400235e-07, + "loss": -0.0721, + "reward": -0.28848847001791, + "reward_std": 0.22373779118061066, + "rewards/cosine_scaled_reward": -0.144244235008955, + "rewards/format_reward": 0.0, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 1433.7916870117188, + "epoch": 0.388, + "grad_norm": 0.14467309415340424, + "kl": 0.0953369140625, + "learning_rate": 9.759921670520634e-07, + "loss": -0.0565, + "reward": -0.2675531320273876, + "reward_std": 0.19667528942227364, + "rewards/cosine_scaled_reward": -0.1337765622884035, + "rewards/format_reward": 0.0, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 1444.4345703125, + "epoch": 0.392, + "grad_norm": 0.14709749817848206, + "kl": 0.092529296875, + "learning_rate": 9.749693666068663e-07, + "loss": -0.0794, + "reward": -0.26461150124669075, + "reward_std": 0.2228638045489788, + "rewards/cosine_scaled_reward": -0.13230575062334538, + "rewards/format_reward": 0.0, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 1438.6369018554688, + "epoch": 0.396, + "grad_norm": 0.1645117700099945, + "kl": 0.090576171875, + "learning_rate": 9.739258537542835e-07, + "loss": -0.0804, + "reward": -0.2615456096827984, + "reward_std": 0.22437894716858864, + "rewards/cosine_scaled_reward": -0.13077280297875404, + "rewards/format_reward": 0.0, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 1447.9404907226562, + "epoch": 0.4, + "grad_norm": 0.12536393105983734, + "kl": 0.0902099609375, + "learning_rate": 9.728616793536587e-07, + "loss": -0.0565, + "reward": -0.23606609553098679, + "reward_std": 0.20211446657776833, + "rewards/cosine_scaled_reward": -0.11803305521607399, + "rewards/format_reward": 0.0, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 1461.0238647460938, + "epoch": 0.404, + "grad_norm": 0.1715729534626007, + "kl": 0.0977783203125, + "learning_rate": 9.717768952713511e-07, + "loss": -0.0419, + "reward": -0.25642454996705055, + "reward_std": 0.20571942254900932, + "rewards/cosine_scaled_reward": -0.12821227870881557, + "rewards/format_reward": 0.0, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 1441.0595397949219, + "epoch": 0.408, + "grad_norm": 0.14349091053009033, + "kl": 0.07373046875, + "learning_rate": 9.706715543782064e-07, + "loss": -0.0755, + "reward": -0.25798120722174644, + "reward_std": 0.18921422585844994, + "rewards/cosine_scaled_reward": -0.12899060919880867, + "rewards/format_reward": 0.0, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 1429.2559814453125, + "epoch": 0.412, + "grad_norm": 0.12457617372274399, + "kl": 0.08154296875, + "learning_rate": 9.695457105469804e-07, + "loss": -0.0635, + "reward": -0.2596958056092262, + "reward_std": 0.21758576482534409, + "rewards/cosine_scaled_reward": -0.12984789907932281, + "rewards/format_reward": 0.0, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 1455.0000610351562, + "epoch": 0.416, + "grad_norm": 0.1320362240076065, + "kl": 0.0777587890625, + "learning_rate": 9.683994186497132e-07, + "loss": -0.0587, + "reward": -0.23436888307332993, + "reward_std": 0.18900957331061363, + "rewards/cosine_scaled_reward": -0.11718444526195526, + "rewards/format_reward": 0.0, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 1421.0952758789062, + "epoch": 0.42, + "grad_norm": 0.13309422135353088, + "kl": 0.0897216796875, + "learning_rate": 9.672327345550543e-07, + "loss": -0.0934, + "reward": -0.25493229925632477, + "reward_std": 0.22001174464821815, + "rewards/cosine_scaled_reward": -0.12746614776551723, + "rewards/format_reward": 0.0, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 1490.9940795898438, + "epoch": 0.424, + "grad_norm": 0.1491984874010086, + "kl": 0.0736083984375, + "learning_rate": 9.66045715125541e-07, + "loss": -0.0299, + "reward": -0.2312908135354519, + "reward_std": 0.16199098154902458, + "rewards/cosine_scaled_reward": -0.11564541421830654, + "rewards/format_reward": 0.0, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 1421.0357666015625, + "epoch": 0.428, + "grad_norm": 0.16547471284866333, + "kl": 0.0780029296875, + "learning_rate": 9.648384182148252e-07, + "loss": -0.067, + "reward": -0.27354947850108147, + "reward_std": 0.1951713114976883, + "rewards/cosine_scaled_reward": -0.13677473925054073, + "rewards/format_reward": 0.0, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 1469.0833740234375, + "epoch": 0.432, + "grad_norm": 0.14035455882549286, + "kl": 0.0938720703125, + "learning_rate": 9.636109026648554e-07, + "loss": -0.0248, + "reward": -0.22176991775631905, + "reward_std": 0.1815592534840107, + "rewards/cosine_scaled_reward": -0.11088495329022408, + "rewards/format_reward": 0.0, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 1440.5238647460938, + "epoch": 0.436, + "grad_norm": 0.11456211656332016, + "kl": 0.077880859375, + "learning_rate": 9.623632283030077e-07, + "loss": -0.0604, + "reward": -0.2620925232768059, + "reward_std": 0.21131489053368568, + "rewards/cosine_scaled_reward": -0.13104625791311264, + "rewards/format_reward": 0.0, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 1479.4583740234375, + "epoch": 0.44, + "grad_norm": 0.11461742222309113, + "kl": 0.07177734375, + "learning_rate": 9.610954559391704e-07, + "loss": -0.0354, + "reward": -0.21340973302721977, + "reward_std": 0.18217052891850471, + "rewards/cosine_scaled_reward": -0.10670486651360989, + "rewards/format_reward": 0.0, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 1417.2976379394531, + "epoch": 0.444, + "grad_norm": 0.1527003049850464, + "kl": 0.07318115234375, + "learning_rate": 9.598076473627796e-07, + "loss": -0.1136, + "reward": -0.24674446135759354, + "reward_std": 0.19690455496311188, + "rewards/cosine_scaled_reward": -0.12337223440408707, + "rewards/format_reward": 0.0, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 1433.4464721679688, + "epoch": 0.448, + "grad_norm": 0.1355064958333969, + "kl": 0.0743408203125, + "learning_rate": 9.58499865339809e-07, + "loss": -0.0673, + "reward": -0.2527715191245079, + "reward_std": 0.19281791523098946, + "rewards/cosine_scaled_reward": -0.12638575583696365, + "rewards/format_reward": 0.0, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.5714721679688, + "epoch": 0.452, + "grad_norm": 0.11734712868928909, + "kl": 0.0765380859375, + "learning_rate": 9.571721736097088e-07, + "loss": -0.0251, + "reward": -0.22142696008086205, + "reward_std": 0.18005133792757988, + "rewards/cosine_scaled_reward": -0.11071347445249557, + "rewards/format_reward": 0.0, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 1462.7084045410156, + "epoch": 0.456, + "grad_norm": 0.1974153220653534, + "kl": 0.0665283203125, + "learning_rate": 9.55824636882301e-07, + "loss": -0.0632, + "reward": -0.1928116953931749, + "reward_std": 0.19508182629942894, + "rewards/cosine_scaled_reward": -0.09640584839507937, + "rewards/format_reward": 0.0, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 1444.9524230957031, + "epoch": 0.46, + "grad_norm": 0.13885585963726044, + "kl": 0.0650634765625, + "learning_rate": 9.54457320834625e-07, + "loss": -0.0436, + "reward": -0.2363814003765583, + "reward_std": 0.19080934301018715, + "rewards/cosine_scaled_reward": -0.11819070391356945, + "rewards/format_reward": 0.0, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 1462.9881286621094, + "epoch": 0.464, + "grad_norm": 0.14188507199287415, + "kl": 0.0745849609375, + "learning_rate": 9.530702921077358e-07, + "loss": -0.0326, + "reward": -0.21813786774873734, + "reward_std": 0.17841872572898865, + "rewards/cosine_scaled_reward": -0.10906893201172352, + "rewards/format_reward": 0.0, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 1423.232177734375, + "epoch": 0.468, + "grad_norm": 0.14759542047977448, + "kl": 0.07354736328125, + "learning_rate": 9.516636183034564e-07, + "loss": -0.0955, + "reward": -0.24884852021932602, + "reward_std": 0.20949439704418182, + "rewards/cosine_scaled_reward": -0.12442425638437271, + "rewards/format_reward": 0.0, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 1433.3631286621094, + "epoch": 0.472, + "grad_norm": 0.12356512248516083, + "kl": 0.0736083984375, + "learning_rate": 9.502373679810839e-07, + "loss": -0.0907, + "reward": -0.2389114946126938, + "reward_std": 0.1962103582918644, + "rewards/cosine_scaled_reward": -0.11945574544370174, + "rewards/format_reward": 0.0, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 1425.5476684570312, + "epoch": 0.476, + "grad_norm": 0.13675737380981445, + "kl": 0.080322265625, + "learning_rate": 9.487916106540465e-07, + "loss": -0.0548, + "reward": -0.22732584923505783, + "reward_std": 0.17597166821360588, + "rewards/cosine_scaled_reward": -0.11366293206810951, + "rewards/format_reward": 0.0, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 1407.125, + "epoch": 0.48, + "grad_norm": 0.1944396197795868, + "kl": 0.0782470703125, + "learning_rate": 9.473264167865171e-07, + "loss": -0.1104, + "reward": -0.23876191675662994, + "reward_std": 0.21116216480731964, + "rewards/cosine_scaled_reward": -0.11938095837831497, + "rewards/format_reward": 0.0, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 1460.9940795898438, + "epoch": 0.484, + "grad_norm": 0.13976925611495972, + "kl": 0.07110595703125, + "learning_rate": 9.458418577899774e-07, + "loss": -0.0623, + "reward": -0.22172370925545692, + "reward_std": 0.17737172171473503, + "rewards/cosine_scaled_reward": -0.11086185090243816, + "rewards/format_reward": 0.0, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 1445.0536193847656, + "epoch": 0.488, + "grad_norm": 0.1271459460258484, + "kl": 0.0771484375, + "learning_rate": 9.443380060197385e-07, + "loss": -0.09, + "reward": -0.234033714979887, + "reward_std": 0.21107907965779305, + "rewards/cosine_scaled_reward": -0.1170168574899435, + "rewards/format_reward": 0.0, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 1461.5833740234375, + "epoch": 0.492, + "grad_norm": 0.1251479536294937, + "kl": 0.07427978515625, + "learning_rate": 9.428149347714143e-07, + "loss": -0.0677, + "reward": -0.21826723590493202, + "reward_std": 0.19678263366222382, + "rewards/cosine_scaled_reward": -0.10913361981511116, + "rewards/format_reward": 0.0, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 1441.7857360839844, + "epoch": 0.496, + "grad_norm": 0.16092143952846527, + "kl": 0.060302734375, + "learning_rate": 9.412727182773486e-07, + "loss": -0.0921, + "reward": -0.22351692616939545, + "reward_std": 0.2029583677649498, + "rewards/cosine_scaled_reward": -0.11175846680998802, + "rewards/format_reward": 0.0, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 1426.7262268066406, + "epoch": 0.5, + "grad_norm": 0.1188863217830658, + "kl": 0.0787353515625, + "learning_rate": 9.397114317029974e-07, + "loss": -0.0431, + "reward": -0.2245512492954731, + "reward_std": 0.17906467244029045, + "rewards/cosine_scaled_reward": -0.11227562837302685, + "rewards/format_reward": 0.0, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 1443.7917175292969, + "epoch": 0.504, + "grad_norm": 0.11914021521806717, + "kl": 0.0745849609375, + "learning_rate": 9.381311511432658e-07, + "loss": -0.0781, + "reward": -0.19801979139447212, + "reward_std": 0.1959025263786316, + "rewards/cosine_scaled_reward": -0.09900989942252636, + "rewards/format_reward": 0.0, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 1415.8512268066406, + "epoch": 0.508, + "grad_norm": 0.1357516646385193, + "kl": 0.07177734375, + "learning_rate": 9.36531953618799e-07, + "loss": -0.0865, + "reward": -0.21713878214359283, + "reward_std": 0.18351096659898758, + "rewards/cosine_scaled_reward": -0.10856938920915127, + "rewards/format_reward": 0.0, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 1472.5000305175781, + "epoch": 0.512, + "grad_norm": 0.1369076818227768, + "kl": 0.05865478515625, + "learning_rate": 9.34913917072228e-07, + "loss": -0.0478, + "reward": -0.21694474667310715, + "reward_std": 0.17777032032608986, + "rewards/cosine_scaled_reward": -0.10847238078713417, + "rewards/format_reward": 0.0, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 1477.2559509277344, + "epoch": 0.516, + "grad_norm": 0.17333942651748657, + "kl": 0.0572509765625, + "learning_rate": 9.332771203643714e-07, + "loss": -0.0249, + "reward": -0.21337437257170677, + "reward_std": 0.18009737133979797, + "rewards/cosine_scaled_reward": -0.10668718256056309, + "rewards/format_reward": 0.0, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 1473.7381286621094, + "epoch": 0.52, + "grad_norm": 0.10983088612556458, + "kl": 0.0703125, + "learning_rate": 9.316216432703916e-07, + "loss": -0.0583, + "reward": -0.19089093804359436, + "reward_std": 0.18549956753849983, + "rewards/cosine_scaled_reward": -0.09544547088444233, + "rewards/format_reward": 0.0, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 1432.7559814453125, + "epoch": 0.524, + "grad_norm": 0.16586177051067352, + "kl": 0.0791015625, + "learning_rate": 9.299475664759068e-07, + "loss": -0.0994, + "reward": -0.23504262417554855, + "reward_std": 0.21443113684654236, + "rewards/cosine_scaled_reward": -0.11752131581306458, + "rewards/format_reward": 0.0, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 1481.9762268066406, + "epoch": 0.528, + "grad_norm": 0.17732541263103485, + "kl": 0.06622314453125, + "learning_rate": 9.282549715730579e-07, + "loss": -0.0324, + "reward": -0.17852769792079926, + "reward_std": 0.1502333115786314, + "rewards/cosine_scaled_reward": -0.08926384896039963, + "rewards/format_reward": 0.0, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 1476.4048156738281, + "epoch": 0.532, + "grad_norm": 0.15965314209461212, + "kl": 0.060546875, + "learning_rate": 9.265439410565328e-07, + "loss": -0.058, + "reward": -0.22374625876545906, + "reward_std": 0.18331025168299675, + "rewards/cosine_scaled_reward": -0.11187312379479408, + "rewards/format_reward": 0.0, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 1490.8452453613281, + "epoch": 0.536, + "grad_norm": 0.11577272415161133, + "kl": 0.0740966796875, + "learning_rate": 9.248145583195447e-07, + "loss": -0.0366, + "reward": -0.1072634905576706, + "reward_std": 0.15717832930386066, + "rewards/cosine_scaled_reward": -0.05363174341619015, + "rewards/format_reward": 0.0, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 1490.982177734375, + "epoch": 0.54, + "grad_norm": 0.14535216987133026, + "kl": 0.05950927734375, + "learning_rate": 9.230669076497687e-07, + "loss": -0.0312, + "reward": -0.21274039149284363, + "reward_std": 0.15816222876310349, + "rewards/cosine_scaled_reward": -0.10637019760906696, + "rewards/format_reward": 0.0, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 1475.7679138183594, + "epoch": 0.544, + "grad_norm": 0.14940175414085388, + "kl": 0.072265625, + "learning_rate": 9.213010742252327e-07, + "loss": -0.0302, + "reward": -0.20727308467030525, + "reward_std": 0.18025333806872368, + "rewards/cosine_scaled_reward": -0.10363654233515263, + "rewards/format_reward": 0.0, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 1497.27978515625, + "epoch": 0.548, + "grad_norm": 0.11683686077594757, + "kl": 0.0675048828125, + "learning_rate": 9.195171441101668e-07, + "loss": -0.0354, + "reward": -0.19653399288654327, + "reward_std": 0.17340604588389397, + "rewards/cosine_scaled_reward": -0.09826699271798134, + "rewards/format_reward": 0.0, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 1472.4524230957031, + "epoch": 0.552, + "grad_norm": 0.19184042513370514, + "kl": 0.0703125, + "learning_rate": 9.177152042508077e-07, + "loss": -0.0365, + "reward": -0.20156748220324516, + "reward_std": 0.16703343763947487, + "rewards/cosine_scaled_reward": -0.10078373923897743, + "rewards/format_reward": 0.0, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 1441.5238342285156, + "epoch": 0.556, + "grad_norm": 0.21460425853729248, + "kl": 0.0751953125, + "learning_rate": 9.158953424711624e-07, + "loss": -0.0713, + "reward": -0.1934008002281189, + "reward_std": 0.1858229860663414, + "rewards/cosine_scaled_reward": -0.096700394526124, + "rewards/format_reward": 0.0, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 1477.5536193847656, + "epoch": 0.56, + "grad_norm": 0.1624855250120163, + "kl": 0.07452392578125, + "learning_rate": 9.140576474687263e-07, + "loss": -0.0644, + "reward": -0.20967105776071548, + "reward_std": 0.17497684434056282, + "rewards/cosine_scaled_reward": -0.1048355270177126, + "rewards/format_reward": 0.0, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 1488.3928833007812, + "epoch": 0.564, + "grad_norm": 0.13148972392082214, + "kl": 0.0745849609375, + "learning_rate": 9.122022088101613e-07, + "loss": -0.0365, + "reward": -0.12153960764408112, + "reward_std": 0.15567267499864101, + "rewards/cosine_scaled_reward": -0.06076979637145996, + "rewards/format_reward": 0.0, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 1439.5536193847656, + "epoch": 0.568, + "grad_norm": 0.18604105710983276, + "kl": 0.0751953125, + "learning_rate": 9.103291169269299e-07, + "loss": -0.0679, + "reward": -0.21637247875332832, + "reward_std": 0.19511258974671364, + "rewards/cosine_scaled_reward": -0.10818623751401901, + "rewards/format_reward": 0.0, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 1486.5774230957031, + "epoch": 0.572, + "grad_norm": 0.17263971269130707, + "kl": 0.07757568359375, + "learning_rate": 9.084384631108882e-07, + "loss": -0.0389, + "reward": -0.2009837031364441, + "reward_std": 0.16309702023863792, + "rewards/cosine_scaled_reward": -0.10049185156822205, + "rewards/format_reward": 0.0, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 1478.3988647460938, + "epoch": 0.576, + "grad_norm": 0.14938153326511383, + "kl": 0.079833984375, + "learning_rate": 9.065303395098358e-07, + "loss": -0.0585, + "reward": -0.1797693967819214, + "reward_std": 0.16727757826447487, + "rewards/cosine_scaled_reward": -0.08988469652831554, + "rewards/format_reward": 0.0, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 1460.5417175292969, + "epoch": 0.58, + "grad_norm": 0.18892593681812286, + "kl": 0.0838623046875, + "learning_rate": 9.046048391230247e-07, + "loss": -0.061, + "reward": -0.1960761584341526, + "reward_std": 0.16931083425879478, + "rewards/cosine_scaled_reward": -0.0980380792170763, + "rewards/format_reward": 0.0, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 1495.2916870117188, + "epoch": 0.584, + "grad_norm": 0.20080548524856567, + "kl": 0.0863037109375, + "learning_rate": 9.026620557966279e-07, + "loss": -0.0189, + "reward": -0.18559397384524345, + "reward_std": 0.16799203678965569, + "rewards/cosine_scaled_reward": -0.09279698692262173, + "rewards/format_reward": 0.0, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.8809814453125, + "epoch": 0.588, + "grad_norm": 0.15992848575115204, + "kl": 0.0850830078125, + "learning_rate": 9.007020842191634e-07, + "loss": -0.0201, + "reward": -0.19452324509620667, + "reward_std": 0.1573326252400875, + "rewards/cosine_scaled_reward": -0.09726162627339363, + "rewards/format_reward": 0.0, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 1463.65478515625, + "epoch": 0.592, + "grad_norm": 0.12942340970039368, + "kl": 0.08453369140625, + "learning_rate": 8.987250199168808e-07, + "loss": -0.0368, + "reward": -0.20581213757395744, + "reward_std": 0.1570763811469078, + "rewards/cosine_scaled_reward": -0.10290606319904327, + "rewards/format_reward": 0.0, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 1493.7619323730469, + "epoch": 0.596, + "grad_norm": 0.18132224678993225, + "kl": 0.0960693359375, + "learning_rate": 8.967309592491052e-07, + "loss": -0.023, + "reward": -0.19545432925224304, + "reward_std": 0.1527048945426941, + "rewards/cosine_scaled_reward": -0.09772716276347637, + "rewards/format_reward": 0.0, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 1462.8452758789062, + "epoch": 0.6, + "grad_norm": 0.17014774680137634, + "kl": 0.1109619140625, + "learning_rate": 8.9471999940354e-07, + "loss": -0.0427, + "reward": -0.203117735683918, + "reward_std": 0.18610898405313492, + "rewards/cosine_scaled_reward": -0.10155886970460415, + "rewards/format_reward": 0.0, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 1477.3393249511719, + "epoch": 0.604, + "grad_norm": 0.15196801722049713, + "kl": 0.1031494140625, + "learning_rate": 8.926922383915315e-07, + "loss": -0.0465, + "reward": -0.20808908715844154, + "reward_std": 0.18062585964798927, + "rewards/cosine_scaled_reward": -0.10404454357922077, + "rewards/format_reward": 0.0, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 1479.3333740234375, + "epoch": 0.608, + "grad_norm": 0.13162937760353088, + "kl": 0.0885009765625, + "learning_rate": 8.906477750432903e-07, + "loss": -0.0501, + "reward": -0.21273208782076836, + "reward_std": 0.16958871111273766, + "rewards/cosine_scaled_reward": -0.10636604763567448, + "rewards/format_reward": 0.0, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.1904907226562, + "epoch": 0.612, + "grad_norm": 0.15801787376403809, + "kl": 0.0894775390625, + "learning_rate": 8.88586709003076e-07, + "loss": -0.024, + "reward": -0.18563585355877876, + "reward_std": 0.14705245569348335, + "rewards/cosine_scaled_reward": -0.09281792864203453, + "rewards/format_reward": 0.0, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 1487.75, + "epoch": 0.616, + "grad_norm": 0.19425953924655914, + "kl": 0.1038818359375, + "learning_rate": 8.865091407243394e-07, + "loss": -0.0361, + "reward": -0.18547611683607101, + "reward_std": 0.1538998931646347, + "rewards/cosine_scaled_reward": -0.09273805841803551, + "rewards/format_reward": 0.0, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 1479.5595703125, + "epoch": 0.62, + "grad_norm": 0.13653838634490967, + "kl": 0.1011962890625, + "learning_rate": 8.844151714648274e-07, + "loss": -0.0666, + "reward": -0.20050010830163956, + "reward_std": 0.15926911309361458, + "rewards/cosine_scaled_reward": -0.10025005042552948, + "rewards/format_reward": 0.0, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.6071472167969, + "epoch": 0.624, + "grad_norm": 0.1796942949295044, + "kl": 0.1038818359375, + "learning_rate": 8.823049032816478e-07, + "loss": -0.0216, + "reward": -0.1935092769563198, + "reward_std": 0.17067047394812107, + "rewards/cosine_scaled_reward": -0.09675464034080505, + "rewards/format_reward": 0.0, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 1490.2679138183594, + "epoch": 0.628, + "grad_norm": 0.15652166306972504, + "kl": 0.0980224609375, + "learning_rate": 8.801784390262943e-07, + "loss": -0.0199, + "reward": -0.10874435119330883, + "reward_std": 0.15503624081611633, + "rewards/cosine_scaled_reward": -0.05437217652797699, + "rewards/format_reward": 0.0, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 1478.327392578125, + "epoch": 0.632, + "grad_norm": 0.10631345212459564, + "kl": 0.10205078125, + "learning_rate": 8.780358823396352e-07, + "loss": -0.0558, + "reward": -0.19594154134392738, + "reward_std": 0.18128735944628716, + "rewards/cosine_scaled_reward": -0.09797077253460884, + "rewards/format_reward": 0.0, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 1492.0059814453125, + "epoch": 0.636, + "grad_norm": 0.1802942007780075, + "kl": 0.1143798828125, + "learning_rate": 8.758773376468604e-07, + "loss": -0.045, + "reward": -0.17765655368566513, + "reward_std": 0.16990270093083382, + "rewards/cosine_scaled_reward": -0.08882827498018742, + "rewards/format_reward": 0.0, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 1490.7798156738281, + "epoch": 0.64, + "grad_norm": 0.09826915711164474, + "kl": 0.1187744140625, + "learning_rate": 8.737029101523929e-07, + "loss": -0.0474, + "reward": -0.15192299336194992, + "reward_std": 0.1335773952305317, + "rewards/cosine_scaled_reward": -0.07596149481832981, + "rewards/format_reward": 0.0, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 1518.3035888671875, + "epoch": 0.644, + "grad_norm": 0.14415128529071808, + "kl": 0.1134033203125, + "learning_rate": 8.715127058347614e-07, + "loss": -0.0181, + "reward": -0.16958895698189735, + "reward_std": 0.14540697447955608, + "rewards/cosine_scaled_reward": -0.08479447849094868, + "rewards/format_reward": 0.0, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 1478.0714416503906, + "epoch": 0.648, + "grad_norm": 0.13143935799598694, + "kl": 0.1153564453125, + "learning_rate": 8.693068314414344e-07, + "loss": -0.0602, + "reward": -0.17831872776150703, + "reward_std": 0.16582145914435387, + "rewards/cosine_scaled_reward": -0.08915936388075352, + "rewards/format_reward": 0.0, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 1500.261962890625, + "epoch": 0.652, + "grad_norm": 0.2650466561317444, + "kl": 0.1025390625, + "learning_rate": 8.670853944836176e-07, + "loss": -0.0444, + "reward": -0.19617021456360817, + "reward_std": 0.16974329948425293, + "rewards/cosine_scaled_reward": -0.09808510728180408, + "rewards/format_reward": 0.0, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 1476.3630981445312, + "epoch": 0.656, + "grad_norm": 0.1531592160463333, + "kl": 0.1103515625, + "learning_rate": 8.648485032310144e-07, + "loss": -0.07, + "reward": -0.17511003464460373, + "reward_std": 0.16592327691614628, + "rewards/cosine_scaled_reward": -0.08755501732230186, + "rewards/format_reward": 0.0, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 1491.1845703125, + "epoch": 0.66, + "grad_norm": 0.11288218945264816, + "kl": 0.117919921875, + "learning_rate": 8.625962667065487e-07, + "loss": -0.0371, + "reward": -0.19623659178614616, + "reward_std": 0.15025305189192295, + "rewards/cosine_scaled_reward": -0.09811829589307308, + "rewards/format_reward": 0.0, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 1496.6607055664062, + "epoch": 0.664, + "grad_norm": 0.1232253834605217, + "kl": 0.115234375, + "learning_rate": 8.603287946810513e-07, + "loss": -0.0475, + "reward": -0.19438259676098824, + "reward_std": 0.1660812869668007, + "rewards/cosine_scaled_reward": -0.09719130024313927, + "rewards/format_reward": 0.0, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.577392578125, + "epoch": 0.668, + "grad_norm": 0.11001460999250412, + "kl": 0.1339111328125, + "learning_rate": 8.580461976679099e-07, + "loss": -0.0191, + "reward": -0.07667672634124756, + "reward_std": 0.14517304301261902, + "rewards/cosine_scaled_reward": -0.03833836503326893, + "rewards/format_reward": 0.0, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.4345397949219, + "epoch": 0.672, + "grad_norm": 0.12665332853794098, + "kl": 0.12939453125, + "learning_rate": 8.557485869176825e-07, + "loss": -0.0262, + "reward": -0.18039095029234886, + "reward_std": 0.14454744383692741, + "rewards/cosine_scaled_reward": -0.09019547514617443, + "rewards/format_reward": 0.0, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.3809814453125, + "epoch": 0.676, + "grad_norm": 0.10181339830160141, + "kl": 0.1201171875, + "learning_rate": 8.534360744126753e-07, + "loss": -0.0296, + "reward": -0.15367550402879715, + "reward_std": 0.154752716422081, + "rewards/cosine_scaled_reward": -0.07683775387704372, + "rewards/format_reward": 0.0, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.2678833007812, + "epoch": 0.68, + "grad_norm": 0.08636263757944107, + "kl": 0.140380859375, + "learning_rate": 8.511087728614862e-07, + "loss": -0.0405, + "reward": -0.14581535942852497, + "reward_std": 0.1418076604604721, + "rewards/cosine_scaled_reward": -0.07290767971426249, + "rewards/format_reward": 0.0, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 1503.3154907226562, + "epoch": 0.684, + "grad_norm": 0.1574849933385849, + "kl": 0.1326904296875, + "learning_rate": 8.487667956935087e-07, + "loss": -0.0257, + "reward": -0.14666364155709743, + "reward_std": 0.14365333877503872, + "rewards/cosine_scaled_reward": -0.07333182357251644, + "rewards/format_reward": 0.0, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.3690490722656, + "epoch": 0.688, + "grad_norm": 0.11679176241159439, + "kl": 0.1322021484375, + "learning_rate": 8.464102570534061e-07, + "loss": -0.0394, + "reward": -0.1798371747136116, + "reward_std": 0.1620156615972519, + "rewards/cosine_scaled_reward": -0.0899185873568058, + "rewards/format_reward": 0.0, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 1489.6845703125, + "epoch": 0.692, + "grad_norm": 0.143167644739151, + "kl": 0.1221923828125, + "learning_rate": 8.440392717955475e-07, + "loss": -0.0454, + "reward": -0.11209908872842789, + "reward_std": 0.15652650594711304, + "rewards/cosine_scaled_reward": -0.05604954622685909, + "rewards/format_reward": 0.0, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.1607360839844, + "epoch": 0.696, + "grad_norm": 0.13327138125896454, + "kl": 0.1173095703125, + "learning_rate": 8.416539554784089e-07, + "loss": -0.0349, + "reward": -0.18605408817529678, + "reward_std": 0.17399531230330467, + "rewards/cosine_scaled_reward": -0.09302704595029354, + "rewards/format_reward": 0.0, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 1500.1309509277344, + "epoch": 0.7, + "grad_norm": 0.12825773656368256, + "kl": 0.1453857421875, + "learning_rate": 8.392544243589427e-07, + "loss": -0.0389, + "reward": -0.18417096138000488, + "reward_std": 0.15830742567777634, + "rewards/cosine_scaled_reward": -0.09208548441529274, + "rewards/format_reward": 0.0, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 1512.1488342285156, + "epoch": 0.704, + "grad_norm": 0.19698651134967804, + "kl": 0.12744140625, + "learning_rate": 8.368407953869103e-07, + "loss": -0.0278, + "reward": -0.1590665504336357, + "reward_std": 0.1343939360231161, + "rewards/cosine_scaled_reward": -0.0795332733541727, + "rewards/format_reward": 0.0, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.4583435058594, + "epoch": 0.708, + "grad_norm": 0.11850512772798538, + "kl": 0.12353515625, + "learning_rate": 8.344131861991828e-07, + "loss": -0.0209, + "reward": -0.17914509028196335, + "reward_std": 0.14407609589397907, + "rewards/cosine_scaled_reward": -0.08957254886627197, + "rewards/format_reward": 0.0, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 1494.6785888671875, + "epoch": 0.712, + "grad_norm": 0.12900114059448242, + "kl": 0.1517333984375, + "learning_rate": 8.319717151140072e-07, + "loss": -0.0476, + "reward": -0.1623889021575451, + "reward_std": 0.1595832072198391, + "rewards/cosine_scaled_reward": -0.08119445107877254, + "rewards/format_reward": 0.0, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.1309509277344, + "epoch": 0.716, + "grad_norm": 0.14418767392635345, + "kl": 0.1248779296875, + "learning_rate": 8.295165011252396e-07, + "loss": -0.0145, + "reward": -0.15135440602898598, + "reward_std": 0.12371071800589561, + "rewards/cosine_scaled_reward": -0.07567720301449299, + "rewards/format_reward": 0.0, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.607177734375, + "epoch": 0.72, + "grad_norm": 0.11457547545433044, + "kl": 0.1297607421875, + "learning_rate": 8.270476638965461e-07, + "loss": -0.0282, + "reward": -0.1590440645813942, + "reward_std": 0.13810313865542412, + "rewards/cosine_scaled_reward": -0.07952203415334225, + "rewards/format_reward": 0.0, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.702392578125, + "epoch": 0.724, + "grad_norm": 0.16079466044902802, + "kl": 0.1300048828125, + "learning_rate": 8.245653237555705e-07, + "loss": -0.0259, + "reward": -0.14657551050186157, + "reward_std": 0.12384105287492275, + "rewards/cosine_scaled_reward": -0.07328775525093079, + "rewards/format_reward": 0.0, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.0595703125, + "epoch": 0.728, + "grad_norm": 0.12367437779903412, + "kl": 0.13720703125, + "learning_rate": 8.220696016880687e-07, + "loss": -0.0248, + "reward": -0.15343056619167328, + "reward_std": 0.14851927012205124, + "rewards/cosine_scaled_reward": -0.07671528309583664, + "rewards/format_reward": 0.0, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.5297546386719, + "epoch": 0.732, + "grad_norm": 0.10700845718383789, + "kl": 0.128662109375, + "learning_rate": 8.195606193320136e-07, + "loss": -0.0107, + "reward": -0.14938092976808548, + "reward_std": 0.11783361062407494, + "rewards/cosine_scaled_reward": -0.07469046581536531, + "rewards/format_reward": 0.0, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.9226379394531, + "epoch": 0.736, + "grad_norm": 0.13011117279529572, + "kl": 0.1298828125, + "learning_rate": 8.170384989716657e-07, + "loss": -0.0148, + "reward": -0.15839479491114616, + "reward_std": 0.12780537828803062, + "rewards/cosine_scaled_reward": -0.07919739931821823, + "rewards/format_reward": 0.0, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.7797546386719, + "epoch": 0.74, + "grad_norm": 0.08933281898498535, + "kl": 0.152099609375, + "learning_rate": 8.145033635316128e-07, + "loss": -0.0141, + "reward": -0.1524551585316658, + "reward_std": 0.12427524663507938, + "rewards/cosine_scaled_reward": -0.07622758112847805, + "rewards/format_reward": 0.0, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.3809509277344, + "epoch": 0.744, + "grad_norm": 0.12463698536157608, + "kl": 0.1455078125, + "learning_rate": 8.119553365707802e-07, + "loss": -0.0308, + "reward": -0.1026168204843998, + "reward_std": 0.13808677345514297, + "rewards/cosine_scaled_reward": -0.051308413967490196, + "rewards/format_reward": 0.0, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 1466.6845397949219, + "epoch": 0.748, + "grad_norm": 0.11249163746833801, + "kl": 0.13720703125, + "learning_rate": 8.093945422764069e-07, + "loss": -0.0544, + "reward": -0.16347405686974525, + "reward_std": 0.14502743259072304, + "rewards/cosine_scaled_reward": -0.08173702843487263, + "rewards/format_reward": 0.0, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 1492.2976684570312, + "epoch": 0.752, + "grad_norm": 0.08640649914741516, + "kl": 0.151123046875, + "learning_rate": 8.068211054579943e-07, + "loss": -0.0329, + "reward": -0.16077740490436554, + "reward_std": 0.15286827459931374, + "rewards/cosine_scaled_reward": -0.08038870431482792, + "rewards/format_reward": 0.0, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.9643249511719, + "epoch": 0.756, + "grad_norm": 0.10966922342777252, + "kl": 0.1417236328125, + "learning_rate": 8.04235151541222e-07, + "loss": -0.0235, + "reward": -0.1736162230372429, + "reward_std": 0.1491672247648239, + "rewards/cosine_scaled_reward": -0.08680811524391174, + "rewards/format_reward": 0.0, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 1518.5833740234375, + "epoch": 0.76, + "grad_norm": 0.12140454351902008, + "kl": 0.1397705078125, + "learning_rate": 8.01636806561836e-07, + "loss": -0.0182, + "reward": -0.15859121643006802, + "reward_std": 0.13637281768023968, + "rewards/cosine_scaled_reward": -0.07929560728371143, + "rewards/format_reward": 0.0, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 1518.4167175292969, + "epoch": 0.764, + "grad_norm": 0.12270302325487137, + "kl": 0.159423828125, + "learning_rate": 7.990261971595048e-07, + "loss": -0.0061, + "reward": -0.1657138504087925, + "reward_std": 0.1241717990487814, + "rewards/cosine_scaled_reward": -0.08285692892968655, + "rewards/format_reward": 0.0, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.9345397949219, + "epoch": 0.768, + "grad_norm": 0.12375301867723465, + "kl": 0.162353515625, + "learning_rate": 7.964034505716476e-07, + "loss": -0.0398, + "reward": -0.17186777852475643, + "reward_std": 0.16331871785223484, + "rewards/cosine_scaled_reward": -0.08593388926237822, + "rewards/format_reward": 0.0, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 1497.3452758789062, + "epoch": 0.772, + "grad_norm": 0.08711956441402435, + "kl": 0.152587890625, + "learning_rate": 7.93768694627233e-07, + "loss": -0.0287, + "reward": -0.15793642029166222, + "reward_std": 0.1320202425122261, + "rewards/cosine_scaled_reward": -0.07896821573376656, + "rewards/format_reward": 0.0, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 1517.7976379394531, + "epoch": 0.776, + "grad_norm": 0.0980345830321312, + "kl": 0.14990234375, + "learning_rate": 7.911220577405484e-07, + "loss": -0.015, + "reward": -0.15146314911544323, + "reward_std": 0.13276733830571175, + "rewards/cosine_scaled_reward": -0.07573157269507647, + "rewards/format_reward": 0.0, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 1509.5595397949219, + "epoch": 0.78, + "grad_norm": 0.19111458957195282, + "kl": 0.14404296875, + "learning_rate": 7.884636689049422e-07, + "loss": -0.027, + "reward": -0.1567194815725088, + "reward_std": 0.14454844780266285, + "rewards/cosine_scaled_reward": -0.07835974264889956, + "rewards/format_reward": 0.0, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 1503.6785888671875, + "epoch": 0.784, + "grad_norm": 0.10002721846103668, + "kl": 0.150634765625, + "learning_rate": 7.857936576865356e-07, + "loss": -0.0234, + "reward": -0.17957428470253944, + "reward_std": 0.14822976663708687, + "rewards/cosine_scaled_reward": -0.08978714607656002, + "rewards/format_reward": 0.0, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 1529.0, + "epoch": 0.788, + "grad_norm": 0.1074904203414917, + "kl": 0.15283203125, + "learning_rate": 7.831121542179086e-07, + "loss": -0.0043, + "reward": -0.135637816041708, + "reward_std": 0.10331238061189651, + "rewards/cosine_scaled_reward": -0.06781890522688627, + "rewards/format_reward": 0.0, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 1485.7738342285156, + "epoch": 0.792, + "grad_norm": 0.09117424488067627, + "kl": 0.1513671875, + "learning_rate": 7.804192891917571e-07, + "loss": -0.0363, + "reward": -0.17118510231375694, + "reward_std": 0.16877446696162224, + "rewards/cosine_scaled_reward": -0.08559254929423332, + "rewards/format_reward": 0.0, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.6607055664062, + "epoch": 0.796, + "grad_norm": 0.10829849541187286, + "kl": 0.14306640625, + "learning_rate": 7.777151938545235e-07, + "loss": -0.0227, + "reward": -0.15547415241599083, + "reward_std": 0.12111644446849823, + "rewards/cosine_scaled_reward": -0.07773707807064056, + "rewards/format_reward": 0.0, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.5476379394531, + "epoch": 0.8, + "grad_norm": 0.17340432107448578, + "kl": 0.1441650390625, + "learning_rate": 7.75e-07, + "loss": -0.0141, + "reward": -0.15766149759292603, + "reward_std": 0.13325241580605507, + "rewards/cosine_scaled_reward": -0.07883074693381786, + "rewards/format_reward": 0.0, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 1517.1785888671875, + "epoch": 0.804, + "grad_norm": 0.12855444848537445, + "kl": 0.158447265625, + "learning_rate": 7.72273839962904e-07, + "loss": -0.0166, + "reward": -0.14288493990898132, + "reward_std": 0.1141566876322031, + "rewards/cosine_scaled_reward": -0.07144246716052294, + "rewards/format_reward": 0.0, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 1492.6488342285156, + "epoch": 0.808, + "grad_norm": 0.09727565199136734, + "kl": 0.163818359375, + "learning_rate": 7.695368466124296e-07, + "loss": -0.046, + "reward": -0.1400277316570282, + "reward_std": 0.1363294217735529, + "rewards/cosine_scaled_reward": -0.07001386396586895, + "rewards/format_reward": 0.0, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.5416564941406, + "epoch": 0.812, + "grad_norm": 0.1032966673374176, + "kl": 0.166015625, + "learning_rate": 7.667891533457718e-07, + "loss": -0.0224, + "reward": -0.1764848232269287, + "reward_std": 0.15339024364948273, + "rewards/cosine_scaled_reward": -0.08824240230023861, + "rewards/format_reward": 0.0, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.7202453613281, + "epoch": 0.816, + "grad_norm": 0.15399117767810822, + "kl": 0.175537109375, + "learning_rate": 7.640308940816239e-07, + "loss": -0.0163, + "reward": -0.14551853574812412, + "reward_std": 0.14542756974697113, + "rewards/cosine_scaled_reward": -0.07275926228612661, + "rewards/format_reward": 0.0, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 1509.761962890625, + "epoch": 0.82, + "grad_norm": 0.0828375294804573, + "kl": 0.165283203125, + "learning_rate": 7.612622032536507e-07, + "loss": -0.0178, + "reward": -0.15090281143784523, + "reward_std": 0.13103190064430237, + "rewards/cosine_scaled_reward": -0.07545140571892262, + "rewards/format_reward": 0.0, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 1530.4583435058594, + "epoch": 0.824, + "grad_norm": 0.09723107516765594, + "kl": 0.156005859375, + "learning_rate": 7.584832158039378e-07, + "loss": 0.0019, + "reward": -0.13943813741207123, + "reward_std": 0.12253955751657486, + "rewards/cosine_scaled_reward": -0.06971907056868076, + "rewards/format_reward": 0.0, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.8928833007812, + "epoch": 0.828, + "grad_norm": 0.10988614708185196, + "kl": 0.16162109375, + "learning_rate": 7.556940671764124e-07, + "loss": -0.0272, + "reward": -0.15701762214303017, + "reward_std": 0.1170128583908081, + "rewards/cosine_scaled_reward": -0.07850880734622478, + "rewards/format_reward": 0.0, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 1525.5952453613281, + "epoch": 0.832, + "grad_norm": 0.11009800434112549, + "kl": 0.171142578125, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0156, + "reward": -0.14172504469752312, + "reward_std": 0.11965266987681389, + "rewards/cosine_scaled_reward": -0.07086252421140671, + "rewards/format_reward": 0.0, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 1520.1011962890625, + "epoch": 0.836, + "grad_norm": 0.15401668846607208, + "kl": 0.16455078125, + "learning_rate": 7.500858306332172e-07, + "loss": -0.0106, + "reward": -0.06414606049656868, + "reward_std": 0.12946532107889652, + "rewards/cosine_scaled_reward": -0.032073031179606915, + "rewards/format_reward": 0.0, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.232177734375, + "epoch": 0.84, + "grad_norm": 0.12647868692874908, + "kl": 0.160400390625, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0009, + "reward": -0.12890829890966415, + "reward_std": 0.1113775297999382, + "rewards/cosine_scaled_reward": -0.06445414572954178, + "rewards/format_reward": 0.0, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 1495.9405212402344, + "epoch": 0.844, + "grad_norm": 0.09751134365797043, + "kl": 0.1451416015625, + "learning_rate": 7.444385869608921e-07, + "loss": -0.0263, + "reward": -0.15089180506765842, + "reward_std": 0.12495366670191288, + "rewards/cosine_scaled_reward": -0.07544590625911951, + "rewards/format_reward": 0.0, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 1517.6845397949219, + "epoch": 0.848, + "grad_norm": 0.10632960498332977, + "kl": 0.172607421875, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0114, + "reward": -0.12592120468616486, + "reward_std": 0.10658309236168861, + "rewards/cosine_scaled_reward": -0.06296060606837273, + "rewards/format_reward": 0.0, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.1726379394531, + "epoch": 0.852, + "grad_norm": 0.08701854944229126, + "kl": 0.173095703125, + "learning_rate": 7.387534371007797e-07, + "loss": -0.0094, + "reward": -0.04494331777095795, + "reward_std": 0.10984733514487743, + "rewards/cosine_scaled_reward": -0.022471658885478973, + "rewards/format_reward": 0.0, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.8392944335938, + "epoch": 0.856, + "grad_norm": 0.10350044071674347, + "kl": 0.1650390625, + "learning_rate": 7.358969934210438e-07, + "loss": -0.0193, + "reward": -0.15005925297737122, + "reward_std": 0.1357622630894184, + "rewards/cosine_scaled_reward": -0.07502962276339531, + "rewards/format_reward": 0.0, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 1500.7738342285156, + "epoch": 0.86, + "grad_norm": 0.08610279858112335, + "kl": 0.17724609375, + "learning_rate": 7.330314893841101e-07, + "loss": -0.0337, + "reward": -0.06414718553423882, + "reward_std": 0.1431224588304758, + "rewards/cosine_scaled_reward": -0.03207359462976456, + "rewards/format_reward": 0.0, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 1495.6667175292969, + "epoch": 0.864, + "grad_norm": 0.0906975269317627, + "kl": 0.17041015625, + "learning_rate": 7.301570646506027e-07, + "loss": -0.0444, + "reward": -0.14620519801974297, + "reward_std": 0.1489107757806778, + "rewards/cosine_scaled_reward": -0.07310259714722633, + "rewards/format_reward": 0.0, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.3810119628906, + "epoch": 0.868, + "grad_norm": 0.08579116314649582, + "kl": 0.16357421875, + "learning_rate": 7.27273859315928e-07, + "loss": -0.0184, + "reward": -0.1731514111161232, + "reward_std": 0.16837183013558388, + "rewards/cosine_scaled_reward": -0.0865757018327713, + "rewards/format_reward": 0.0, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 1524.6488342285156, + "epoch": 0.872, + "grad_norm": 0.0806507021188736, + "kl": 0.1669921875, + "learning_rate": 7.243820139034464e-07, + "loss": -0.0038, + "reward": -0.12116867862641811, + "reward_std": 0.10355196706950665, + "rewards/cosine_scaled_reward": -0.060584343038499355, + "rewards/format_reward": 0.0, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 1489.3809814453125, + "epoch": 0.876, + "grad_norm": 0.07953932136297226, + "kl": 0.18017578125, + "learning_rate": 7.214816693576234e-07, + "loss": -0.037, + "reward": -0.15147988684475422, + "reward_std": 0.13851544074714184, + "rewards/cosine_scaled_reward": -0.07573994528502226, + "rewards/format_reward": 0.0, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.6071472167969, + "epoch": 0.88, + "grad_norm": 0.09334682673215866, + "kl": 0.161376953125, + "learning_rate": 7.185729670371604e-07, + "loss": -0.0124, + "reward": -0.14643241465091705, + "reward_std": 0.12924424931406975, + "rewards/cosine_scaled_reward": -0.07321620732545853, + "rewards/format_reward": 0.0, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 1530.5357360839844, + "epoch": 0.884, + "grad_norm": 0.15875208377838135, + "kl": 0.1796875, + "learning_rate": 7.156560487081051e-07, + "loss": -0.0004, + "reward": -0.12105057947337627, + "reward_std": 0.11511022225022316, + "rewards/cosine_scaled_reward": -0.060525291599333286, + "rewards/format_reward": 0.0, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 1471.8214721679688, + "epoch": 0.888, + "grad_norm": 0.09343700110912323, + "kl": 0.166015625, + "learning_rate": 7.127310565369415e-07, + "loss": -0.0241, + "reward": -0.1482422910630703, + "reward_std": 0.13543636724352837, + "rewards/cosine_scaled_reward": -0.07412114553153515, + "rewards/format_reward": 0.0, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.8214416503906, + "epoch": 0.892, + "grad_norm": 0.12307551503181458, + "kl": 0.165771484375, + "learning_rate": 7.097981330836616e-07, + "loss": -0.0248, + "reward": -0.12069513462483883, + "reward_std": 0.13596191070973873, + "rewards/cosine_scaled_reward": -0.060347567312419415, + "rewards/format_reward": 0.0, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 1536.0, + "epoch": 0.896, + "grad_norm": 0.1183975487947464, + "kl": 0.153564453125, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0062, + "reward": -0.14491389319300652, + "reward_std": 0.1196140144020319, + "rewards/cosine_scaled_reward": -0.07245695032179356, + "rewards/format_reward": 0.0, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 1497.4583435058594, + "epoch": 0.9, + "grad_norm": 0.0833095982670784, + "kl": 0.1728515625, + "learning_rate": 7.039090644965509e-07, + "loss": -0.0371, + "reward": -0.13499305956065655, + "reward_std": 0.12412836588919163, + "rewards/cosine_scaled_reward": -0.06749652978032827, + "rewards/format_reward": 0.0, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 1496.6845397949219, + "epoch": 0.904, + "grad_norm": 0.08985927700996399, + "kl": 0.16650390625, + "learning_rate": 7.009532063876148e-07, + "loss": -0.036, + "reward": -0.13741241209208965, + "reward_std": 0.13803981989622116, + "rewards/cosine_scaled_reward": -0.06870621163398027, + "rewards/format_reward": 0.0, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 1524.4047546386719, + "epoch": 0.908, + "grad_norm": 0.0919218361377716, + "kl": 0.16455078125, + "learning_rate": 6.979899910323624e-07, + "loss": -0.0092, + "reward": -0.1429591029882431, + "reward_std": 0.12002728693187237, + "rewards/cosine_scaled_reward": -0.0714795533567667, + "rewards/format_reward": 0.0, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 1503.6607360839844, + "epoch": 0.912, + "grad_norm": 0.11030226945877075, + "kl": 0.1650390625, + "learning_rate": 6.950195628537299e-07, + "loss": -0.0386, + "reward": -0.13499593548476696, + "reward_std": 0.13771596364676952, + "rewards/cosine_scaled_reward": -0.06749796774238348, + "rewards/format_reward": 0.0, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 1488.8214416503906, + "epoch": 0.916, + "grad_norm": 0.09756813943386078, + "kl": 0.1806640625, + "learning_rate": 6.920420666261961e-07, + "loss": -0.0537, + "reward": -0.13239642046391964, + "reward_std": 0.14560103975236416, + "rewards/cosine_scaled_reward": -0.06619821395725012, + "rewards/format_reward": 0.0, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 1520.875, + "epoch": 0.92, + "grad_norm": 0.13375338912010193, + "kl": 0.183349609375, + "learning_rate": 6.890576474687263e-07, + "loss": -0.0143, + "reward": -0.11103706806898117, + "reward_std": 0.11034407652914524, + "rewards/cosine_scaled_reward": -0.05551853682845831, + "rewards/format_reward": 0.0, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.6369323730469, + "epoch": 0.924, + "grad_norm": 0.09202142059803009, + "kl": 0.14892578125, + "learning_rate": 6.860664508377001e-07, + "loss": -0.0339, + "reward": -0.14039192162454128, + "reward_std": 0.13506866246461868, + "rewards/cosine_scaled_reward": -0.07019595894962549, + "rewards/format_reward": 0.0, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 1520.5476379394531, + "epoch": 0.928, + "grad_norm": 0.098211869597435, + "kl": 0.1650390625, + "learning_rate": 6.83068622519821e-07, + "loss": -0.0127, + "reward": -0.12785408459603786, + "reward_std": 0.12025357596576214, + "rewards/cosine_scaled_reward": -0.06392704229801893, + "rewards/format_reward": 0.0, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.5000305175781, + "epoch": 0.932, + "grad_norm": 0.1020435318350792, + "kl": 0.172119140625, + "learning_rate": 6.800643086250121e-07, + "loss": -0.0108, + "reward": -0.11733199469745159, + "reward_std": 0.09876838512718678, + "rewards/cosine_scaled_reward": -0.058665999211370945, + "rewards/format_reward": 0.0, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.2500305175781, + "epoch": 0.936, + "grad_norm": 0.09378773719072342, + "kl": 0.154541015625, + "learning_rate": 6.770536555792944e-07, + "loss": -0.005, + "reward": -0.12837751768529415, + "reward_std": 0.12461850047111511, + "rewards/cosine_scaled_reward": -0.06418875977396965, + "rewards/format_reward": 0.0, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.8750305175781, + "epoch": 0.94, + "grad_norm": 0.1036522388458252, + "kl": 0.1611328125, + "learning_rate": 6.740368101176495e-07, + "loss": -0.0073, + "reward": -0.12727641500532627, + "reward_std": 0.11483397521078587, + "rewards/cosine_scaled_reward": -0.06363820657134056, + "rewards/format_reward": 0.0, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.732177734375, + "epoch": 0.944, + "grad_norm": 0.12251052260398865, + "kl": 0.1640625, + "learning_rate": 6.710139192768694e-07, + "loss": -0.0118, + "reward": -0.11951213330030441, + "reward_std": 0.1262509860098362, + "rewards/cosine_scaled_reward": -0.059756068512797356, + "rewards/format_reward": 0.0, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 1509.75, + "epoch": 0.948, + "grad_norm": 0.17989245057106018, + "kl": 0.156982421875, + "learning_rate": 6.679851303883891e-07, + "loss": -0.0289, + "reward": -0.13234319165349007, + "reward_std": 0.13814006373286247, + "rewards/cosine_scaled_reward": -0.06617159210145473, + "rewards/format_reward": 0.0, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 1512.8333435058594, + "epoch": 0.952, + "grad_norm": 0.12887263298034668, + "kl": 0.1552734375, + "learning_rate": 6.649505910711058e-07, + "loss": -0.0163, + "reward": -0.13513639941811562, + "reward_std": 0.1341167613863945, + "rewards/cosine_scaled_reward": -0.06756819784641266, + "rewards/format_reward": 0.0, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.7202453613281, + "epoch": 0.956, + "grad_norm": 0.10920631885528564, + "kl": 0.163330078125, + "learning_rate": 6.619104492241847e-07, + "loss": -0.0208, + "reward": -0.11296023428440094, + "reward_std": 0.10206396505236626, + "rewards/cosine_scaled_reward": -0.056480118073523045, + "rewards/format_reward": 0.0, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.2917175292969, + "epoch": 0.96, + "grad_norm": 0.10691919177770615, + "kl": 0.173828125, + "learning_rate": 6.588648530198504e-07, + "loss": -0.0251, + "reward": -0.12773141264915466, + "reward_std": 0.12681871838867664, + "rewards/cosine_scaled_reward": -0.06386570446193218, + "rewards/format_reward": 0.0, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.9940795898438, + "epoch": 0.964, + "grad_norm": 0.12696890532970428, + "kl": 0.1640625, + "learning_rate": 6.558139508961654e-07, + "loss": -0.0235, + "reward": -0.12828433699905872, + "reward_std": 0.12480376102030277, + "rewards/cosine_scaled_reward": -0.06414216570556164, + "rewards/format_reward": 0.0, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 1528.40478515625, + "epoch": 0.968, + "grad_norm": 0.19038116931915283, + "kl": 0.166015625, + "learning_rate": 6.527578915497951e-07, + "loss": -0.0025, + "reward": -0.12644540891051292, + "reward_std": 0.11273947730660439, + "rewards/cosine_scaled_reward": -0.06322270352393389, + "rewards/format_reward": 0.0, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.4404907226562, + "epoch": 0.972, + "grad_norm": 0.1264588087797165, + "kl": 0.1728515625, + "learning_rate": 6.496968239287603e-07, + "loss": -0.0168, + "reward": -0.13079960085451603, + "reward_std": 0.10997281037271023, + "rewards/cosine_scaled_reward": -0.06539979483932257, + "rewards/format_reward": 0.0, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.0119323730469, + "epoch": 0.976, + "grad_norm": 0.08865034580230713, + "kl": 0.177734375, + "learning_rate": 6.466308972251785e-07, + "loss": -0.0154, + "reward": -0.13137424178421497, + "reward_std": 0.1251276545226574, + "rewards/cosine_scaled_reward": -0.06568712089210749, + "rewards/format_reward": 0.0, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 1482.8036193847656, + "epoch": 0.98, + "grad_norm": 0.11429865658283234, + "kl": 0.1796875, + "learning_rate": 6.435602608679916e-07, + "loss": -0.0297, + "reward": -0.13589460216462612, + "reward_std": 0.1405064184218645, + "rewards/cosine_scaled_reward": -0.06794730294495821, + "rewards/format_reward": 0.0, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.5178833007812, + "epoch": 0.984, + "grad_norm": 0.06874290853738785, + "kl": 0.1602783203125, + "learning_rate": 6.404850645156841e-07, + "loss": -0.0348, + "reward": -0.0405126977711916, + "reward_std": 0.10752722714096308, + "rewards/cosine_scaled_reward": -0.020256347954273224, + "rewards/format_reward": 0.0, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 1509.2678833007812, + "epoch": 0.988, + "grad_norm": 0.1023624986410141, + "kl": 0.16748046875, + "learning_rate": 6.374054580489873e-07, + "loss": -0.0321, + "reward": -0.13477133214473724, + "reward_std": 0.13717731088399887, + "rewards/cosine_scaled_reward": -0.06738566607236862, + "rewards/format_reward": 0.0, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 1487.4464416503906, + "epoch": 0.992, + "grad_norm": 0.09231995791196823, + "kl": 0.17333984375, + "learning_rate": 6.343215915635761e-07, + "loss": -0.0601, + "reward": -0.12894214503467083, + "reward_std": 0.1458846628665924, + "rewards/cosine_scaled_reward": -0.06447107251733541, + "rewards/format_reward": 0.0, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.107177734375, + "epoch": 0.996, + "grad_norm": 0.08529460430145264, + "kl": 0.171630859375, + "learning_rate": 6.31233615362752e-07, + "loss": -0.0203, + "reward": -0.11800242215394974, + "reward_std": 0.12636969611048698, + "rewards/cosine_scaled_reward": -0.059001206420361996, + "rewards/format_reward": 0.0, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 1524.5001220703125, + "epoch": 1.0, + "grad_norm": 0.08003545552492142, + "kl": 0.176513671875, + "learning_rate": 6.281416799501187e-07, + "loss": -0.0263, + "reward": -0.11569427512586117, + "reward_std": 0.10093419812619686, + "rewards/cosine_scaled_reward": -0.05784713663160801, + "rewards/format_reward": 0.0, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.452392578125, + "epoch": 1.004, + "grad_norm": 0.0908510684967041, + "kl": 0.1728515625, + "learning_rate": 6.25045936022246e-07, + "loss": -0.0223, + "reward": -0.10760886035859585, + "reward_std": 0.10886831395328045, + "rewards/cosine_scaled_reward": -0.053804428316652775, + "rewards/format_reward": 0.0, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.6250305175781, + "epoch": 1.008, + "grad_norm": 0.12224633246660233, + "kl": 0.18212890625, + "learning_rate": 6.219465344613258e-07, + "loss": -0.0196, + "reward": -0.12489514984190464, + "reward_std": 0.12947135604918003, + "rewards/cosine_scaled_reward": -0.062447573989629745, + "rewards/format_reward": 0.0, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.5654907226562, + "epoch": 1.012, + "grad_norm": 0.13140302896499634, + "kl": 0.1845703125, + "learning_rate": 6.188436263278172e-07, + "loss": -0.0149, + "reward": -0.12439864501357079, + "reward_std": 0.12002300284802914, + "rewards/cosine_scaled_reward": -0.06219932623207569, + "rewards/format_reward": 0.0, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.0357360839844, + "epoch": 1.016, + "grad_norm": 0.11975951492786407, + "kl": 0.17724609375, + "learning_rate": 6.157373628530852e-07, + "loss": -0.0206, + "reward": -0.1344124898314476, + "reward_std": 0.12755578383803368, + "rewards/cosine_scaled_reward": -0.0672062449157238, + "rewards/format_reward": 0.0, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.5, + "epoch": 1.02, + "grad_norm": 0.12696978449821472, + "kl": 0.18701171875, + "learning_rate": 6.126278954320294e-07, + "loss": -0.0138, + "reward": -0.13142068311572075, + "reward_std": 0.13189822621643543, + "rewards/cosine_scaled_reward": -0.0657103369012475, + "rewards/format_reward": 0.0, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.4642944335938, + "epoch": 1.024, + "grad_norm": 0.11200578510761261, + "kl": 0.177001953125, + "learning_rate": 6.095153756157051e-07, + "loss": -0.036, + "reward": -0.12037310004234314, + "reward_std": 0.11811047606170177, + "rewards/cosine_scaled_reward": -0.06018654815852642, + "rewards/format_reward": 0.0, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 1499.672607421875, + "epoch": 1.028, + "grad_norm": 0.1471128910779953, + "kl": 0.172607421875, + "learning_rate": 6.06399955103937e-07, + "loss": -0.0249, + "reward": -0.12174288742244244, + "reward_std": 0.1309817060828209, + "rewards/cosine_scaled_reward": -0.06087144184857607, + "rewards/format_reward": 0.0, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 1520.2559814453125, + "epoch": 1.032, + "grad_norm": 0.10926475375890732, + "kl": 0.170166015625, + "learning_rate": 6.032817857379256e-07, + "loss": -0.0154, + "reward": -0.13306790590286255, + "reward_std": 0.1421681884676218, + "rewards/cosine_scaled_reward": -0.06653395667672157, + "rewards/format_reward": 0.0, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 1523.1666870117188, + "epoch": 1.036, + "grad_norm": 0.15364930033683777, + "kl": 0.1787109375, + "learning_rate": 6.001610194928464e-07, + "loss": -0.0083, + "reward": -0.11868332512676716, + "reward_std": 0.11832969635725021, + "rewards/cosine_scaled_reward": -0.059341663494706154, + "rewards/format_reward": 0.0, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.9583435058594, + "epoch": 1.04, + "grad_norm": 0.10218587517738342, + "kl": 0.163330078125, + "learning_rate": 5.97037808470444e-07, + "loss": -0.0278, + "reward": -0.12172269076108932, + "reward_std": 0.120206318795681, + "rewards/cosine_scaled_reward": -0.06086134351789951, + "rewards/format_reward": 0.0, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 1503.9880981445312, + "epoch": 1.044, + "grad_norm": 0.11966069042682648, + "kl": 0.16552734375, + "learning_rate": 5.939123048916173e-07, + "loss": -0.0226, + "reward": -0.11986922658979893, + "reward_std": 0.1164070088416338, + "rewards/cosine_scaled_reward": -0.05993461608886719, + "rewards/format_reward": 0.0, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.857177734375, + "epoch": 1.048, + "grad_norm": 0.11268185824155807, + "kl": 0.161865234375, + "learning_rate": 5.907846610890011e-07, + "loss": -0.037, + "reward": -0.11277966573834419, + "reward_std": 0.11542832851409912, + "rewards/cosine_scaled_reward": -0.05638983380049467, + "rewards/format_reward": 0.0, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.702392578125, + "epoch": 1.052, + "grad_norm": 0.1593422144651413, + "kl": 0.1494140625, + "learning_rate": 5.87655029499542e-07, + "loss": -0.0086, + "reward": -0.11897107027471066, + "reward_std": 0.12276079133152962, + "rewards/cosine_scaled_reward": -0.0594855360686779, + "rewards/format_reward": 0.0, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 1520.1904907226562, + "epoch": 1.056, + "grad_norm": 0.09266054630279541, + "kl": 0.1494140625, + "learning_rate": 5.845235626570683e-07, + "loss": -0.0117, + "reward": -0.10893694311380386, + "reward_std": 0.11326288990676403, + "rewards/cosine_scaled_reward": -0.05446847062557936, + "rewards/format_reward": 0.0, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 1499.922607421875, + "epoch": 1.06, + "grad_norm": 0.10393787920475006, + "kl": 0.157470703125, + "learning_rate": 5.813904131848564e-07, + "loss": -0.0333, + "reward": -0.12131388112902641, + "reward_std": 0.1239698026329279, + "rewards/cosine_scaled_reward": -0.06065694149583578, + "rewards/format_reward": 0.0, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.3511962890625, + "epoch": 1.064, + "grad_norm": 0.0964948907494545, + "kl": 0.164794921875, + "learning_rate": 5.78255733788191e-07, + "loss": -0.0206, + "reward": -0.10519931092858315, + "reward_std": 0.11078912951052189, + "rewards/cosine_scaled_reward": -0.05259965639561415, + "rewards/format_reward": 0.0, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.2798156738281, + "epoch": 1.068, + "grad_norm": 0.1041577160358429, + "kl": 0.167724609375, + "learning_rate": 5.751196772469237e-07, + "loss": -0.028, + "reward": -0.11491566710174084, + "reward_std": 0.13411646336317062, + "rewards/cosine_scaled_reward": -0.05745783355087042, + "rewards/format_reward": 0.0, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 1527.4285888671875, + "epoch": 1.072, + "grad_norm": 0.12076977640390396, + "kl": 0.172119140625, + "learning_rate": 5.71982396408026e-07, + "loss": -0.0013, + "reward": -0.09519998729228973, + "reward_std": 0.09795477986335754, + "rewards/cosine_scaled_reward": -0.04759999457746744, + "rewards/format_reward": 0.0, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.1964721679688, + "epoch": 1.076, + "grad_norm": 0.15113261342048645, + "kl": 0.16552734375, + "learning_rate": 5.688440441781398e-07, + "loss": -0.0375, + "reward": -0.11081114411354065, + "reward_std": 0.12273510918021202, + "rewards/cosine_scaled_reward": -0.05540557578206062, + "rewards/format_reward": 0.0, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 1536.0, + "epoch": 1.08, + "grad_norm": 0.14382889866828918, + "kl": 0.154296875, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0062, + "reward": -0.09718998149037361, + "reward_std": 0.09087535366415977, + "rewards/cosine_scaled_reward": -0.04859498701989651, + "rewards/format_reward": 0.0, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 1500.0, + "epoch": 1.084, + "grad_norm": 0.16920308768749237, + "kl": 0.1630859375, + "learning_rate": 5.625647374256061e-07, + "loss": -0.021, + "reward": -0.1035971287637949, + "reward_std": 0.11180637776851654, + "rewards/cosine_scaled_reward": -0.051798563450574875, + "rewards/format_reward": 0.0, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 1490.7143249511719, + "epoch": 1.088, + "grad_norm": 0.13314908742904663, + "kl": 0.157470703125, + "learning_rate": 5.594240889475106e-07, + "loss": -0.037, + "reward": -0.11400851234793663, + "reward_std": 0.12938306666910648, + "rewards/cosine_scaled_reward": -0.05700425896793604, + "rewards/format_reward": 0.0, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.9464721679688, + "epoch": 1.092, + "grad_norm": 0.07967788726091385, + "kl": 0.166748046875, + "learning_rate": 5.562829811526154e-07, + "loss": -0.0369, + "reward": -0.10690408386290073, + "reward_std": 0.12804840318858624, + "rewards/cosine_scaled_reward": -0.053452043794095516, + "rewards/format_reward": 0.0, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 1529.7440795898438, + "epoch": 1.096, + "grad_norm": 0.09101837128400803, + "kl": 0.1650390625, + "learning_rate": 5.531415671340826e-07, + "loss": -0.0001, + "reward": -0.10337771661579609, + "reward_std": 0.10489914752542973, + "rewards/cosine_scaled_reward": -0.051688858307898045, + "rewards/format_reward": 0.0, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 1527.9404907226562, + "epoch": 1.1, + "grad_norm": 0.16794751584529877, + "kl": 0.168701171875, + "learning_rate": 5.5e-07, + "loss": -0.0041, + "reward": -0.08096367586404085, + "reward_std": 0.08801300823688507, + "rewards/cosine_scaled_reward": -0.040481837932020426, + "rewards/format_reward": 0.0, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 1512.3274230957031, + "epoch": 1.104, + "grad_norm": 0.0922553613781929, + "kl": 0.161376953125, + "learning_rate": 5.468584328659172e-07, + "loss": -0.0232, + "reward": -0.1129224356263876, + "reward_std": 0.12950855493545532, + "rewards/cosine_scaled_reward": -0.056461221538484097, + "rewards/format_reward": 0.0, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.65478515625, + "epoch": 1.108, + "grad_norm": 0.1329527050256729, + "kl": 0.167236328125, + "learning_rate": 5.437170188473847e-07, + "loss": -0.0302, + "reward": -0.11213578283786774, + "reward_std": 0.11713657341897488, + "rewards/cosine_scaled_reward": -0.05606789421290159, + "rewards/format_reward": 0.0, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.0, + "epoch": 1.112, + "grad_norm": 0.10364029556512833, + "kl": 0.163818359375, + "learning_rate": 5.405759110524894e-07, + "loss": -0.0298, + "reward": -0.10770672746002674, + "reward_std": 0.11716131307184696, + "rewards/cosine_scaled_reward": -0.053853364661335945, + "rewards/format_reward": 0.0, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 1474.4702453613281, + "epoch": 1.116, + "grad_norm": 0.1940528154373169, + "kl": 0.17138671875, + "learning_rate": 5.37435262574394e-07, + "loss": -0.0549, + "reward": -0.06423771567642689, + "reward_std": 0.14231404848396778, + "rewards/cosine_scaled_reward": -0.03211886156350374, + "rewards/format_reward": 0.0, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.9821472167969, + "epoch": 1.12, + "grad_norm": 0.09174785017967224, + "kl": 0.165283203125, + "learning_rate": 5.342952264838747e-07, + "loss": -0.0167, + "reward": -0.12564732693135738, + "reward_std": 0.12320295721292496, + "rewards/cosine_scaled_reward": -0.06282366160303354, + "rewards/format_reward": 0.0, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 1525.8214416503906, + "epoch": 1.124, + "grad_norm": 0.11484113335609436, + "kl": 0.16650390625, + "learning_rate": 5.311559558218603e-07, + "loss": -0.0015, + "reward": -0.09635571762919426, + "reward_std": 0.09338909015059471, + "rewards/cosine_scaled_reward": -0.048177859745919704, + "rewards/format_reward": 0.0, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.0714721679688, + "epoch": 1.1280000000000001, + "grad_norm": 0.1825956404209137, + "kl": 0.158203125, + "learning_rate": 5.28017603591974e-07, + "loss": -0.0234, + "reward": -0.11337108165025711, + "reward_std": 0.1339975707232952, + "rewards/cosine_scaled_reward": -0.05668553803116083, + "rewards/format_reward": 0.0, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.6964416503906, + "epoch": 1.1320000000000001, + "grad_norm": 0.1683872640132904, + "kl": 0.185546875, + "learning_rate": 5.248803227530763e-07, + "loss": -0.0202, + "reward": -0.1038710568100214, + "reward_std": 0.1097688339650631, + "rewards/cosine_scaled_reward": -0.051935529336333275, + "rewards/format_reward": 0.0, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 1528.0595397949219, + "epoch": 1.1360000000000001, + "grad_norm": 0.10006389766931534, + "kl": 0.1591796875, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0012, + "reward": -0.12082774937152863, + "reward_std": 0.11856056936085224, + "rewards/cosine_scaled_reward": -0.06041387468576431, + "rewards/format_reward": 0.0, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.7916870117188, + "epoch": 1.1400000000000001, + "grad_norm": 0.1140761449933052, + "kl": 0.156982421875, + "learning_rate": 5.186095868151436e-07, + "loss": -0.035, + "reward": -0.11102944053709507, + "reward_std": 0.143316388130188, + "rewards/cosine_scaled_reward": -0.05551472119987011, + "rewards/format_reward": 0.0, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 1529.6309509277344, + "epoch": 1.144, + "grad_norm": 0.09991593658924103, + "kl": 0.158447265625, + "learning_rate": 5.154764373429315e-07, + "loss": -0.0016, + "reward": -0.11252638325095177, + "reward_std": 0.11945481784641743, + "rewards/cosine_scaled_reward": -0.05626319348812103, + "rewards/format_reward": 0.0, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 1520.4821472167969, + "epoch": 1.148, + "grad_norm": 0.12035401910543442, + "kl": 0.17529296875, + "learning_rate": 5.123449705004581e-07, + "loss": -0.0112, + "reward": -0.1297306139022112, + "reward_std": 0.13188758678734303, + "rewards/cosine_scaled_reward": -0.0648653069511056, + "rewards/format_reward": 0.0, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.7083740234375, + "epoch": 1.152, + "grad_norm": 0.11799659579992294, + "kl": 0.169189453125, + "learning_rate": 5.09215338910999e-07, + "loss": -0.0217, + "reward": -0.10410146042704582, + "reward_std": 0.10413151048123837, + "rewards/cosine_scaled_reward": -0.05205072835087776, + "rewards/format_reward": 0.0, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 1496.8512268066406, + "epoch": 1.156, + "grad_norm": 0.1316956877708435, + "kl": 0.15869140625, + "learning_rate": 5.060876951083828e-07, + "loss": -0.0207, + "reward": -0.11279321648180485, + "reward_std": 0.11689519137144089, + "rewards/cosine_scaled_reward": -0.05639660730957985, + "rewards/format_reward": 0.0, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.6369323730469, + "epoch": 1.16, + "grad_norm": 0.07206040620803833, + "kl": 0.165283203125, + "learning_rate": 5.02962191529556e-07, + "loss": -0.0101, + "reward": -0.11338524892926216, + "reward_std": 0.10832424648106098, + "rewards/cosine_scaled_reward": -0.05669262260198593, + "rewards/format_reward": 0.0, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.3511962890625, + "epoch": 1.164, + "grad_norm": 0.20136146247386932, + "kl": 0.164794921875, + "learning_rate": 4.998389805071536e-07, + "loss": -0.0202, + "reward": -0.12464358657598495, + "reward_std": 0.10932311788201332, + "rewards/cosine_scaled_reward": -0.06232179142534733, + "rewards/format_reward": 0.0, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.1547546386719, + "epoch": 1.168, + "grad_norm": 0.10925502330064774, + "kl": 0.185791015625, + "learning_rate": 4.967182142620745e-07, + "loss": 0.003, + "reward": -0.09491665475070477, + "reward_std": 0.08971338160336018, + "rewards/cosine_scaled_reward": -0.04745833110064268, + "rewards/format_reward": 0.0, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.0535888671875, + "epoch": 1.172, + "grad_norm": 0.09527327865362167, + "kl": 0.173828125, + "learning_rate": 4.93600044896063e-07, + "loss": -0.0318, + "reward": -0.13249907828867435, + "reward_std": 0.1209456454962492, + "rewards/cosine_scaled_reward": -0.06624954286962748, + "rewards/format_reward": 0.0, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.3869323730469, + "epoch": 1.176, + "grad_norm": 0.13041535019874573, + "kl": 0.188720703125, + "learning_rate": 4.904846243842949e-07, + "loss": -0.015, + "reward": -0.11728105135262012, + "reward_std": 0.11388706415891647, + "rewards/cosine_scaled_reward": -0.058640528470277786, + "rewards/format_reward": 0.0, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.1488342285156, + "epoch": 1.18, + "grad_norm": 0.13715095818042755, + "kl": 0.184814453125, + "learning_rate": 4.873721045679706e-07, + "loss": -0.0191, + "reward": -0.10163358226418495, + "reward_std": 0.10592619515955448, + "rewards/cosine_scaled_reward": -0.05081678926944733, + "rewards/format_reward": 0.0, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 1517.1548156738281, + "epoch": 1.184, + "grad_norm": 0.10485463589429855, + "kl": 0.172119140625, + "learning_rate": 4.842626371469149e-07, + "loss": -0.0183, + "reward": -0.11159243248403072, + "reward_std": 0.11549564823508263, + "rewards/cosine_scaled_reward": -0.05579621531069279, + "rewards/format_reward": 0.0, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.4702453613281, + "epoch": 1.188, + "grad_norm": 0.16054122149944305, + "kl": 0.19091796875, + "learning_rate": 4.811563736721829e-07, + "loss": -0.0199, + "reward": -0.10421715676784515, + "reward_std": 0.10152514837682247, + "rewards/cosine_scaled_reward": -0.05210857838392258, + "rewards/format_reward": 0.0, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.8630981445312, + "epoch": 1.192, + "grad_norm": 0.1103135496377945, + "kl": 0.167236328125, + "learning_rate": 4.780534655386743e-07, + "loss": -0.0185, + "reward": -0.11138204857707024, + "reward_std": 0.12202793546020985, + "rewards/cosine_scaled_reward": -0.05569102708250284, + "rewards/format_reward": 0.0, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.1905212402344, + "epoch": 1.196, + "grad_norm": 0.13167892396450043, + "kl": 0.165283203125, + "learning_rate": 4.749540639777539e-07, + "loss": -0.0422, + "reward": -0.12753658182919025, + "reward_std": 0.15048057585954666, + "rewards/cosine_scaled_reward": -0.06376829091459513, + "rewards/format_reward": 0.0, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.3333740234375, + "epoch": 1.2, + "grad_norm": 0.11807087808847427, + "kl": 0.1787109375, + "learning_rate": 4.7185832004988133e-07, + "loss": -0.0233, + "reward": -0.10466483794152737, + "reward_std": 0.09747852385044098, + "rewards/cosine_scaled_reward": -0.05233241897076368, + "rewards/format_reward": 0.0, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.6369018554688, + "epoch": 1.204, + "grad_norm": 0.10298826545476913, + "kl": 0.17041015625, + "learning_rate": 4.68766384637248e-07, + "loss": -0.0369, + "reward": -0.030609130859375, + "reward_std": 0.1283973567187786, + "rewards/cosine_scaled_reward": -0.0153045654296875, + "rewards/format_reward": 0.0, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 1512.0119323730469, + "epoch": 1.208, + "grad_norm": 0.11705794930458069, + "kl": 0.16259765625, + "learning_rate": 4.656784084364238e-07, + "loss": -0.0288, + "reward": -0.1029995009303093, + "reward_std": 0.1215117834508419, + "rewards/cosine_scaled_reward": -0.0514997486025095, + "rewards/format_reward": 0.0, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 1512.077392578125, + "epoch": 1.212, + "grad_norm": 0.13367965817451477, + "kl": 0.186279296875, + "learning_rate": 4.6259454195101267e-07, + "loss": -0.024, + "reward": -0.1126109603792429, + "reward_std": 0.11090057343244553, + "rewards/cosine_scaled_reward": -0.05630548112094402, + "rewards/format_reward": 0.0, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.8511962890625, + "epoch": 1.216, + "grad_norm": 0.1228313073515892, + "kl": 0.17578125, + "learning_rate": 4.59514935484316e-07, + "loss": -0.0105, + "reward": -0.08529405388981104, + "reward_std": 0.08704109024256468, + "rewards/cosine_scaled_reward": -0.042647027876228094, + "rewards/format_reward": 0.0, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 1480.511962890625, + "epoch": 1.22, + "grad_norm": 0.12493617087602615, + "kl": 0.1513671875, + "learning_rate": 4.5643973913200837e-07, + "loss": -0.0485, + "reward": -0.12973922118544579, + "reward_std": 0.15159142762422562, + "rewards/cosine_scaled_reward": -0.06486961059272289, + "rewards/format_reward": 0.0, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 1525.3988342285156, + "epoch": 1.224, + "grad_norm": 0.08764777332544327, + "kl": 0.165283203125, + "learning_rate": 4.5336910277482155e-07, + "loss": -0.006, + "reward": -0.08825473487377167, + "reward_std": 0.10441902838647366, + "rewards/cosine_scaled_reward": -0.04412736464291811, + "rewards/format_reward": 0.0, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 1529.3869018554688, + "epoch": 1.228, + "grad_norm": 0.08289927244186401, + "kl": 0.169677734375, + "learning_rate": 4.503031760712397e-07, + "loss": -0.003, + "reward": -0.07872879132628441, + "reward_std": 0.07979759760200977, + "rewards/cosine_scaled_reward": -0.03936439473181963, + "rewards/format_reward": 0.0, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.3630981445312, + "epoch": 1.232, + "grad_norm": 0.14125800132751465, + "kl": 0.17578125, + "learning_rate": 4.4724210845020494e-07, + "loss": -0.0177, + "reward": -0.08932580798864365, + "reward_std": 0.08838274143636227, + "rewards/cosine_scaled_reward": -0.04466290678828955, + "rewards/format_reward": 0.0, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 1499.9642944335938, + "epoch": 1.236, + "grad_norm": 0.1731095314025879, + "kl": 0.146484375, + "learning_rate": 4.441860491038345e-07, + "loss": -0.0359, + "reward": -0.10843771509826183, + "reward_std": 0.11413087695837021, + "rewards/cosine_scaled_reward": -0.05421885754913092, + "rewards/format_reward": 0.0, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 1494.4702758789062, + "epoch": 1.24, + "grad_norm": 0.2260572761297226, + "kl": 0.164306640625, + "learning_rate": 4.4113514698014953e-07, + "loss": -0.0351, + "reward": -0.11317398212850094, + "reward_std": 0.124705346301198, + "rewards/cosine_scaled_reward": -0.05658698920160532, + "rewards/format_reward": 0.0, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.1428833007812, + "epoch": 1.244, + "grad_norm": 0.10579942166805267, + "kl": 0.17333984375, + "learning_rate": 4.3808955077581546e-07, + "loss": -0.0133, + "reward": -0.10701234266161919, + "reward_std": 0.11289746686816216, + "rewards/cosine_scaled_reward": -0.05350616853684187, + "rewards/format_reward": 0.0, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 1499.4940795898438, + "epoch": 1.248, + "grad_norm": 0.12452121824026108, + "kl": 0.151611328125, + "learning_rate": 4.350494089288943e-07, + "loss": -0.0416, + "reward": -0.12627964280545712, + "reward_std": 0.1439381241798401, + "rewards/cosine_scaled_reward": -0.06313982233405113, + "rewards/format_reward": 0.0, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.7559814453125, + "epoch": 1.252, + "grad_norm": 0.14036187529563904, + "kl": 0.164306640625, + "learning_rate": 4.3201486961161093e-07, + "loss": -0.0152, + "reward": -0.08883011713624, + "reward_std": 0.09118240885436535, + "rewards/cosine_scaled_reward": -0.0444150622934103, + "rewards/format_reward": 0.0, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 1525.2440795898438, + "epoch": 1.256, + "grad_norm": 0.1239011213183403, + "kl": 0.165283203125, + "learning_rate": 4.2898608072313045e-07, + "loss": -0.0058, + "reward": -0.10498439148068428, + "reward_std": 0.11687885224819183, + "rewards/cosine_scaled_reward": -0.05249219387769699, + "rewards/format_reward": 0.0, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 1490.8631286621094, + "epoch": 1.26, + "grad_norm": 0.09535997360944748, + "kl": 0.159423828125, + "learning_rate": 4.2596318988235037e-07, + "loss": -0.0165, + "reward": -0.02371996082365513, + "reward_std": 0.09539724607020617, + "rewards/cosine_scaled_reward": -0.011859980411827564, + "rewards/format_reward": 0.0, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 1494.3988342285156, + "epoch": 1.264, + "grad_norm": 0.11306377500295639, + "kl": 0.161865234375, + "learning_rate": 4.2294634442070553e-07, + "loss": -0.0466, + "reward": -0.03322407230734825, + "reward_std": 0.12497628107666969, + "rewards/cosine_scaled_reward": -0.0166120370849967, + "rewards/format_reward": 0.0, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 1530.4166870117188, + "epoch": 1.268, + "grad_norm": 0.11471430212259293, + "kl": 0.160888671875, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0016, + "reward": -0.08301959745585918, + "reward_std": 0.08951563201844692, + "rewards/cosine_scaled_reward": -0.04150979872792959, + "rewards/format_reward": 0.0, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 1523.1607055664062, + "epoch": 1.272, + "grad_norm": 0.18867182731628418, + "kl": 0.155029296875, + "learning_rate": 4.1693137748017915e-07, + "loss": -0.0106, + "reward": -0.10158013552427292, + "reward_std": 0.10436173714697361, + "rewards/cosine_scaled_reward": -0.050790068693459034, + "rewards/format_reward": 0.0, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 1492.1548156738281, + "epoch": 1.276, + "grad_norm": 0.1889064460992813, + "kl": 0.16552734375, + "learning_rate": 4.1393354916230005e-07, + "loss": -0.0519, + "reward": -0.1043181549757719, + "reward_std": 0.13521616160869598, + "rewards/cosine_scaled_reward": -0.0521590793505311, + "rewards/format_reward": 0.0, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.2261962890625, + "epoch": 1.28, + "grad_norm": 0.12251739203929901, + "kl": 0.18359375, + "learning_rate": 4.1094235253127374e-07, + "loss": -0.0218, + "reward": -0.0895642340183258, + "reward_std": 0.11007728800177574, + "rewards/cosine_scaled_reward": -0.04478211794048548, + "rewards/format_reward": 0.0, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 1486.0059814453125, + "epoch": 1.284, + "grad_norm": 0.11772434413433075, + "kl": 0.152587890625, + "learning_rate": 4.079579333738039e-07, + "loss": -0.0484, + "reward": -0.1089986227452755, + "reward_std": 0.12628877721726894, + "rewards/cosine_scaled_reward": -0.0544993132352829, + "rewards/format_reward": 0.0, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.9940490722656, + "epoch": 1.288, + "grad_norm": 0.11078700423240662, + "kl": 0.1650390625, + "learning_rate": 4.0498043714627006e-07, + "loss": -0.0131, + "reward": -0.0795932961627841, + "reward_std": 0.08838632330298424, + "rewards/cosine_scaled_reward": -0.03979664808139205, + "rewards/format_reward": 0.0, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 1493.6488342285156, + "epoch": 1.292, + "grad_norm": 0.11576636880636215, + "kl": 0.16064453125, + "learning_rate": 4.020100089676376e-07, + "loss": -0.0516, + "reward": -0.09283129125833511, + "reward_std": 0.11138802394270897, + "rewards/cosine_scaled_reward": -0.04641564283519983, + "rewards/format_reward": 0.0, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.1904907226562, + "epoch": 1.296, + "grad_norm": 0.0929921567440033, + "kl": 0.163818359375, + "learning_rate": 3.9904679361238526e-07, + "loss": -0.0256, + "reward": -0.11852945201098919, + "reward_std": 0.14786842092871666, + "rewards/cosine_scaled_reward": -0.059264726005494595, + "rewards/format_reward": 0.0, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.8809814453125, + "epoch": 1.3, + "grad_norm": 0.10295694321393967, + "kl": 0.151123046875, + "learning_rate": 3.9609093550344907e-07, + "loss": -0.025, + "reward": -0.09280366078019142, + "reward_std": 0.1096403207629919, + "rewards/cosine_scaled_reward": -0.046401829458773136, + "rewards/format_reward": 0.0, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 1526.6428833007812, + "epoch": 1.304, + "grad_norm": 0.08783736079931259, + "kl": 0.15576171875, + "learning_rate": 3.931425787051832e-07, + "loss": -0.0069, + "reward": -0.10956737771630287, + "reward_std": 0.11006363853812218, + "rewards/cosine_scaled_reward": -0.054783688858151436, + "rewards/format_reward": 0.0, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 1495.7024230957031, + "epoch": 1.308, + "grad_norm": 0.10409428924322128, + "kl": 0.15234375, + "learning_rate": 3.902018669163384e-07, + "loss": -0.0457, + "reward": -0.10653090476989746, + "reward_std": 0.12193662486970425, + "rewards/cosine_scaled_reward": -0.053265451453626156, + "rewards/format_reward": 0.0, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 1530.3988037109375, + "epoch": 1.312, + "grad_norm": 0.09973278641700745, + "kl": 0.151123046875, + "learning_rate": 3.872689434630585e-07, + "loss": -0.0022, + "reward": -0.08937697485089302, + "reward_std": 0.09377033449709415, + "rewards/cosine_scaled_reward": -0.04468849208205938, + "rewards/format_reward": 0.0, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 1499.6607360839844, + "epoch": 1.316, + "grad_norm": 0.13606639206409454, + "kl": 0.166259765625, + "learning_rate": 3.843439512918949e-07, + "loss": -0.0237, + "reward": -0.11537123657763004, + "reward_std": 0.1290461514145136, + "rewards/cosine_scaled_reward": -0.05768561642616987, + "rewards/format_reward": 0.0, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.7857666015625, + "epoch": 1.32, + "grad_norm": 0.10613211989402771, + "kl": 0.167724609375, + "learning_rate": 3.8142703296283953e-07, + "loss": -0.0159, + "reward": -0.09533977694809437, + "reward_std": 0.10748440586030483, + "rewards/cosine_scaled_reward": -0.04766988940536976, + "rewards/format_reward": 0.0, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.4464721679688, + "epoch": 1.324, + "grad_norm": 0.17454038560390472, + "kl": 0.170166015625, + "learning_rate": 3.785183306423767e-07, + "loss": -0.0282, + "reward": -0.015690762549638748, + "reward_std": 0.0955708883702755, + "rewards/cosine_scaled_reward": -0.007845382206141949, + "rewards/format_reward": 0.0, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 1499.7024230957031, + "epoch": 1.328, + "grad_norm": 0.10816742479801178, + "kl": 0.153564453125, + "learning_rate": 3.7561798609655373e-07, + "loss": -0.0399, + "reward": -0.09764312580227852, + "reward_std": 0.10405797138810158, + "rewards/cosine_scaled_reward": -0.04882156103849411, + "rewards/format_reward": 0.0, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.2738037109375, + "epoch": 1.332, + "grad_norm": 0.09580235928297043, + "kl": 0.1630859375, + "learning_rate": 3.72726140684072e-07, + "loss": -0.0238, + "reward": -0.0930531919002533, + "reward_std": 0.10378883965313435, + "rewards/cosine_scaled_reward": -0.0465265978127718, + "rewards/format_reward": 0.0, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 1497.4762268066406, + "epoch": 1.336, + "grad_norm": 0.18943195044994354, + "kl": 0.186767578125, + "learning_rate": 3.6984293534939737e-07, + "loss": -0.0458, + "reward": -0.09320422261953354, + "reward_std": 0.11783652380108833, + "rewards/cosine_scaled_reward": -0.04660210944712162, + "rewards/format_reward": 0.0, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.4583740234375, + "epoch": 1.34, + "grad_norm": 0.11527442187070847, + "kl": 0.169677734375, + "learning_rate": 3.6696851061588994e-07, + "loss": -0.0222, + "reward": -0.09490611962974072, + "reward_std": 0.106621278449893, + "rewards/cosine_scaled_reward": -0.04745305888354778, + "rewards/format_reward": 0.0, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.3333435058594, + "epoch": 1.3439999999999999, + "grad_norm": 0.1746179610490799, + "kl": 0.15966796875, + "learning_rate": 3.641030065789562e-07, + "loss": -0.0303, + "reward": -0.0963439904153347, + "reward_std": 0.11076842434704304, + "rewards/cosine_scaled_reward": -0.04817199241369963, + "rewards/format_reward": 0.0, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 1512.5893249511719, + "epoch": 1.3479999999999999, + "grad_norm": 0.1353609561920166, + "kl": 0.16748046875, + "learning_rate": 3.612465628992203e-07, + "loss": -0.0247, + "reward": -0.09672348201274872, + "reward_std": 0.1137369517236948, + "rewards/cosine_scaled_reward": -0.04836174100637436, + "rewards/format_reward": 0.0, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.3809509277344, + "epoch": 1.3519999999999999, + "grad_norm": 0.15681445598602295, + "kl": 0.156982421875, + "learning_rate": 3.5839931879571725e-07, + "loss": -0.0329, + "reward": -0.09822369925677776, + "reward_std": 0.11475454457104206, + "rewards/cosine_scaled_reward": -0.04911184962838888, + "rewards/format_reward": 0.0, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 1526.2619323730469, + "epoch": 1.3559999999999999, + "grad_norm": 0.12026900053024292, + "kl": 0.15185546875, + "learning_rate": 3.555614130391079e-07, + "loss": -0.0027, + "reward": -0.09019140899181366, + "reward_std": 0.08315368928015232, + "rewards/cosine_scaled_reward": -0.045095707289874554, + "rewards/format_reward": 0.0, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.3988342285156, + "epoch": 1.3599999999999999, + "grad_norm": 0.189870685338974, + "kl": 0.185791015625, + "learning_rate": 3.5273298394491515e-07, + "loss": -0.0085, + "reward": -0.08764730766415596, + "reward_std": 0.08974755555391312, + "rewards/cosine_scaled_reward": -0.043823654763400555, + "rewards/format_reward": 0.0, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 1526.1309509277344, + "epoch": 1.3639999999999999, + "grad_norm": 0.11866843700408936, + "kl": 0.1502685546875, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0082, + "reward": -0.10054401028901339, + "reward_std": 0.091705821454525, + "rewards/cosine_scaled_reward": -0.05027200886979699, + "rewards/format_reward": 0.0, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.6190795898438, + "epoch": 1.3679999999999999, + "grad_norm": 0.07965697348117828, + "kl": 0.160888671875, + "learning_rate": 3.471051066897562e-07, + "loss": -0.0327, + "reward": -0.03098013624548912, + "reward_std": 0.10922298207879066, + "rewards/cosine_scaled_reward": -0.015490064397454262, + "rewards/format_reward": 0.0, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.702392578125, + "epoch": 1.3719999999999999, + "grad_norm": 0.12758509814739227, + "kl": 0.172607421875, + "learning_rate": 3.4430593282358777e-07, + "loss": -0.0202, + "reward": -0.09887174144387245, + "reward_std": 0.11539069190621376, + "rewards/cosine_scaled_reward": -0.049435872584581375, + "rewards/format_reward": 0.0, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.0535888671875, + "epoch": 1.376, + "grad_norm": 0.09368550777435303, + "kl": 0.18115234375, + "learning_rate": 3.4151678419606233e-07, + "loss": -0.0143, + "reward": -0.09874763153493404, + "reward_std": 0.0960962763056159, + "rewards/cosine_scaled_reward": -0.049373818561434746, + "rewards/format_reward": 0.0, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.8631286621094, + "epoch": 1.38, + "grad_norm": 0.10917885601520538, + "kl": 0.177001953125, + "learning_rate": 3.387377967463493e-07, + "loss": -0.0331, + "reward": -0.10075951926410198, + "reward_std": 0.11745327524840832, + "rewards/cosine_scaled_reward": -0.050379764288663864, + "rewards/format_reward": 0.0, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.7976379394531, + "epoch": 1.384, + "grad_norm": 0.08625519275665283, + "kl": 0.1611328125, + "learning_rate": 3.359691059183761e-07, + "loss": -0.0277, + "reward": -0.11206395924091339, + "reward_std": 0.13379977643489838, + "rewards/cosine_scaled_reward": -0.05603197868913412, + "rewards/format_reward": 0.0, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.77978515625, + "epoch": 1.388, + "grad_norm": 0.09115591645240784, + "kl": 0.164306640625, + "learning_rate": 3.3321084665422803e-07, + "loss": -0.0129, + "reward": -0.08784853294491768, + "reward_std": 0.09035127516835928, + "rewards/cosine_scaled_reward": -0.043924265541136265, + "rewards/format_reward": 0.0, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.4702453613281, + "epoch": 1.392, + "grad_norm": 0.12537740170955658, + "kl": 0.155517578125, + "learning_rate": 3.3046315338757026e-07, + "loss": -0.0323, + "reward": -0.11530621163547039, + "reward_std": 0.1266392320394516, + "rewards/cosine_scaled_reward": -0.05765310861170292, + "rewards/format_reward": 0.0, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 1496.6726379394531, + "epoch": 1.396, + "grad_norm": 0.11020371317863464, + "kl": 0.158935546875, + "learning_rate": 3.2772616003709616e-07, + "loss": -0.0268, + "reward": -0.10983618721365929, + "reward_std": 0.10708382353186607, + "rewards/cosine_scaled_reward": -0.054918091744184494, + "rewards/format_reward": 0.0, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 1530.0416564941406, + "epoch": 1.4, + "grad_norm": 0.17068089544773102, + "kl": 0.163818359375, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0002, + "reward": -0.08832419849932194, + "reward_std": 0.09396599233150482, + "rewards/cosine_scaled_reward": -0.04416209738701582, + "rewards/format_reward": 0.0, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.4345397949219, + "epoch": 1.404, + "grad_norm": 0.1316055804491043, + "kl": 0.1689453125, + "learning_rate": 3.222848061454764e-07, + "loss": -0.0178, + "reward": -0.08349752612411976, + "reward_std": 0.08428733702749014, + "rewards/cosine_scaled_reward": -0.041748762130737305, + "rewards/format_reward": 0.0, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.7976379394531, + "epoch": 1.408, + "grad_norm": 0.1486114114522934, + "kl": 0.16552734375, + "learning_rate": 3.195807108082429e-07, + "loss": -0.0412, + "reward": -0.0832710936665535, + "reward_std": 0.10713749751448631, + "rewards/cosine_scaled_reward": -0.04163554683327675, + "rewards/format_reward": 0.0, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.6428833007812, + "epoch": 1.412, + "grad_norm": 0.12983661890029907, + "kl": 0.163330078125, + "learning_rate": 3.168878457820915e-07, + "loss": -0.0096, + "reward": -0.10671682469546795, + "reward_std": 0.11679115891456604, + "rewards/cosine_scaled_reward": -0.0533584114164114, + "rewards/format_reward": 0.0, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 1497.7976379394531, + "epoch": 1.416, + "grad_norm": 0.09482970088720322, + "kl": 0.163818359375, + "learning_rate": 3.142063423134644e-07, + "loss": -0.0377, + "reward": -0.10173431225121021, + "reward_std": 0.11217576451599598, + "rewards/cosine_scaled_reward": -0.050867156125605106, + "rewards/format_reward": 0.0, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.9226379394531, + "epoch": 1.42, + "grad_norm": 0.11015576124191284, + "kl": 0.16796875, + "learning_rate": 3.115363310950578e-07, + "loss": -0.027, + "reward": -0.10424264334142208, + "reward_std": 0.10744853690266609, + "rewards/cosine_scaled_reward": -0.05212132353335619, + "rewards/format_reward": 0.0, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.7143249511719, + "epoch": 1.424, + "grad_norm": 0.1039690375328064, + "kl": 0.15771484375, + "learning_rate": 3.0887794225945143e-07, + "loss": -0.0104, + "reward": -0.11364280618727207, + "reward_std": 0.11577463708817959, + "rewards/cosine_scaled_reward": -0.05682140402495861, + "rewards/format_reward": 0.0, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.547607421875, + "epoch": 1.428, + "grad_norm": 0.13563141226768494, + "kl": 0.16943359375, + "learning_rate": 3.062313053727671e-07, + "loss": -0.0127, + "reward": -0.09091841243207455, + "reward_std": 0.1005500927567482, + "rewards/cosine_scaled_reward": -0.04545920621603727, + "rewards/format_reward": 0.0, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.1607360839844, + "epoch": 1.432, + "grad_norm": 0.12330485880374908, + "kl": 0.175048828125, + "learning_rate": 3.0359654942835247e-07, + "loss": -0.0276, + "reward": -0.09949876181781292, + "reward_std": 0.10788233578205109, + "rewards/cosine_scaled_reward": -0.049749381840229034, + "rewards/format_reward": 0.0, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 1526.1785888671875, + "epoch": 1.436, + "grad_norm": 0.1008228212594986, + "kl": 0.172607421875, + "learning_rate": 3.0097380284049523e-07, + "loss": -0.0072, + "reward": -0.08119065128266811, + "reward_std": 0.09274793975055218, + "rewards/cosine_scaled_reward": -0.04059532564133406, + "rewards/format_reward": 0.0, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.6964721679688, + "epoch": 1.44, + "grad_norm": 0.11536505818367004, + "kl": 0.1640625, + "learning_rate": 2.9836319343816397e-07, + "loss": -0.0305, + "reward": -0.10905157402157784, + "reward_std": 0.11038926243782043, + "rewards/cosine_scaled_reward": -0.05452578607946634, + "rewards/format_reward": 0.0, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 1518.827392578125, + "epoch": 1.444, + "grad_norm": 0.12276989966630936, + "kl": 0.1514892578125, + "learning_rate": 2.9576484845877793e-07, + "loss": -0.0175, + "reward": -0.08610734064131975, + "reward_std": 0.09063750877976418, + "rewards/cosine_scaled_reward": -0.043053670320659876, + "rewards/format_reward": 0.0, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.4702453613281, + "epoch": 1.448, + "grad_norm": 0.08738084882497787, + "kl": 0.18017578125, + "learning_rate": 2.931788945420058e-07, + "loss": -0.0112, + "reward": -0.09291153028607368, + "reward_std": 0.09842956997454166, + "rewards/cosine_scaled_reward": -0.046455767937004566, + "rewards/format_reward": 0.0, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 1509.0774230957031, + "epoch": 1.452, + "grad_norm": 0.11346267908811569, + "kl": 0.175048828125, + "learning_rate": 2.9060545772359305e-07, + "loss": -0.0277, + "reward": -0.11039301194250584, + "reward_std": 0.12665076181292534, + "rewards/cosine_scaled_reward": -0.05519650410860777, + "rewards/format_reward": 0.0, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 1524.1964416503906, + "epoch": 1.456, + "grad_norm": 0.14776764810085297, + "kl": 0.150634765625, + "learning_rate": 2.8804466342921987e-07, + "loss": -0.006, + "reward": -0.022786946967244148, + "reward_std": 0.10106383822858334, + "rewards/cosine_scaled_reward": -0.011393471620976925, + "rewards/format_reward": 0.0, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 1493.607177734375, + "epoch": 1.46, + "grad_norm": 0.09510252624750137, + "kl": 0.153076171875, + "learning_rate": 2.854966364683872e-07, + "loss": -0.0487, + "reward": -0.10556191392242908, + "reward_std": 0.10932666808366776, + "rewards/cosine_scaled_reward": -0.05278095696121454, + "rewards/format_reward": 0.0, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 1529.875, + "epoch": 1.464, + "grad_norm": 0.15466201305389404, + "kl": 0.15966796875, + "learning_rate": 2.829615010283344e-07, + "loss": 0.0007, + "reward": -0.09042776376008987, + "reward_std": 0.10296636447310448, + "rewards/cosine_scaled_reward": -0.04521388094872236, + "rewards/format_reward": 0.0, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.6786193847656, + "epoch": 1.468, + "grad_norm": 0.08847711980342865, + "kl": 0.17041015625, + "learning_rate": 2.8043938066798645e-07, + "loss": -0.0311, + "reward": -0.10542780347168446, + "reward_std": 0.11852787062525749, + "rewards/cosine_scaled_reward": -0.05271390173584223, + "rewards/format_reward": 0.0, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.7083435058594, + "epoch": 1.472, + "grad_norm": 0.1147918626666069, + "kl": 0.17041015625, + "learning_rate": 2.7793039831193133e-07, + "loss": -0.0404, + "reward": -0.10119456797838211, + "reward_std": 0.1359020471572876, + "rewards/cosine_scaled_reward": -0.050597282126545906, + "rewards/format_reward": 0.0, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.5774230957031, + "epoch": 1.476, + "grad_norm": 0.0867527574300766, + "kl": 0.154052734375, + "learning_rate": 2.7543467624442956e-07, + "loss": -0.0327, + "reward": -0.09615712240338326, + "reward_std": 0.11924017407000065, + "rewards/cosine_scaled_reward": -0.0480785621330142, + "rewards/format_reward": 0.0, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.5357360839844, + "epoch": 1.48, + "grad_norm": 0.07760825008153915, + "kl": 0.172607421875, + "learning_rate": 2.729523361034538e-07, + "loss": -0.012, + "reward": -0.09595928713679314, + "reward_std": 0.10662926360964775, + "rewards/cosine_scaled_reward": -0.04797964543104172, + "rewards/format_reward": 0.0, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.3809814453125, + "epoch": 1.484, + "grad_norm": 0.1310672163963318, + "kl": 0.1671142578125, + "learning_rate": 2.7048349887476037e-07, + "loss": -0.0266, + "reward": -0.08946863748133183, + "reward_std": 0.0914797130972147, + "rewards/cosine_scaled_reward": -0.04473431780934334, + "rewards/format_reward": 0.0, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 1491.5179138183594, + "epoch": 1.488, + "grad_norm": 0.08744286000728607, + "kl": 0.156494140625, + "learning_rate": 2.6802828488599294e-07, + "loss": -0.045, + "reward": -0.1184717956930399, + "reward_std": 0.13941991329193115, + "rewards/cosine_scaled_reward": -0.059235901571810246, + "rewards/format_reward": 0.0, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 1521.0238342285156, + "epoch": 1.492, + "grad_norm": 0.1646253615617752, + "kl": 0.16357421875, + "learning_rate": 2.655868138008171e-07, + "loss": 0.0142, + "reward": -0.084196537733078, + "reward_std": 0.07485349848866463, + "rewards/cosine_scaled_reward": -0.042098269797861576, + "rewards/format_reward": 0.0, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 1523.375, + "epoch": 1.496, + "grad_norm": 0.11430079489946365, + "kl": 0.172607421875, + "learning_rate": 2.631592046130896e-07, + "loss": -0.0099, + "reward": -0.07816067058593035, + "reward_std": 0.08419617265462875, + "rewards/cosine_scaled_reward": -0.03908033389598131, + "rewards/format_reward": 0.0, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.4524230957031, + "epoch": 1.5, + "grad_norm": 0.14677973091602325, + "kl": 0.18603515625, + "learning_rate": 2.6074557564105724e-07, + "loss": -0.0194, + "reward": -0.08955581299960613, + "reward_std": 0.09336170181632042, + "rewards/cosine_scaled_reward": -0.04477790556848049, + "rewards/format_reward": 0.0, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 1527.3988342285156, + "epoch": 1.504, + "grad_norm": 0.12783505022525787, + "kl": 0.159912109375, + "learning_rate": 2.583460445215911e-07, + "loss": -0.0049, + "reward": -0.0952699575573206, + "reward_std": 0.09568927250802517, + "rewards/cosine_scaled_reward": -0.047634975984692574, + "rewards/format_reward": 0.0, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 1520.7678833007812, + "epoch": 1.508, + "grad_norm": 0.1176699697971344, + "kl": 0.1650390625, + "learning_rate": 2.5596072820445254e-07, + "loss": -0.011, + "reward": -0.019147060811519623, + "reward_std": 0.09721549972891808, + "rewards/cosine_scaled_reward": -0.009573530405759811, + "rewards/format_reward": 0.0, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.297607421875, + "epoch": 1.512, + "grad_norm": 0.11060648411512375, + "kl": 0.1650390625, + "learning_rate": 2.5358974294659373e-07, + "loss": -0.0134, + "reward": -0.09460222348570824, + "reward_std": 0.1032384280115366, + "rewards/cosine_scaled_reward": -0.04730111453682184, + "rewards/format_reward": 0.0, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 1524.2916870117188, + "epoch": 1.516, + "grad_norm": 0.12652094662189484, + "kl": 0.16064453125, + "learning_rate": 2.512332043064913e-07, + "loss": -0.0078, + "reward": -0.07960367575287819, + "reward_std": 0.08834364637732506, + "rewards/cosine_scaled_reward": -0.03980184067040682, + "rewards/format_reward": 0.0, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 1524.8750305175781, + "epoch": 1.52, + "grad_norm": 0.10227189213037491, + "kl": 0.16748046875, + "learning_rate": 2.488912271385139e-07, + "loss": -0.0064, + "reward": -0.08977451547980309, + "reward_std": 0.1080553438514471, + "rewards/cosine_scaled_reward": -0.04488725960254669, + "rewards/format_reward": 0.0, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.3511962890625, + "epoch": 1.524, + "grad_norm": 0.12043853104114532, + "kl": 0.17138671875, + "learning_rate": 2.465639255873246e-07, + "loss": -0.035, + "reward": -0.11090395227074623, + "reward_std": 0.12006122805178165, + "rewards/cosine_scaled_reward": -0.05545197706669569, + "rewards/format_reward": 0.0, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 1523.5178833007812, + "epoch": 1.528, + "grad_norm": 0.13229811191558838, + "kl": 0.170654296875, + "learning_rate": 2.4425141308231765e-07, + "loss": -0.0068, + "reward": -0.09728248044848442, + "reward_std": 0.107889199629426, + "rewards/cosine_scaled_reward": -0.048641239292919636, + "rewards/format_reward": 0.0, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 1527.2261962890625, + "epoch": 1.532, + "grad_norm": 0.10695023834705353, + "kl": 0.1630859375, + "learning_rate": 2.4195380233209006e-07, + "loss": -0.0022, + "reward": -0.09213725849986076, + "reward_std": 0.10676849260926247, + "rewards/cosine_scaled_reward": -0.046068630181252956, + "rewards/format_reward": 0.0, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.7083740234375, + "epoch": 1.536, + "grad_norm": 0.13451160490512848, + "kl": 0.150390625, + "learning_rate": 2.3967120531894857e-07, + "loss": -0.0256, + "reward": -0.10359417460858822, + "reward_std": 0.12065772153437138, + "rewards/cosine_scaled_reward": -0.05179708730429411, + "rewards/format_reward": 0.0, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.4940795898438, + "epoch": 1.54, + "grad_norm": 0.1391247659921646, + "kl": 0.16357421875, + "learning_rate": 2.374037332934512e-07, + "loss": -0.0409, + "reward": -0.09379393607378006, + "reward_std": 0.10166217759251595, + "rewards/cosine_scaled_reward": -0.04689696989953518, + "rewards/format_reward": 0.0, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.2500305175781, + "epoch": 1.544, + "grad_norm": 0.1330152153968811, + "kl": 0.166748046875, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.0087, + "reward": -0.07589279673993587, + "reward_std": 0.09089674055576324, + "rewards/cosine_scaled_reward": -0.03794640023261309, + "rewards/format_reward": 0.0, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 1488.6131286621094, + "epoch": 1.548, + "grad_norm": 0.10263092815876007, + "kl": 0.1630859375, + "learning_rate": 2.3291460551638237e-07, + "loss": -0.0521, + "reward": -0.12465786561369896, + "reward_std": 0.1609484814107418, + "rewards/cosine_scaled_reward": -0.06232893466949463, + "rewards/format_reward": 0.0, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.8928833007812, + "epoch": 1.552, + "grad_norm": 0.09812143445014954, + "kl": 0.15966796875, + "learning_rate": 2.306931685585657e-07, + "loss": -0.015, + "reward": -0.0796813191846013, + "reward_std": 0.08767454512417316, + "rewards/cosine_scaled_reward": -0.039840660989284515, + "rewards/format_reward": 0.0, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.2738342285156, + "epoch": 1.556, + "grad_norm": 0.16943664848804474, + "kl": 0.1591796875, + "learning_rate": 2.2848729416523859e-07, + "loss": -0.0254, + "reward": -0.11296515539288521, + "reward_std": 0.12935122102499008, + "rewards/cosine_scaled_reward": -0.05648257676512003, + "rewards/format_reward": 0.0, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.9345397949219, + "epoch": 1.56, + "grad_norm": 0.1270017921924591, + "kl": 0.1513671875, + "learning_rate": 2.2629708984760706e-07, + "loss": -0.0186, + "reward": -0.08384528011083603, + "reward_std": 0.08424858003854752, + "rewards/cosine_scaled_reward": -0.04192264098674059, + "rewards/format_reward": 0.0, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 1525.982177734375, + "epoch": 1.564, + "grad_norm": 0.16950343549251556, + "kl": 0.169921875, + "learning_rate": 2.2412266235313973e-07, + "loss": -0.0058, + "reward": -0.08042520564049482, + "reward_std": 0.08201098442077637, + "rewards/cosine_scaled_reward": -0.04021260142326355, + "rewards/format_reward": 0.0, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.6666564941406, + "epoch": 1.568, + "grad_norm": 0.13040253520011902, + "kl": 0.14990234375, + "learning_rate": 2.2196411766036487e-07, + "loss": -0.0288, + "reward": -0.10378818027675152, + "reward_std": 0.1260694395750761, + "rewards/cosine_scaled_reward": -0.05189409013837576, + "rewards/format_reward": 0.0, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 1518.4047546386719, + "epoch": 1.572, + "grad_norm": 0.11275047063827515, + "kl": 0.1572265625, + "learning_rate": 2.1982156097370557e-07, + "loss": -0.0157, + "reward": -0.09913922101259232, + "reward_std": 0.10591815412044525, + "rewards/cosine_scaled_reward": -0.04956961143761873, + "rewards/format_reward": 0.0, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 1524.8035888671875, + "epoch": 1.576, + "grad_norm": 0.11497998982667923, + "kl": 0.17578125, + "learning_rate": 2.1769509671835223e-07, + "loss": -0.009, + "reward": -0.08646929264068604, + "reward_std": 0.09624841343611479, + "rewards/cosine_scaled_reward": -0.04323464818298817, + "rewards/format_reward": 0.0, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 1518.5416870117188, + "epoch": 1.58, + "grad_norm": 0.19012141227722168, + "kl": 0.148193359375, + "learning_rate": 2.1558482853517253e-07, + "loss": -0.0063, + "reward": -0.09366242028772831, + "reward_std": 0.1069308090955019, + "rewards/cosine_scaled_reward": -0.046831210143864155, + "rewards/format_reward": 0.0, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.1488342285156, + "epoch": 1.584, + "grad_norm": 0.1060405820608139, + "kl": 0.156982421875, + "learning_rate": 2.134908592756607e-07, + "loss": -0.0309, + "reward": -0.10527068562805653, + "reward_std": 0.12328575551509857, + "rewards/cosine_scaled_reward": -0.05263534560799599, + "rewards/format_reward": 0.0, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.7916870117188, + "epoch": 1.588, + "grad_norm": 0.10801802575588226, + "kl": 0.140869140625, + "learning_rate": 2.1141329099692406e-07, + "loss": -0.0205, + "reward": -0.11307091265916824, + "reward_std": 0.123080899938941, + "rewards/cosine_scaled_reward": -0.05653545819222927, + "rewards/format_reward": 0.0, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.5595703125, + "epoch": 1.592, + "grad_norm": 0.08320983499288559, + "kl": 0.1591796875, + "learning_rate": 2.0935222495670968e-07, + "loss": -0.037, + "reward": -0.09146481472998857, + "reward_std": 0.09883083030581474, + "rewards/cosine_scaled_reward": -0.04573240736499429, + "rewards/format_reward": 0.0, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 1531.6607360839844, + "epoch": 1.596, + "grad_norm": 0.09601892530918121, + "kl": 0.1533203125, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.0006, + "reward": -0.08569015190005302, + "reward_std": 0.0903671607375145, + "rewards/cosine_scaled_reward": -0.04284507688134909, + "rewards/format_reward": 0.0, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.8928833007812, + "epoch": 1.6, + "grad_norm": 0.12060719728469849, + "kl": 0.16943359375, + "learning_rate": 2.0528000059645995e-07, + "loss": -0.0212, + "reward": -0.09470336884260178, + "reward_std": 0.11146636307239532, + "rewards/cosine_scaled_reward": -0.04735168442130089, + "rewards/format_reward": 0.0, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 1523.4940795898438, + "epoch": 1.604, + "grad_norm": 0.10291819274425507, + "kl": 0.1591796875, + "learning_rate": 2.032690407508949e-07, + "loss": -0.0098, + "reward": -0.09152790158987045, + "reward_std": 0.11163719370961189, + "rewards/cosine_scaled_reward": -0.045763951260596514, + "rewards/format_reward": 0.0, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.0952453613281, + "epoch": 1.608, + "grad_norm": 0.13543279469013214, + "kl": 0.1689453125, + "learning_rate": 2.0127498008311922e-07, + "loss": -0.0193, + "reward": -0.08428375516086817, + "reward_std": 0.08265121094882488, + "rewards/cosine_scaled_reward": -0.042141877580434084, + "rewards/format_reward": 0.0, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.3154907226562, + "epoch": 1.612, + "grad_norm": 0.10554395616054535, + "kl": 0.15380859375, + "learning_rate": 1.9929791578083655e-07, + "loss": -0.0233, + "reward": -0.09138609375804663, + "reward_std": 0.09994357451796532, + "rewards/cosine_scaled_reward": -0.04569304594770074, + "rewards/format_reward": 0.0, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 1503.452392578125, + "epoch": 1.616, + "grad_norm": 0.10063979774713516, + "kl": 0.155517578125, + "learning_rate": 1.9733794420337213e-07, + "loss": -0.0392, + "reward": -0.100379329174757, + "reward_std": 0.12372113950550556, + "rewards/cosine_scaled_reward": -0.05018966645002365, + "rewards/format_reward": 0.0, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.607177734375, + "epoch": 1.62, + "grad_norm": 0.090563103556633, + "kl": 0.163818359375, + "learning_rate": 1.9539516087697517e-07, + "loss": -0.0215, + "reward": -0.08255079202353954, + "reward_std": 0.09494246542453766, + "rewards/cosine_scaled_reward": -0.041275396943092346, + "rewards/format_reward": 0.0, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 1524.3988342285156, + "epoch": 1.624, + "grad_norm": 0.10463332384824753, + "kl": 0.154541015625, + "learning_rate": 1.934696604901642e-07, + "loss": -0.0101, + "reward": -0.09653126262128353, + "reward_std": 0.11365084536373615, + "rewards/cosine_scaled_reward": -0.04826563224196434, + "rewards/format_reward": 0.0, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.4345703125, + "epoch": 1.6280000000000001, + "grad_norm": 0.18202035129070282, + "kl": 0.164794921875, + "learning_rate": 1.915615368891117e-07, + "loss": -0.0156, + "reward": -0.10028301551938057, + "reward_std": 0.12200421467423439, + "rewards/cosine_scaled_reward": -0.050141509622335434, + "rewards/format_reward": 0.0, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.7857360839844, + "epoch": 1.6320000000000001, + "grad_norm": 0.10041651129722595, + "kl": 0.1552734375, + "learning_rate": 1.8967088307307e-07, + "loss": -0.0405, + "reward": -0.1025087870657444, + "reward_std": 0.12081354483962059, + "rewards/cosine_scaled_reward": -0.051254394464194775, + "rewards/format_reward": 0.0, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.65478515625, + "epoch": 1.6360000000000001, + "grad_norm": 0.0929652526974678, + "kl": 0.164306640625, + "learning_rate": 1.8779779118983867e-07, + "loss": -0.0336, + "reward": -0.10521730966866016, + "reward_std": 0.12063234858214855, + "rewards/cosine_scaled_reward": -0.05260865669697523, + "rewards/format_reward": 0.0, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 1489.9404907226562, + "epoch": 1.6400000000000001, + "grad_norm": 0.08884437382221222, + "kl": 0.16552734375, + "learning_rate": 1.8594235253127372e-07, + "loss": -0.0292, + "reward": -0.09861567430198193, + "reward_std": 0.10845682211220264, + "rewards/cosine_scaled_reward": -0.04930783715099096, + "rewards/format_reward": 0.0, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.0476379394531, + "epoch": 1.6440000000000001, + "grad_norm": 0.1006086990237236, + "kl": 0.156494140625, + "learning_rate": 1.8410465752883758e-07, + "loss": -0.021, + "reward": -0.09761104919016361, + "reward_std": 0.10258225724101067, + "rewards/cosine_scaled_reward": -0.048805526457726955, + "rewards/format_reward": 0.0, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 1519.90478515625, + "epoch": 1.6480000000000001, + "grad_norm": 0.11515481770038605, + "kl": 0.16943359375, + "learning_rate": 1.822847957491922e-07, + "loss": -0.016, + "reward": -0.08585721254348755, + "reward_std": 0.10039913840591908, + "rewards/cosine_scaled_reward": -0.042928608134388924, + "rewards/format_reward": 0.0, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.357177734375, + "epoch": 1.6520000000000001, + "grad_norm": 0.09629681706428528, + "kl": 0.159423828125, + "learning_rate": 1.804828558898332e-07, + "loss": -0.0283, + "reward": -0.08894845098257065, + "reward_std": 0.10276514105498791, + "rewards/cosine_scaled_reward": -0.04447422595694661, + "rewards/format_reward": 0.0, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.0357360839844, + "epoch": 1.6560000000000001, + "grad_norm": 0.12116753309965134, + "kl": 0.169189453125, + "learning_rate": 1.7869892577476722e-07, + "loss": -0.0245, + "reward": -0.10262815281748772, + "reward_std": 0.12108992040157318, + "rewards/cosine_scaled_reward": -0.051314075477421284, + "rewards/format_reward": 0.0, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 1521.9345397949219, + "epoch": 1.6600000000000001, + "grad_norm": 0.10831650346517563, + "kl": 0.163818359375, + "learning_rate": 1.7693309235023127e-07, + "loss": -0.0085, + "reward": -0.08154256083071232, + "reward_std": 0.09072042256593704, + "rewards/cosine_scaled_reward": -0.04077128041535616, + "rewards/format_reward": 0.0, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.0059814453125, + "epoch": 1.6640000000000001, + "grad_norm": 0.10683077573776245, + "kl": 0.175537109375, + "learning_rate": 1.7518544168045524e-07, + "loss": -0.0242, + "reward": -0.1116462592035532, + "reward_std": 0.10574496164917946, + "rewards/cosine_scaled_reward": -0.05582312494516373, + "rewards/format_reward": 0.0, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.3393249511719, + "epoch": 1.6680000000000001, + "grad_norm": 0.14900319278240204, + "kl": 0.1650390625, + "learning_rate": 1.7345605894346726e-07, + "loss": -0.021, + "reward": -0.08745052106678486, + "reward_std": 0.11200828477740288, + "rewards/cosine_scaled_reward": -0.04372526053339243, + "rewards/format_reward": 0.0, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 1523.3690490722656, + "epoch": 1.6720000000000002, + "grad_norm": 0.11940804123878479, + "kl": 0.154541015625, + "learning_rate": 1.7174502842694212e-07, + "loss": -0.0124, + "reward": -0.0070722997188568115, + "reward_std": 0.09923059120774269, + "rewards/cosine_scaled_reward": -0.0035361526533961296, + "rewards/format_reward": 0.0, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 1485.9583435058594, + "epoch": 1.6760000000000002, + "grad_norm": 0.08529967814683914, + "kl": 0.172607421875, + "learning_rate": 1.7005243352409333e-07, + "loss": -0.0651, + "reward": -0.04097301326692104, + "reward_std": 0.15967968851327896, + "rewards/cosine_scaled_reward": -0.02048650663346052, + "rewards/format_reward": 0.0, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.0059814453125, + "epoch": 1.6800000000000002, + "grad_norm": 0.0939546748995781, + "kl": 0.16015625, + "learning_rate": 1.6837835672960831e-07, + "loss": -0.0345, + "reward": -0.091935895383358, + "reward_std": 0.11023806594312191, + "rewards/cosine_scaled_reward": -0.04596794489771128, + "rewards/format_reward": 0.0, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.7500305175781, + "epoch": 1.6840000000000002, + "grad_norm": 0.114561066031456, + "kl": 0.17724609375, + "learning_rate": 1.6672287963562852e-07, + "loss": -0.0193, + "reward": -0.07856714259833097, + "reward_std": 0.08897042460739613, + "rewards/cosine_scaled_reward": -0.03928357409313321, + "rewards/format_reward": 0.0, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 1496.7261962890625, + "epoch": 1.688, + "grad_norm": 0.11227195709943771, + "kl": 0.160888671875, + "learning_rate": 1.6508608292777203e-07, + "loss": -0.0359, + "reward": -0.09738295152783394, + "reward_std": 0.11914198100566864, + "rewards/cosine_scaled_reward": -0.048691474832594395, + "rewards/format_reward": 0.0, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.1964416503906, + "epoch": 1.692, + "grad_norm": 0.13162577152252197, + "kl": 0.181640625, + "learning_rate": 1.6346804638120098e-07, + "loss": -0.0245, + "reward": -0.07754436880350113, + "reward_std": 0.10734674707055092, + "rewards/cosine_scaled_reward": -0.0387721830047667, + "rewards/format_reward": 0.0, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.1012268066406, + "epoch": 1.696, + "grad_norm": 0.10524528473615646, + "kl": 0.164306640625, + "learning_rate": 1.6186884885673413e-07, + "loss": -0.024, + "reward": -0.08680723141878843, + "reward_std": 0.0982758505269885, + "rewards/cosine_scaled_reward": -0.04340361384674907, + "rewards/format_reward": 0.0, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.3154907226562, + "epoch": 1.7, + "grad_norm": 0.10563742369413376, + "kl": 0.16015625, + "learning_rate": 1.6028856829700258e-07, + "loss": -0.0092, + "reward": -0.08459902927279472, + "reward_std": 0.09910181537270546, + "rewards/cosine_scaled_reward": -0.04229951370507479, + "rewards/format_reward": 0.0, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 1520.6785888671875, + "epoch": 1.704, + "grad_norm": 0.08786718547344208, + "kl": 0.161865234375, + "learning_rate": 1.5872728172265146e-07, + "loss": -0.0165, + "reward": -0.07912362925708294, + "reward_std": 0.08175937831401825, + "rewards/cosine_scaled_reward": -0.03956181462854147, + "rewards/format_reward": 0.0, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.9762573242188, + "epoch": 1.708, + "grad_norm": 0.08499140292406082, + "kl": 0.15625, + "learning_rate": 1.5718506522858572e-07, + "loss": -0.0364, + "reward": -0.0896658506244421, + "reward_std": 0.10279479995369911, + "rewards/cosine_scaled_reward": -0.0448329309001565, + "rewards/format_reward": 0.0, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.3035583496094, + "epoch": 1.712, + "grad_norm": 0.08925153315067291, + "kl": 0.14453125, + "learning_rate": 1.5566199398026147e-07, + "loss": -0.0309, + "reward": -0.09496857039630413, + "reward_std": 0.1123510580509901, + "rewards/cosine_scaled_reward": -0.04748428799211979, + "rewards/format_reward": 0.0, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 1523.2916870117188, + "epoch": 1.716, + "grad_norm": 0.10608566552400589, + "kl": 0.167724609375, + "learning_rate": 1.5415814221002265e-07, + "loss": -0.0113, + "reward": -0.09526684321463108, + "reward_std": 0.0988641269505024, + "rewards/cosine_scaled_reward": -0.047633420675992966, + "rewards/format_reward": 0.0, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.2261962890625, + "epoch": 1.72, + "grad_norm": 0.10655763745307922, + "kl": 0.16162109375, + "learning_rate": 1.5267358321348285e-07, + "loss": -0.0414, + "reward": -0.10186839010566473, + "reward_std": 0.13254049234092236, + "rewards/cosine_scaled_reward": -0.05093420064076781, + "rewards/format_reward": 0.0, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 1528.0476379394531, + "epoch": 1.724, + "grad_norm": 0.1068165972828865, + "kl": 0.15673828125, + "learning_rate": 1.5120838934595337e-07, + "loss": -0.0041, + "reward": -0.09541826322674751, + "reward_std": 0.1032972726970911, + "rewards/cosine_scaled_reward": -0.04770912975072861, + "rewards/format_reward": 0.0, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.4583435058594, + "epoch": 1.728, + "grad_norm": 0.12408644706010818, + "kl": 0.16796875, + "learning_rate": 1.4976263201891613e-07, + "loss": -0.013, + "reward": -0.08289302699267864, + "reward_std": 0.08351449854671955, + "rewards/cosine_scaled_reward": -0.041446512565016747, + "rewards/format_reward": 0.0, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 1497.9107360839844, + "epoch": 1.732, + "grad_norm": 0.07734204828739166, + "kl": 0.173095703125, + "learning_rate": 1.483363816965435e-07, + "loss": -0.0391, + "reward": -0.10260258801281452, + "reward_std": 0.12828159891068935, + "rewards/cosine_scaled_reward": -0.051301293075084686, + "rewards/format_reward": 0.0, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.7559814453125, + "epoch": 1.736, + "grad_norm": 0.12413759529590607, + "kl": 0.17919921875, + "learning_rate": 1.469297078922642e-07, + "loss": -0.0244, + "reward": -0.015836404636502266, + "reward_std": 0.09649943746626377, + "rewards/cosine_scaled_reward": -0.007918204180896282, + "rewards/format_reward": 0.0, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.1190795898438, + "epoch": 1.74, + "grad_norm": 0.12912577390670776, + "kl": 0.16650390625, + "learning_rate": 1.4554267916537495e-07, + "loss": -0.0112, + "reward": -0.08571217954158783, + "reward_std": 0.10151237808167934, + "rewards/cosine_scaled_reward": -0.042856089770793915, + "rewards/format_reward": 0.0, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 1474.952392578125, + "epoch": 1.744, + "grad_norm": 0.08318183571100235, + "kl": 0.1689453125, + "learning_rate": 1.4417536311769885e-07, + "loss": -0.0637, + "reward": -0.09841407462954521, + "reward_std": 0.1172296404838562, + "rewards/cosine_scaled_reward": -0.049207039177417755, + "rewards/format_reward": 0.0, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 1481.3095397949219, + "epoch": 1.748, + "grad_norm": 0.0786975845694542, + "kl": 0.156005859375, + "learning_rate": 1.4282782639029128e-07, + "loss": -0.0386, + "reward": -0.08532883040606976, + "reward_std": 0.09728906117379665, + "rewards/cosine_scaled_reward": -0.04266441613435745, + "rewards/format_reward": 0.0, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 1500.7440490722656, + "epoch": 1.752, + "grad_norm": 0.0900636538863182, + "kl": 0.161376953125, + "learning_rate": 1.4150013466019114e-07, + "loss": -0.0316, + "reward": -0.08545132167637348, + "reward_std": 0.10151121858507395, + "rewards/cosine_scaled_reward": -0.042725661769509315, + "rewards/format_reward": 0.0, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.9404907226562, + "epoch": 1.756, + "grad_norm": 0.11020209640264511, + "kl": 0.1640625, + "learning_rate": 1.4019235263722034e-07, + "loss": -0.0259, + "reward": -0.08197178691625595, + "reward_std": 0.09423052612692118, + "rewards/cosine_scaled_reward": -0.040985893458127975, + "rewards/format_reward": 0.0, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 1517.482177734375, + "epoch": 1.76, + "grad_norm": 0.08999020606279373, + "kl": 0.165283203125, + "learning_rate": 1.3890454406082956e-07, + "loss": -0.017, + "reward": -0.07763329334557056, + "reward_std": 0.08629796095192432, + "rewards/cosine_scaled_reward": -0.03881664574146271, + "rewards/format_reward": 0.0, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.8511962890625, + "epoch": 1.764, + "grad_norm": 0.13127504289150238, + "kl": 0.15234375, + "learning_rate": 1.3763677169699217e-07, + "loss": -0.0232, + "reward": -0.08330708928406239, + "reward_std": 0.09235509857535362, + "rewards/cosine_scaled_reward": -0.04165354464203119, + "rewards/format_reward": 0.0, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.7619323730469, + "epoch": 1.768, + "grad_norm": 0.14613445103168488, + "kl": 0.152099609375, + "learning_rate": 1.3638909733514452e-07, + "loss": -0.0284, + "reward": -0.09447834640741348, + "reward_std": 0.09266001731157303, + "rewards/cosine_scaled_reward": -0.04723917320370674, + "rewards/format_reward": 0.0, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 1516.875, + "epoch": 1.772, + "grad_norm": 0.18538497388362885, + "kl": 0.15966796875, + "learning_rate": 1.351615817851748e-07, + "loss": -0.0153, + "reward": -0.08249685540795326, + "reward_std": 0.09769860841333866, + "rewards/cosine_scaled_reward": -0.04124843003228307, + "rewards/format_reward": 0.0, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 1501.8155212402344, + "epoch": 1.776, + "grad_norm": 0.1319953352212906, + "kl": 0.155029296875, + "learning_rate": 1.3395428487445914e-07, + "loss": -0.039, + "reward": -0.09754344820976257, + "reward_std": 0.11035412549972534, + "rewards/cosine_scaled_reward": -0.04877172317355871, + "rewards/format_reward": 0.0, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.9821472167969, + "epoch": 1.78, + "grad_norm": 0.1029873788356781, + "kl": 0.1484375, + "learning_rate": 1.3276726544494571e-07, + "loss": -0.0251, + "reward": -0.0899391695857048, + "reward_std": 0.10835397988557816, + "rewards/cosine_scaled_reward": -0.04496958386152983, + "rewards/format_reward": 0.0, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 1469.40478515625, + "epoch": 1.784, + "grad_norm": 0.08266568928956985, + "kl": 0.158935546875, + "learning_rate": 1.316005813502869e-07, + "loss": -0.0788, + "reward": -0.10532401315867901, + "reward_std": 0.12493490241467953, + "rewards/cosine_scaled_reward": -0.05266200751066208, + "rewards/format_reward": 0.0, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 1505.5952453613281, + "epoch": 1.788, + "grad_norm": 0.13063663244247437, + "kl": 0.15576171875, + "learning_rate": 1.3045428945301953e-07, + "loss": -0.0317, + "reward": -0.022020583972334862, + "reward_std": 0.1138888020068407, + "rewards/cosine_scaled_reward": -0.011010290123522282, + "rewards/format_reward": 0.0, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 1493.7142944335938, + "epoch": 1.792, + "grad_norm": 0.1297776997089386, + "kl": 0.17236328125, + "learning_rate": 1.2932844562179352e-07, + "loss": -0.0294, + "reward": -0.08543841261416674, + "reward_std": 0.09460597112774849, + "rewards/cosine_scaled_reward": -0.042719203513115644, + "rewards/format_reward": 0.0, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 1521.1071472167969, + "epoch": 1.796, + "grad_norm": 0.1407863050699234, + "kl": 0.16357421875, + "learning_rate": 1.2822310472864885e-07, + "loss": -0.0138, + "reward": -0.10117548704147339, + "reward_std": 0.12595792300999165, + "rewards/cosine_scaled_reward": -0.05058774631470442, + "rewards/format_reward": 0.0, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.1964721679688, + "epoch": 1.8, + "grad_norm": 0.16165214776992798, + "kl": 0.1787109375, + "learning_rate": 1.2713832064634125e-07, + "loss": -0.0099, + "reward": -0.08527638856321573, + "reward_std": 0.09594122413545847, + "rewards/cosine_scaled_reward": -0.04263819335028529, + "rewards/format_reward": 0.0, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.7202758789062, + "epoch": 1.804, + "grad_norm": 0.1142469272017479, + "kl": 0.163818359375, + "learning_rate": 1.260741462457165e-07, + "loss": -0.023, + "reward": -0.09475222788751125, + "reward_std": 0.10312853008508682, + "rewards/cosine_scaled_reward": -0.04737611673772335, + "rewards/format_reward": 0.0, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 1527.5654907226562, + "epoch": 1.808, + "grad_norm": 0.1640588343143463, + "kl": 0.171142578125, + "learning_rate": 1.2503063339313356e-07, + "loss": -0.0028, + "reward": -0.09184761717915535, + "reward_std": 0.09996213018894196, + "rewards/cosine_scaled_reward": -0.0459238076582551, + "rewards/format_reward": 0.0, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 1493.8036193847656, + "epoch": 1.812, + "grad_norm": 0.07719198614358902, + "kl": 0.177978515625, + "learning_rate": 1.2400783294793668e-07, + "loss": -0.0428, + "reward": -0.10916751623153687, + "reward_std": 0.12887151166796684, + "rewards/cosine_scaled_reward": -0.054583752527832985, + "rewards/format_reward": 0.0, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 1504.3631286621094, + "epoch": 1.8159999999999998, + "grad_norm": 0.12536108493804932, + "kl": 0.152587890625, + "learning_rate": 1.2300579475997657e-07, + "loss": -0.038, + "reward": -0.1067353542894125, + "reward_std": 0.12835400737822056, + "rewards/cosine_scaled_reward": -0.05336767714470625, + "rewards/format_reward": 0.0, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.77978515625, + "epoch": 1.8199999999999998, + "grad_norm": 0.1128176897764206, + "kl": 0.16259765625, + "learning_rate": 1.220245676671809e-07, + "loss": -0.0177, + "reward": -0.10911162942647934, + "reward_std": 0.13343517668545246, + "rewards/cosine_scaled_reward": -0.054555815644562244, + "rewards/format_reward": 0.0, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 1536.0, + "epoch": 1.8239999999999998, + "grad_norm": 0.13748064637184143, + "kl": 0.172119140625, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.0069, + "reward": -0.07034523971378803, + "reward_std": 0.07846208661794662, + "rewards/cosine_scaled_reward": -0.03517262078821659, + "rewards/format_reward": 0.0, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 1492.8690490722656, + "epoch": 1.8279999999999998, + "grad_norm": 0.1389494091272354, + "kl": 0.152099609375, + "learning_rate": 1.2012473704494537e-07, + "loss": -0.0455, + "reward": -0.11908807791769505, + "reward_std": 0.13069945573806763, + "rewards/cosine_scaled_reward": -0.059544037096202374, + "rewards/format_reward": 0.0, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 1499.9464721679688, + "epoch": 1.8319999999999999, + "grad_norm": 0.1537049114704132, + "kl": 0.1683349609375, + "learning_rate": 1.1920622611056974e-07, + "loss": -0.0307, + "reward": -0.08774650190025568, + "reward_std": 0.10440967045724392, + "rewards/cosine_scaled_reward": -0.043873251881450415, + "rewards/format_reward": 0.0, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 1524.2083435058594, + "epoch": 1.8359999999999999, + "grad_norm": 0.13081440329551697, + "kl": 0.176513671875, + "learning_rate": 1.1830871145697412e-07, + "loss": -0.0066, + "reward": -0.08506089821457863, + "reward_std": 0.09712946228682995, + "rewards/cosine_scaled_reward": -0.04253045003861189, + "rewards/format_reward": 0.0, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.9583435058594, + "epoch": 1.8399999999999999, + "grad_norm": 0.09031596034765244, + "kl": 0.154296875, + "learning_rate": 1.1743223682775649e-07, + "loss": -0.0358, + "reward": -0.09923446178436279, + "reward_std": 0.11484255269169807, + "rewards/cosine_scaled_reward": -0.049617230892181396, + "rewards/format_reward": 0.0, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.5059509277344, + "epoch": 1.8439999999999999, + "grad_norm": 0.15131881833076477, + "kl": 0.161376953125, + "learning_rate": 1.1657684494105386e-07, + "loss": -0.0215, + "reward": -0.09375773929059505, + "reward_std": 0.1126671563833952, + "rewards/cosine_scaled_reward": -0.04687886871397495, + "rewards/format_reward": 0.0, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.2678833007812, + "epoch": 1.8479999999999999, + "grad_norm": 0.08426119387149811, + "kl": 0.15576171875, + "learning_rate": 1.1574257748745986e-07, + "loss": -0.0228, + "reward": -0.09476478770375252, + "reward_std": 0.10140549577772617, + "rewards/cosine_scaled_reward": -0.04738239198923111, + "rewards/format_reward": 0.0, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 1512.8333435058594, + "epoch": 1.8519999999999999, + "grad_norm": 0.08592584729194641, + "kl": 0.17333984375, + "learning_rate": 1.1492947512799328e-07, + "loss": -0.0256, + "reward": -0.011012900620698929, + "reward_std": 0.08133355341851711, + "rewards/cosine_scaled_reward": -0.005506448447704315, + "rewards/format_reward": 0.0, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.3511962890625, + "epoch": 1.8559999999999999, + "grad_norm": 0.1047179102897644, + "kl": 0.17529296875, + "learning_rate": 1.1413757749211602e-07, + "loss": -0.0208, + "reward": -0.08088574931025505, + "reward_std": 0.09925234131515026, + "rewards/cosine_scaled_reward": -0.040442874655127525, + "rewards/format_reward": 0.0, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.4642944335938, + "epoch": 1.8599999999999999, + "grad_norm": 0.1791323721408844, + "kl": 0.177734375, + "learning_rate": 1.1336692317580158e-07, + "loss": -0.0299, + "reward": -0.09140351600944996, + "reward_std": 0.1100204586982727, + "rewards/cosine_scaled_reward": -0.045701757073402405, + "rewards/format_reward": 0.0, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.0535583496094, + "epoch": 1.8639999999999999, + "grad_norm": 0.13129960000514984, + "kl": 0.1676025390625, + "learning_rate": 1.1261754973965422e-07, + "loss": -0.0129, + "reward": -0.08949675410985947, + "reward_std": 0.09439942799508572, + "rewards/cosine_scaled_reward": -0.04474837705492973, + "rewards/format_reward": 0.0, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 1484.0535888671875, + "epoch": 1.8679999999999999, + "grad_norm": 0.13218103349208832, + "kl": 0.15966796875, + "learning_rate": 1.1188949370707787e-07, + "loss": -0.0593, + "reward": -0.11925767548382282, + "reward_std": 0.15814346075057983, + "rewards/cosine_scaled_reward": -0.05962884332984686, + "rewards/format_reward": 0.0, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 1496.4702453613281, + "epoch": 1.8719999999999999, + "grad_norm": 0.09863686561584473, + "kl": 0.146484375, + "learning_rate": 1.1118279056249653e-07, + "loss": -0.0405, + "reward": -0.0985277071595192, + "reward_std": 0.11279423907399178, + "rewards/cosine_scaled_reward": -0.04926385171711445, + "rewards/format_reward": 0.0, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.8869323730469, + "epoch": 1.876, + "grad_norm": 0.09514996409416199, + "kl": 0.158203125, + "learning_rate": 1.1049747474962444e-07, + "loss": -0.0164, + "reward": -0.08131754398345947, + "reward_std": 0.0914676021784544, + "rewards/cosine_scaled_reward": -0.04065877292305231, + "rewards/format_reward": 0.0, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 1525.4464416503906, + "epoch": 1.88, + "grad_norm": 0.11126792430877686, + "kl": 0.158447265625, + "learning_rate": 1.0983357966978745e-07, + "loss": -0.0064, + "reward": -0.0882963128387928, + "reward_std": 0.09903069026768208, + "rewards/cosine_scaled_reward": -0.04414815828204155, + "rewards/format_reward": 0.0, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 1502.3392944335938, + "epoch": 1.884, + "grad_norm": 0.12479417026042938, + "kl": 0.168701171875, + "learning_rate": 1.0919113768029517e-07, + "loss": -0.0409, + "reward": -0.07126700505614281, + "reward_std": 0.08810876682400703, + "rewards/cosine_scaled_reward": -0.035633502528071404, + "rewards/format_reward": 0.0, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 1523.3511962890625, + "epoch": 1.888, + "grad_norm": 0.15230634808540344, + "kl": 0.1669921875, + "learning_rate": 1.0857018009286381e-07, + "loss": -0.0107, + "reward": -0.07650433294475079, + "reward_std": 0.09276540018618107, + "rewards/cosine_scaled_reward": -0.03825216554105282, + "rewards/format_reward": 0.0, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 1529.2916564941406, + "epoch": 1.892, + "grad_norm": 0.10235889256000519, + "kl": 0.163818359375, + "learning_rate": 1.0797073717209013e-07, + "loss": -0.0031, + "reward": -0.08031682576984167, + "reward_std": 0.08574636466801167, + "rewards/cosine_scaled_reward": -0.0401584105566144, + "rewards/format_reward": 0.0, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 1514.4643249511719, + "epoch": 1.896, + "grad_norm": 0.24448621273040771, + "kl": 0.17724609375, + "learning_rate": 1.0739283813397639e-07, + "loss": -0.0168, + "reward": -0.09807473048567772, + "reward_std": 0.11232626810669899, + "rewards/cosine_scaled_reward": -0.049037366174161434, + "rewards/format_reward": 0.0, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.8155212402344, + "epoch": 1.9, + "grad_norm": 0.11423542350530624, + "kl": 0.15625, + "learning_rate": 1.068365111445064e-07, + "loss": -0.0138, + "reward": -0.09042352437973022, + "reward_std": 0.10206466354429722, + "rewards/cosine_scaled_reward": -0.04521176405251026, + "rewards/format_reward": 0.0, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.4404907226562, + "epoch": 1.904, + "grad_norm": 0.1770845502614975, + "kl": 0.150390625, + "learning_rate": 1.063017833182728e-07, + "loss": -0.0315, + "reward": -0.1031611617654562, + "reward_std": 0.11722332611680031, + "rewards/cosine_scaled_reward": -0.051580581814050674, + "rewards/format_reward": 0.0, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 1512.5178527832031, + "epoch": 1.908, + "grad_norm": 0.1035989373922348, + "kl": 0.155517578125, + "learning_rate": 1.0578868071715544e-07, + "loss": -0.0263, + "reward": -0.10269530303776264, + "reward_std": 0.13116441946476698, + "rewards/cosine_scaled_reward": -0.05134765151888132, + "rewards/format_reward": 0.0, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 1506.7083435058594, + "epoch": 1.912, + "grad_norm": 0.1195254847407341, + "kl": 0.162841796875, + "learning_rate": 1.0529722834905125e-07, + "loss": -0.0243, + "reward": -0.10078963078558445, + "reward_std": 0.11876899935305119, + "rewards/cosine_scaled_reward": -0.050394815392792225, + "rewards/format_reward": 0.0, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.0952453613281, + "epoch": 1.916, + "grad_norm": 0.1497546136379242, + "kl": 0.1513671875, + "learning_rate": 1.0482745016665526e-07, + "loss": -0.0204, + "reward": -0.10379143245518208, + "reward_std": 0.11940331198275089, + "rewards/cosine_scaled_reward": -0.05189571529626846, + "rewards/format_reward": 0.0, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.8690490722656, + "epoch": 1.92, + "grad_norm": 0.12198466807603836, + "kl": 0.169189453125, + "learning_rate": 1.0437936906629334e-07, + "loss": -0.0117, + "reward": -0.0045996010303497314, + "reward_std": 0.09211089462041855, + "rewards/cosine_scaled_reward": -0.002299800980836153, + "rewards/format_reward": 0.0, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 1498.5833740234375, + "epoch": 1.924, + "grad_norm": 0.10352538526058197, + "kl": 0.14697265625, + "learning_rate": 1.0395300688680625e-07, + "loss": -0.0433, + "reward": -0.1318805105984211, + "reward_std": 0.15495008416473866, + "rewards/cosine_scaled_reward": -0.06594025250524282, + "rewards/format_reward": 0.0, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 1517.107177734375, + "epoch": 1.928, + "grad_norm": 0.08769793808460236, + "kl": 0.14990234375, + "learning_rate": 1.0354838440848501e-07, + "loss": -0.0207, + "reward": -0.10253190249204636, + "reward_std": 0.121914217248559, + "rewards/cosine_scaled_reward": -0.05126595124602318, + "rewards/format_reward": 0.0, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 1523.0059814453125, + "epoch": 1.932, + "grad_norm": 0.09510423243045807, + "kl": 0.16064453125, + "learning_rate": 1.0316552135205837e-07, + "loss": -0.0073, + "reward": -0.09257967211306095, + "reward_std": 0.09288883674889803, + "rewards/cosine_scaled_reward": -0.0462898388504982, + "rewards/format_reward": 0.0, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 1522.1666870117188, + "epoch": 1.936, + "grad_norm": 0.11582231521606445, + "kl": 0.159912109375, + "learning_rate": 1.0280443637773163e-07, + "loss": -0.013, + "reward": -0.09562139585614204, + "reward_std": 0.10689939372241497, + "rewards/cosine_scaled_reward": -0.04781070165336132, + "rewards/format_reward": 0.0, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 1527.6428527832031, + "epoch": 1.94, + "grad_norm": 0.1510264277458191, + "kl": 0.168701171875, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0057, + "reward": -0.07166448421776295, + "reward_std": 0.06708121951669455, + "rewards/cosine_scaled_reward": -0.03583224397152662, + "rewards/format_reward": 0.0, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.6607360839844, + "epoch": 1.944, + "grad_norm": 0.10827223211526871, + "kl": 0.153076171875, + "learning_rate": 1.0214767000817596e-07, + "loss": -0.0092, + "reward": -0.077840281650424, + "reward_std": 0.09841375425457954, + "rewards/cosine_scaled_reward": -0.038920141756534576, + "rewards/format_reward": 0.0, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.2857360839844, + "epoch": 1.948, + "grad_norm": 0.08925757557153702, + "kl": 0.1376953125, + "learning_rate": 1.0185202062281336e-07, + "loss": -0.0088, + "reward": -0.0860859602689743, + "reward_std": 0.10117382928729057, + "rewards/cosine_scaled_reward": -0.04304297920316458, + "rewards/format_reward": 0.0, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.1130981445312, + "epoch": 1.952, + "grad_norm": 0.1778467744588852, + "kl": 0.160400390625, + "learning_rate": 1.0157821333772304e-07, + "loss": -0.0269, + "reward": -0.08845487236976624, + "reward_std": 0.09980816766619682, + "rewards/cosine_scaled_reward": -0.04422743525356054, + "rewards/format_reward": 0.0, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 1523.482177734375, + "epoch": 1.956, + "grad_norm": 0.09875297546386719, + "kl": 0.162841796875, + "learning_rate": 1.013262614978859e-07, + "loss": -0.0109, + "reward": -0.08624122757464647, + "reward_std": 0.09759997017681599, + "rewards/cosine_scaled_reward": -0.04312061285600066, + "rewards/format_reward": 0.0, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 1488.8929138183594, + "epoch": 1.96, + "grad_norm": 0.10072106122970581, + "kl": 0.16064453125, + "learning_rate": 1.0109617738307911e-07, + "loss": -0.0561, + "reward": -0.1024992810562253, + "reward_std": 0.1395698133856058, + "rewards/cosine_scaled_reward": -0.051249639596790075, + "rewards/format_reward": 0.0, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 1507.6190795898438, + "epoch": 1.964, + "grad_norm": 0.14594227075576782, + "kl": 0.170166015625, + "learning_rate": 1.0088797220727779e-07, + "loss": -0.0329, + "reward": -0.09847836010158062, + "reward_std": 0.12085962668061256, + "rewards/cosine_scaled_reward": -0.049239179119467735, + "rewards/format_reward": 0.0, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 1515.52978515625, + "epoch": 1.968, + "grad_norm": 0.1781827211380005, + "kl": 0.171875, + "learning_rate": 1.0070165611810855e-07, + "loss": -0.0183, + "reward": -0.09458879381418228, + "reward_std": 0.10490395873785019, + "rewards/cosine_scaled_reward": -0.047294397838413715, + "rewards/format_reward": 0.0, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 1510.4464416503906, + "epoch": 1.972, + "grad_norm": 0.14723263680934906, + "kl": 0.166259765625, + "learning_rate": 1.005372381963547e-07, + "loss": -0.0245, + "reward": -0.08563583716750145, + "reward_std": 0.09580126218497753, + "rewards/cosine_scaled_reward": -0.04281791765242815, + "rewards/format_reward": 0.0, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.9822082519531, + "epoch": 1.976, + "grad_norm": 0.08827134966850281, + "kl": 0.171142578125, + "learning_rate": 1.0039472645551372e-07, + "loss": -0.0314, + "reward": -0.09158815257251263, + "reward_std": 0.11639940552413464, + "rewards/cosine_scaled_reward": -0.04579407814890146, + "rewards/format_reward": 0.0, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 1508.8095397949219, + "epoch": 1.98, + "grad_norm": 0.09312310069799423, + "kl": 0.15380859375, + "learning_rate": 1.002741278414069e-07, + "loss": -0.0125, + "reward": -0.08922230452299118, + "reward_std": 0.09334707166999578, + "rewards/cosine_scaled_reward": -0.044611155055463314, + "rewards/format_reward": 0.0, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 1511.452392578125, + "epoch": 1.984, + "grad_norm": 0.10467734187841415, + "kl": 0.14697265625, + "learning_rate": 1.0017544823184055e-07, + "loss": -0.0302, + "reward": -0.09572159126400948, + "reward_std": 0.11608831025660038, + "rewards/cosine_scaled_reward": -0.04786079656332731, + "rewards/format_reward": 0.0, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 1488.7262573242188, + "epoch": 1.988, + "grad_norm": 0.11462484300136566, + "kl": 0.152099609375, + "learning_rate": 1.0009869243631952e-07, + "loss": -0.0413, + "reward": -0.10070546343922615, + "reward_std": 0.12976408563554287, + "rewards/cosine_scaled_reward": -0.05035272892564535, + "rewards/format_reward": 0.0, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.2143249511719, + "epoch": 1.992, + "grad_norm": 0.09213641285896301, + "kl": 0.135498046875, + "learning_rate": 1.000438641958131e-07, + "loss": -0.0224, + "reward": -0.09420822747051716, + "reward_std": 0.1108301505446434, + "rewards/cosine_scaled_reward": -0.04710411373525858, + "rewards/format_reward": 0.0, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 1523.3511962890625, + "epoch": 1.996, + "grad_norm": 0.10646738111972809, + "kl": 0.1552734375, + "learning_rate": 1.0001096618257236e-07, + "loss": -0.0112, + "reward": -0.10537549015134573, + "reward_std": 0.11287760734558105, + "rewards/cosine_scaled_reward": -0.052687746938318014, + "rewards/format_reward": 0.0, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 1513.0893859863281, + "epoch": 2.0, + "grad_norm": 0.12409133464097977, + "kl": 0.166015625, + "learning_rate": 1e-07, + "loss": -0.0134, + "reward": -0.08198001235723495, + "reward_std": 0.0940225888043642, + "rewards/cosine_scaled_reward": -0.04099000431597233, + "rewards/format_reward": 0.0, + "step": 500 + }, + { + "epoch": 2.0, + "step": 500, + "total_flos": 0.0, + "train_loss": -0.028187389324251855, + "train_runtime": 65552.2916, + "train_samples_per_second": 1.281, + "train_steps_per_second": 0.008 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}