Litespark-1.5B-IFT-Math-Openrs / trainer_state.json
advaithc's picture
Model save
63b1664 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 1502.0476379394531,
"epoch": 0.004,
"grad_norm": 0.17851164937019348,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": -0.0206,
"reward": -0.23460307717323303,
"reward_std": 0.13429159671068192,
"rewards/cosine_scaled_reward": -0.11730154044926167,
"rewards/format_reward": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 1483.3928833007812,
"epoch": 0.008,
"grad_norm": 0.20364968478679657,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": -0.0247,
"reward": -0.24404804036021233,
"reward_std": 0.15527689084410667,
"rewards/cosine_scaled_reward": -0.12202401272952557,
"rewards/format_reward": 0.0,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 1498.7857360839844,
"epoch": 0.012,
"grad_norm": 0.2454664260149002,
"kl": -2.008676528930664e-05,
"learning_rate": 6e-08,
"loss": -0.024,
"reward": -0.2683428265154362,
"reward_std": 0.1479046531021595,
"rewards/cosine_scaled_reward": -0.13417141698300838,
"rewards/format_reward": 0.0,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 1531.5595397949219,
"epoch": 0.016,
"grad_norm": 0.26290684938430786,
"kl": -1.3932585716247559e-06,
"learning_rate": 8e-08,
"loss": -0.0063,
"reward": -0.22999461740255356,
"reward_std": 0.13789904117584229,
"rewards/cosine_scaled_reward": -0.11499731056392193,
"rewards/format_reward": 0.0,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 1496.1845703125,
"epoch": 0.02,
"grad_norm": 0.25596341490745544,
"kl": -5.0514936447143555e-06,
"learning_rate": 1e-07,
"loss": -0.0425,
"reward": -0.23508117347955704,
"reward_std": 0.15481781959533691,
"rewards/cosine_scaled_reward": -0.11754059046506882,
"rewards/format_reward": 0.0,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 1498.8690795898438,
"epoch": 0.024,
"grad_norm": 0.24606633186340332,
"kl": -1.0028481483459473e-05,
"learning_rate": 1.2e-07,
"loss": -0.0007,
"reward": -0.25243763625621796,
"reward_std": 0.13587487116456032,
"rewards/cosine_scaled_reward": -0.12621882185339928,
"rewards/format_reward": 0.0,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 1527.5238037109375,
"epoch": 0.028,
"grad_norm": 0.30039116740226746,
"kl": 3.2372772693634033e-06,
"learning_rate": 1.4e-07,
"loss": 0.0092,
"reward": -0.221938356757164,
"reward_std": 0.13823154009878635,
"rewards/cosine_scaled_reward": -0.11096917279064655,
"rewards/format_reward": 0.0,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 1509.2143249511719,
"epoch": 0.032,
"grad_norm": 0.2639496624469757,
"kl": -3.641587682068348e-06,
"learning_rate": 1.6e-07,
"loss": -0.035,
"reward": -0.27191318944096565,
"reward_std": 0.15618360042572021,
"rewards/cosine_scaled_reward": -0.13595658540725708,
"rewards/format_reward": 0.0,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 1510.0535888671875,
"epoch": 0.036,
"grad_norm": 0.21626578271389008,
"kl": 7.82310962677002e-08,
"learning_rate": 1.8e-07,
"loss": -0.0261,
"reward": -0.24237940087914467,
"reward_std": 0.15605646930634975,
"rewards/cosine_scaled_reward": -0.12118970789015293,
"rewards/format_reward": 0.0,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 1513.0357360839844,
"epoch": 0.04,
"grad_norm": 0.26584091782569885,
"kl": -6.256159394979477e-06,
"learning_rate": 2e-07,
"loss": -0.0261,
"reward": -0.24382107332348824,
"reward_std": 0.147519638761878,
"rewards/cosine_scaled_reward": -0.12191054411232471,
"rewards/format_reward": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 1508.6190795898438,
"epoch": 0.044,
"grad_norm": 0.26533427834510803,
"kl": -5.21540641784668e-06,
"learning_rate": 2.1999999999999998e-07,
"loss": -0.0136,
"reward": -0.26315416768193245,
"reward_std": 0.14432235062122345,
"rewards/cosine_scaled_reward": -0.13157708384096622,
"rewards/format_reward": 0.0,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 1514.6785888671875,
"epoch": 0.048,
"grad_norm": 0.23109059035778046,
"kl": 4.7460198402404785e-06,
"learning_rate": 2.4e-07,
"loss": -0.0225,
"reward": -0.2342548444867134,
"reward_std": 0.1461905539035797,
"rewards/cosine_scaled_reward": -0.117127425968647,
"rewards/format_reward": 0.0,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 1486.6726379394531,
"epoch": 0.052,
"grad_norm": 0.18697038292884827,
"kl": 3.1348317861557007e-06,
"learning_rate": 2.6e-07,
"loss": -0.0475,
"reward": -0.23411722108721733,
"reward_std": 0.16247618943452835,
"rewards/cosine_scaled_reward": -0.11705861054360867,
"rewards/format_reward": 0.0,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 1493.9166870117188,
"epoch": 0.056,
"grad_norm": 0.26729440689086914,
"kl": -6.791204214096069e-06,
"learning_rate": 2.8e-07,
"loss": -0.0147,
"reward": -0.2375863455235958,
"reward_std": 0.1451248899102211,
"rewards/cosine_scaled_reward": -0.1187931690365076,
"rewards/format_reward": 0.0,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 1493.3392944335938,
"epoch": 0.06,
"grad_norm": 0.24526046216487885,
"kl": 1.4476478099822998e-05,
"learning_rate": 3e-07,
"loss": -0.0512,
"reward": -0.2283475622534752,
"reward_std": 0.1646866388618946,
"rewards/cosine_scaled_reward": -0.11417377926409245,
"rewards/format_reward": 0.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 1495.8809814453125,
"epoch": 0.064,
"grad_norm": 0.21982906758785248,
"kl": 7.815659046173096e-06,
"learning_rate": 3.2e-07,
"loss": -0.033,
"reward": -0.229737039655447,
"reward_std": 0.1594039984047413,
"rewards/cosine_scaled_reward": -0.1148685198277235,
"rewards/format_reward": 0.0,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 1471.8869018554688,
"epoch": 0.068,
"grad_norm": 0.32592612504959106,
"kl": 9.292736649513245e-06,
"learning_rate": 3.4000000000000003e-07,
"loss": -0.0491,
"reward": -0.22187525033950806,
"reward_std": 0.15859584510326385,
"rewards/cosine_scaled_reward": -0.11093762516975403,
"rewards/format_reward": 0.0,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 1505.4702453613281,
"epoch": 0.072,
"grad_norm": 0.27544769644737244,
"kl": 6.070360541343689e-06,
"learning_rate": 3.6e-07,
"loss": -0.0094,
"reward": -0.2488204501569271,
"reward_std": 0.14258970320224762,
"rewards/cosine_scaled_reward": -0.1244102232158184,
"rewards/format_reward": 0.0,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 1519.1428833007812,
"epoch": 0.076,
"grad_norm": 0.2813272774219513,
"kl": 4.976987838745117e-06,
"learning_rate": 3.7999999999999996e-07,
"loss": -0.0174,
"reward": -0.20330505073070526,
"reward_std": 0.12616467103362083,
"rewards/cosine_scaled_reward": -0.10165252350270748,
"rewards/format_reward": 0.0,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 1512.0595397949219,
"epoch": 0.08,
"grad_norm": 0.24715571105480194,
"kl": 1.5944242477416992e-06,
"learning_rate": 4e-07,
"loss": -0.0147,
"reward": -0.22946816682815552,
"reward_std": 0.12867936864495277,
"rewards/cosine_scaled_reward": -0.11473408341407776,
"rewards/format_reward": 0.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1514.6428833007812,
"epoch": 0.084,
"grad_norm": 0.2901928424835205,
"kl": 6.8731606006622314e-06,
"learning_rate": 4.1999999999999995e-07,
"loss": -0.0034,
"reward": -0.25839560478925705,
"reward_std": 0.13489549793303013,
"rewards/cosine_scaled_reward": -0.12919779494404793,
"rewards/format_reward": 0.0,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 1488.4642944335938,
"epoch": 0.088,
"grad_norm": 0.2510276436805725,
"kl": 8.165836334228516e-06,
"learning_rate": 4.3999999999999997e-07,
"loss": -0.0203,
"reward": -0.26031654328107834,
"reward_std": 0.148418840020895,
"rewards/cosine_scaled_reward": -0.13015827164053917,
"rewards/format_reward": 0.0,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 1509.4761962890625,
"epoch": 0.092,
"grad_norm": 0.21568480134010315,
"kl": 3.085937350988388e-06,
"learning_rate": 4.6e-07,
"loss": -0.029,
"reward": -0.25878410786390305,
"reward_std": 0.1478017084300518,
"rewards/cosine_scaled_reward": -0.12939205765724182,
"rewards/format_reward": 0.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 1522.6667175292969,
"epoch": 0.096,
"grad_norm": 0.20768144726753235,
"kl": 5.729496479034424e-06,
"learning_rate": 4.8e-07,
"loss": -0.0056,
"reward": -0.23294677585363388,
"reward_std": 0.12670473381876945,
"rewards/cosine_scaled_reward": -0.11647338047623634,
"rewards/format_reward": 0.0,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1505.5059814453125,
"epoch": 0.1,
"grad_norm": 0.20974963903427124,
"kl": 4.060566425323486e-06,
"learning_rate": 5e-07,
"loss": -0.0228,
"reward": -0.23598218336701393,
"reward_std": 0.13139526918530464,
"rewards/cosine_scaled_reward": -0.11799109354615211,
"rewards/format_reward": 0.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 1503.90478515625,
"epoch": 0.104,
"grad_norm": 0.31668928265571594,
"kl": 5.900859832763672e-06,
"learning_rate": 5.2e-07,
"loss": -0.0143,
"reward": -0.2467576116323471,
"reward_std": 0.15891429036855698,
"rewards/cosine_scaled_reward": -0.12337880209088326,
"rewards/format_reward": 0.0,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 1499.4166870117188,
"epoch": 0.108,
"grad_norm": 0.2661290764808655,
"kl": 1.9058585166931152e-05,
"learning_rate": 5.4e-07,
"loss": -0.0059,
"reward": -0.2198324091732502,
"reward_std": 0.13804786279797554,
"rewards/cosine_scaled_reward": -0.10991620272397995,
"rewards/format_reward": 0.0,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 1494.3929138183594,
"epoch": 0.112,
"grad_norm": 0.24837514758110046,
"kl": 1.190975308418274e-05,
"learning_rate": 5.6e-07,
"loss": -0.0358,
"reward": -0.2488894909620285,
"reward_std": 0.14470278844237328,
"rewards/cosine_scaled_reward": -0.12444474548101425,
"rewards/format_reward": 0.0,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 1495.5893249511719,
"epoch": 0.116,
"grad_norm": 0.2956860065460205,
"kl": 1.84476375579834e-05,
"learning_rate": 5.8e-07,
"loss": -0.0464,
"reward": -0.258271723985672,
"reward_std": 0.17091093584895134,
"rewards/cosine_scaled_reward": -0.129135861992836,
"rewards/format_reward": 0.0,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 1491.5833740234375,
"epoch": 0.12,
"grad_norm": 0.21843063831329346,
"kl": 3.403425216674805e-05,
"learning_rate": 6e-07,
"loss": -0.025,
"reward": -0.22956868633627892,
"reward_std": 0.13988509960472584,
"rewards/cosine_scaled_reward": -0.11478434316813946,
"rewards/format_reward": 0.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 1494.8691101074219,
"epoch": 0.124,
"grad_norm": 0.27265459299087524,
"kl": 3.37064266204834e-05,
"learning_rate": 6.2e-07,
"loss": -0.0028,
"reward": -0.22422148287296295,
"reward_std": 0.13563549891114235,
"rewards/cosine_scaled_reward": -0.11211073212325573,
"rewards/format_reward": 0.0,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 1500.4226379394531,
"epoch": 0.128,
"grad_norm": 0.2947193682193756,
"kl": 4.464387893676758e-05,
"learning_rate": 6.4e-07,
"loss": -0.0253,
"reward": -0.2335178479552269,
"reward_std": 0.15123932622373104,
"rewards/cosine_scaled_reward": -0.11675892770290375,
"rewards/format_reward": 0.0,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 1512.3988342285156,
"epoch": 0.132,
"grad_norm": 0.27640435099601746,
"kl": 3.385916352272034e-05,
"learning_rate": 6.6e-07,
"loss": 0.0004,
"reward": -0.22090798616409302,
"reward_std": 0.1239312905818224,
"rewards/cosine_scaled_reward": -0.11045399680733681,
"rewards/format_reward": 0.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 1506.6190795898438,
"epoch": 0.136,
"grad_norm": 0.24311278760433197,
"kl": 2.8401613235473633e-05,
"learning_rate": 6.800000000000001e-07,
"loss": -0.0182,
"reward": -0.24944494664669037,
"reward_std": 0.1345935631543398,
"rewards/cosine_scaled_reward": -0.12472246773540974,
"rewards/format_reward": 0.0,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 1497.9107666015625,
"epoch": 0.14,
"grad_norm": 0.21990132331848145,
"kl": 4.646182060241699e-05,
"learning_rate": 7e-07,
"loss": -0.0132,
"reward": -0.22735398262739182,
"reward_std": 0.14252249151468277,
"rewards/cosine_scaled_reward": -0.11367699131369591,
"rewards/format_reward": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 1519.0357360839844,
"epoch": 0.144,
"grad_norm": 0.25016918778419495,
"kl": 5.340576171875e-05,
"learning_rate": 7.2e-07,
"loss": -0.0145,
"reward": -0.22054903954267502,
"reward_std": 0.14707811176776886,
"rewards/cosine_scaled_reward": -0.11027451977133751,
"rewards/format_reward": 0.0,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 1504.1250305175781,
"epoch": 0.148,
"grad_norm": 0.27119770646095276,
"kl": 6.008148193359375e-05,
"learning_rate": 7.4e-07,
"loss": -0.0074,
"reward": -0.21688038110733032,
"reward_std": 0.14238713681697845,
"rewards/cosine_scaled_reward": -0.10844019241631031,
"rewards/format_reward": 0.0,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 1498.2083740234375,
"epoch": 0.152,
"grad_norm": 0.22954951226711273,
"kl": 4.4226646423339844e-05,
"learning_rate": 7.599999999999999e-07,
"loss": -0.025,
"reward": -0.2533186711370945,
"reward_std": 0.13519956730306149,
"rewards/cosine_scaled_reward": -0.12665932811796665,
"rewards/format_reward": 0.0,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 1505.2440490722656,
"epoch": 0.156,
"grad_norm": 0.20884250104427338,
"kl": 3.9130449295043945e-05,
"learning_rate": 7.799999999999999e-07,
"loss": -0.0186,
"reward": -0.2563895806670189,
"reward_std": 0.1503661349415779,
"rewards/cosine_scaled_reward": -0.12819479033350945,
"rewards/format_reward": 0.0,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 1518.8809509277344,
"epoch": 0.16,
"grad_norm": 0.31034034490585327,
"kl": 4.661083221435547e-05,
"learning_rate": 8e-07,
"loss": -0.014,
"reward": -0.24387329444289207,
"reward_std": 0.14256866462528706,
"rewards/cosine_scaled_reward": -0.12193664722144604,
"rewards/format_reward": 0.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 1482.1964416503906,
"epoch": 0.164,
"grad_norm": 0.2809925973415375,
"kl": 0.00013720989227294922,
"learning_rate": 8.199999999999999e-07,
"loss": -0.0377,
"reward": -0.23471787199378014,
"reward_std": 0.1464288830757141,
"rewards/cosine_scaled_reward": -0.11735892854630947,
"rewards/format_reward": 0.0,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 1516.0416870117188,
"epoch": 0.168,
"grad_norm": 0.22887176275253296,
"kl": 0.00016939640045166016,
"learning_rate": 8.399999999999999e-07,
"loss": -0.0143,
"reward": -0.2401226907968521,
"reward_std": 0.12950069829821587,
"rewards/cosine_scaled_reward": -0.1200613472610712,
"rewards/format_reward": 0.0,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 1471.5536193847656,
"epoch": 0.172,
"grad_norm": 0.2556413412094116,
"kl": 0.0002295970916748047,
"learning_rate": 8.599999999999999e-07,
"loss": -0.0741,
"reward": -0.2366926297545433,
"reward_std": 0.17133169993758202,
"rewards/cosine_scaled_reward": -0.1183463241904974,
"rewards/format_reward": 0.0,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 1511.7678527832031,
"epoch": 0.176,
"grad_norm": 0.2524208724498749,
"kl": 0.0002219676971435547,
"learning_rate": 8.799999999999999e-07,
"loss": -0.0275,
"reward": -0.25312257930636406,
"reward_std": 0.16574446111917496,
"rewards/cosine_scaled_reward": -0.12656129337847233,
"rewards/format_reward": 0.0,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 1494.7321472167969,
"epoch": 0.18,
"grad_norm": 0.28623661398887634,
"kl": 0.0002727508544921875,
"learning_rate": 9e-07,
"loss": -0.0223,
"reward": -0.24078572914004326,
"reward_std": 0.14357317984104156,
"rewards/cosine_scaled_reward": -0.12039286643266678,
"rewards/format_reward": 0.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 1493.5238342285156,
"epoch": 0.184,
"grad_norm": 0.27407318353652954,
"kl": 0.000263214111328125,
"learning_rate": 9.2e-07,
"loss": -0.0368,
"reward": -0.23090793937444687,
"reward_std": 0.15557732805609703,
"rewards/cosine_scaled_reward": -0.11545397154986858,
"rewards/format_reward": 0.0,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 1501.1012268066406,
"epoch": 0.188,
"grad_norm": 0.26540830731391907,
"kl": 0.000286102294921875,
"learning_rate": 9.399999999999999e-07,
"loss": -0.0079,
"reward": -0.21494316309690475,
"reward_std": 0.14235420525074005,
"rewards/cosine_scaled_reward": -0.10747158527374268,
"rewards/format_reward": 0.0,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 1493.4285583496094,
"epoch": 0.192,
"grad_norm": 0.20362912118434906,
"kl": 0.0003046989440917969,
"learning_rate": 9.6e-07,
"loss": -0.0465,
"reward": -0.24233370646834373,
"reward_std": 0.16549209877848625,
"rewards/cosine_scaled_reward": -0.12116685323417187,
"rewards/format_reward": 0.0,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 1496.9107360839844,
"epoch": 0.196,
"grad_norm": 0.23884467780590057,
"kl": 0.00034809112548828125,
"learning_rate": 9.8e-07,
"loss": -0.0294,
"reward": -0.235232163220644,
"reward_std": 0.14046380668878555,
"rewards/cosine_scaled_reward": -0.1176160853356123,
"rewards/format_reward": 0.0,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 1512.2083740234375,
"epoch": 0.2,
"grad_norm": 0.2550745904445648,
"kl": 0.0003681182861328125,
"learning_rate": 1e-06,
"loss": -0.0009,
"reward": -0.14228077605366707,
"reward_std": 0.13498482666909695,
"rewards/cosine_scaled_reward": -0.07114038616418839,
"rewards/format_reward": 0.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 1514.3154602050781,
"epoch": 0.204,
"grad_norm": 0.26381194591522217,
"kl": 0.0003437995910644531,
"learning_rate": 9.999890338174275e-07,
"loss": -0.0287,
"reward": -0.24466010928153992,
"reward_std": 0.16256770864129066,
"rewards/cosine_scaled_reward": -0.12233005836606026,
"rewards/format_reward": 0.0,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 1503.1488647460938,
"epoch": 0.208,
"grad_norm": 0.1665869802236557,
"kl": 0.00042724609375,
"learning_rate": 9.999561358041868e-07,
"loss": -0.0102,
"reward": -0.2257002554833889,
"reward_std": 0.1360796671360731,
"rewards/cosine_scaled_reward": -0.11285012774169445,
"rewards/format_reward": 0.0,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 1486.7381286621094,
"epoch": 0.212,
"grad_norm": 0.2398059219121933,
"kl": 0.0004673004150390625,
"learning_rate": 9.999013075636804e-07,
"loss": -0.0184,
"reward": -0.23267855867743492,
"reward_std": 0.14873512834310532,
"rewards/cosine_scaled_reward": -0.11633927933871746,
"rewards/format_reward": 0.0,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 1509.2083435058594,
"epoch": 0.216,
"grad_norm": 0.2826734781265259,
"kl": 0.0006093978881835938,
"learning_rate": 9.998245517681593e-07,
"loss": -0.0234,
"reward": -0.2084299884736538,
"reward_std": 0.1463002786040306,
"rewards/cosine_scaled_reward": -0.1042149942368269,
"rewards/format_reward": 0.0,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 1501.0476379394531,
"epoch": 0.22,
"grad_norm": 0.29020005464553833,
"kl": 0.0004963874816894531,
"learning_rate": 9.997258721585931e-07,
"loss": -0.0355,
"reward": -0.2434540018439293,
"reward_std": 0.17717645689845085,
"rewards/cosine_scaled_reward": -0.12172700092196465,
"rewards/format_reward": 0.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 1504.0833435058594,
"epoch": 0.224,
"grad_norm": 0.26805394887924194,
"kl": 0.0004773139953613281,
"learning_rate": 9.996052735444862e-07,
"loss": 0.0009,
"reward": -0.2181985266506672,
"reward_std": 0.13354611210525036,
"rewards/cosine_scaled_reward": -0.10909926891326904,
"rewards/format_reward": 0.0,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 1507.6488342285156,
"epoch": 0.228,
"grad_norm": 0.27495479583740234,
"kl": 0.0012969970703125,
"learning_rate": 9.994627618036452e-07,
"loss": -0.0226,
"reward": -0.2011367231607437,
"reward_std": 0.14377126656472683,
"rewards/cosine_scaled_reward": -0.10056836158037186,
"rewards/format_reward": 0.0,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 1522.52978515625,
"epoch": 0.232,
"grad_norm": 0.22906683385372162,
"kl": 0.002330780029296875,
"learning_rate": 9.992983438818915e-07,
"loss": -0.0165,
"reward": -0.23109900206327438,
"reward_std": 0.13826126232743263,
"rewards/cosine_scaled_reward": -0.11554950661957264,
"rewards/format_reward": 0.0,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 1461.1904907226562,
"epoch": 0.236,
"grad_norm": 0.21256300806999207,
"kl": 0.004283905029296875,
"learning_rate": 9.991120277927223e-07,
"loss": -0.0729,
"reward": -0.19971829652786255,
"reward_std": 0.18633990362286568,
"rewards/cosine_scaled_reward": -0.09985914640128613,
"rewards/format_reward": 0.0,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 1518.7380981445312,
"epoch": 0.24,
"grad_norm": 0.2587541341781616,
"kl": 0.00316619873046875,
"learning_rate": 9.989038226169207e-07,
"loss": -0.0134,
"reward": -0.2089497372508049,
"reward_std": 0.14539672806859016,
"rewards/cosine_scaled_reward": -0.1044748667627573,
"rewards/format_reward": 0.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 1492.4881286621094,
"epoch": 0.244,
"grad_norm": 0.2516862154006958,
"kl": 0.0037384033203125,
"learning_rate": 9.98673738502114e-07,
"loss": -0.0497,
"reward": -0.22766747325658798,
"reward_std": 0.17161306738853455,
"rewards/cosine_scaled_reward": -0.11383373104035854,
"rewards/format_reward": 0.0,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 1479.232177734375,
"epoch": 0.248,
"grad_norm": 0.26911845803260803,
"kl": 0.004444122314453125,
"learning_rate": 9.98421786662277e-07,
"loss": -0.0545,
"reward": -0.22953158989548683,
"reward_std": 0.1646084077656269,
"rewards/cosine_scaled_reward": -0.11476579494774342,
"rewards/format_reward": 0.0,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 1500.9821472167969,
"epoch": 0.252,
"grad_norm": 0.30081605911254883,
"kl": 0.0038433074951171875,
"learning_rate": 9.981479793771866e-07,
"loss": -0.0306,
"reward": -0.2425815463066101,
"reward_std": 0.15784814581274986,
"rewards/cosine_scaled_reward": -0.1212907712906599,
"rewards/format_reward": 0.0,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 1474.2262268066406,
"epoch": 0.256,
"grad_norm": 0.11899983882904053,
"kl": 0.00433349609375,
"learning_rate": 9.97852329991824e-07,
"loss": -0.028,
"reward": -0.1484425999224186,
"reward_std": 0.13878681510686874,
"rewards/cosine_scaled_reward": -0.07422130089253187,
"rewards/format_reward": 0.0,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 1469.2440490722656,
"epoch": 0.26,
"grad_norm": 0.21076174080371857,
"kl": 0.0048980712890625,
"learning_rate": 9.975348529157229e-07,
"loss": -0.063,
"reward": -0.23214704915881157,
"reward_std": 0.15723764523863792,
"rewards/cosine_scaled_reward": -0.11607352085411549,
"rewards/format_reward": 0.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 1479.2738342285156,
"epoch": 0.264,
"grad_norm": 0.25172120332717896,
"kl": 0.0048065185546875,
"learning_rate": 9.971955636222684e-07,
"loss": -0.0575,
"reward": -0.24701237678527832,
"reward_std": 0.176135566085577,
"rewards/cosine_scaled_reward": -0.12350618466734886,
"rewards/format_reward": 0.0,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 1460.0178833007812,
"epoch": 0.268,
"grad_norm": 0.2059454619884491,
"kl": 0.006103515625,
"learning_rate": 9.968344786479415e-07,
"loss": -0.0581,
"reward": -0.21051475405693054,
"reward_std": 0.1539991032332182,
"rewards/cosine_scaled_reward": -0.10525736771523952,
"rewards/format_reward": 0.0,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 1510.8035888671875,
"epoch": 0.272,
"grad_norm": 0.2547614276409149,
"kl": 0.0057525634765625,
"learning_rate": 9.964516155915151e-07,
"loss": -0.017,
"reward": -0.18334244936704636,
"reward_std": 0.151863232254982,
"rewards/cosine_scaled_reward": -0.09167122654616833,
"rewards/format_reward": 0.0,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 1481.8392944335938,
"epoch": 0.276,
"grad_norm": 0.17602519690990448,
"kl": 0.00487518310546875,
"learning_rate": 9.960469931131936e-07,
"loss": -0.0368,
"reward": -0.15826850943267345,
"reward_std": 0.1565675064921379,
"rewards/cosine_scaled_reward": -0.07913425378501415,
"rewards/format_reward": 0.0,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 1488.6726379394531,
"epoch": 0.28,
"grad_norm": 0.2315431833267212,
"kl": 0.00534820556640625,
"learning_rate": 9.956206309337066e-07,
"loss": -0.0383,
"reward": -0.21064428612589836,
"reward_std": 0.14298444241285324,
"rewards/cosine_scaled_reward": -0.10532214120030403,
"rewards/format_reward": 0.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 1521.3095397949219,
"epoch": 0.284,
"grad_norm": 0.25028038024902344,
"kl": 0.00449371337890625,
"learning_rate": 9.951725498333448e-07,
"loss": -0.018,
"reward": -0.2252160757780075,
"reward_std": 0.1335316188633442,
"rewards/cosine_scaled_reward": -0.11260804533958435,
"rewards/format_reward": 0.0,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 1487.1666870117188,
"epoch": 0.288,
"grad_norm": 0.3014598488807678,
"kl": 0.0053863525390625,
"learning_rate": 9.947027716509488e-07,
"loss": -0.0296,
"reward": -0.22651539742946625,
"reward_std": 0.15895461291074753,
"rewards/cosine_scaled_reward": -0.11325769871473312,
"rewards/format_reward": 0.0,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 1500.0417175292969,
"epoch": 0.292,
"grad_norm": 0.19289681315422058,
"kl": 0.00534820556640625,
"learning_rate": 9.942113192828444e-07,
"loss": -0.0204,
"reward": -0.22585123777389526,
"reward_std": 0.1563442163169384,
"rewards/cosine_scaled_reward": -0.11292561888694763,
"rewards/format_reward": 0.0,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 1481.1845703125,
"epoch": 0.296,
"grad_norm": 0.21433287858963013,
"kl": 0.00696563720703125,
"learning_rate": 9.93698216681727e-07,
"loss": -0.0248,
"reward": -0.21346117928624153,
"reward_std": 0.16779018752276897,
"rewards/cosine_scaled_reward": -0.10673058778047562,
"rewards/format_reward": 0.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 1492.0595397949219,
"epoch": 0.3,
"grad_norm": 0.20086415112018585,
"kl": 0.0060577392578125,
"learning_rate": 9.931634888554935e-07,
"loss": -0.0315,
"reward": -0.24520759657025337,
"reward_std": 0.16547074727714062,
"rewards/cosine_scaled_reward": -0.12260380387306213,
"rewards/format_reward": 0.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 1509.7440795898438,
"epoch": 0.304,
"grad_norm": 0.2499692440032959,
"kl": 0.00594329833984375,
"learning_rate": 9.926071618660237e-07,
"loss": -0.029,
"reward": -0.22075266018509865,
"reward_std": 0.1520039215683937,
"rewards/cosine_scaled_reward": -0.11037633195519447,
"rewards/format_reward": 0.0,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 1493.0655212402344,
"epoch": 0.308,
"grad_norm": 0.1303885579109192,
"kl": 0.0078887939453125,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0068,
"reward": -0.20533370971679688,
"reward_std": 0.1422851476818323,
"rewards/cosine_scaled_reward": -0.10266684927046299,
"rewards/format_reward": 0.0,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 1501.3035888671875,
"epoch": 0.312,
"grad_norm": 0.23692218959331512,
"kl": 0.0076141357421875,
"learning_rate": 9.91429819907136e-07,
"loss": -0.0294,
"reward": -0.1995321549475193,
"reward_std": 0.15328680351376534,
"rewards/cosine_scaled_reward": -0.0997660793364048,
"rewards/format_reward": 0.0,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 1516.3512268066406,
"epoch": 0.316,
"grad_norm": 0.2173466831445694,
"kl": 0.0062408447265625,
"learning_rate": 9.908088623197048e-07,
"loss": -0.0243,
"reward": -0.22904419153928757,
"reward_std": 0.1448185909539461,
"rewards/cosine_scaled_reward": -0.11452210135757923,
"rewards/format_reward": 0.0,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 1484.3869323730469,
"epoch": 0.32,
"grad_norm": 0.16200865805149078,
"kl": 0.00707244873046875,
"learning_rate": 9.901664203302124e-07,
"loss": -0.0239,
"reward": -0.22017718479037285,
"reward_std": 0.15285737439990044,
"rewards/cosine_scaled_reward": -0.11008859053254128,
"rewards/format_reward": 0.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 1484.7261962890625,
"epoch": 0.324,
"grad_norm": 0.1811896711587906,
"kl": 0.00650787353515625,
"learning_rate": 9.895025252503755e-07,
"loss": -0.0262,
"reward": -0.24382955580949783,
"reward_std": 0.1623360477387905,
"rewards/cosine_scaled_reward": -0.12191477790474892,
"rewards/format_reward": 0.0,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 1456.0357360839844,
"epoch": 0.328,
"grad_norm": 0.1585092842578888,
"kl": 0.010345458984375,
"learning_rate": 9.888172094375033e-07,
"loss": -0.066,
"reward": -0.23999103158712387,
"reward_std": 0.1869661882519722,
"rewards/cosine_scaled_reward": -0.11999551579356194,
"rewards/format_reward": 0.0,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1485.02978515625,
"epoch": 0.332,
"grad_norm": 0.2301841825246811,
"kl": 0.0067596435546875,
"learning_rate": 9.881105062929221e-07,
"loss": -0.0321,
"reward": -0.2552091106772423,
"reward_std": 0.1685757040977478,
"rewards/cosine_scaled_reward": -0.12760455161333084,
"rewards/format_reward": 0.0,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 1486.7024230957031,
"epoch": 0.336,
"grad_norm": 0.24452205002307892,
"kl": 0.0095062255859375,
"learning_rate": 9.873824502603459e-07,
"loss": -0.0399,
"reward": -0.2177020013332367,
"reward_std": 0.1546536386013031,
"rewards/cosine_scaled_reward": -0.1088510025292635,
"rewards/format_reward": 0.0,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 1472.2083740234375,
"epoch": 0.34,
"grad_norm": 0.2162124067544937,
"kl": 0.00879669189453125,
"learning_rate": 9.866330768241983e-07,
"loss": -0.0363,
"reward": -0.2306637428700924,
"reward_std": 0.15445036813616753,
"rewards/cosine_scaled_reward": -0.11533187702298164,
"rewards/format_reward": 0.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 1514.90478515625,
"epoch": 0.344,
"grad_norm": 0.29467546939849854,
"kl": 0.010467529296875,
"learning_rate": 9.85862422507884e-07,
"loss": -0.0225,
"reward": -0.21584435179829597,
"reward_std": 0.15913846716284752,
"rewards/cosine_scaled_reward": -0.10792217776179314,
"rewards/format_reward": 0.0,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 1493.9166870117188,
"epoch": 0.348,
"grad_norm": 0.2552238404750824,
"kl": 0.0130157470703125,
"learning_rate": 9.850705248720068e-07,
"loss": -0.0419,
"reward": -0.22282668948173523,
"reward_std": 0.1558297798037529,
"rewards/cosine_scaled_reward": -0.11141334660351276,
"rewards/format_reward": 0.0,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 1516.4166870117188,
"epoch": 0.352,
"grad_norm": 0.18905065953731537,
"kl": 0.0161590576171875,
"learning_rate": 9.8425742251254e-07,
"loss": -0.0124,
"reward": -0.2435382977128029,
"reward_std": 0.1510351374745369,
"rewards/cosine_scaled_reward": -0.1217691469937563,
"rewards/format_reward": 0.0,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 1456.6131286621094,
"epoch": 0.356,
"grad_norm": 0.18983080983161926,
"kl": 0.0308837890625,
"learning_rate": 9.83423155058946e-07,
"loss": -0.0716,
"reward": -0.23182464018464088,
"reward_std": 0.17492059245705605,
"rewards/cosine_scaled_reward": -0.11591232009232044,
"rewards/format_reward": 0.0,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 1462.7916870117188,
"epoch": 0.36,
"grad_norm": 0.18697425723075867,
"kl": 0.036468505859375,
"learning_rate": 9.825677631722435e-07,
"loss": -0.0794,
"reward": -0.25727425515651703,
"reward_std": 0.18599339574575424,
"rewards/cosine_scaled_reward": -0.12863712757825851,
"rewards/format_reward": 0.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 1452.9166870117188,
"epoch": 0.364,
"grad_norm": 0.11306176334619522,
"kl": 0.05767822265625,
"learning_rate": 9.816912885430258e-07,
"loss": -0.074,
"reward": -0.2355377934873104,
"reward_std": 0.1886419989168644,
"rewards/cosine_scaled_reward": -0.11776889488101006,
"rewards/format_reward": 0.0,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 1423.2381286621094,
"epoch": 0.368,
"grad_norm": 0.12384199351072311,
"kl": 0.0667724609375,
"learning_rate": 9.807937738894303e-07,
"loss": -0.094,
"reward": -0.28002386912703514,
"reward_std": 0.22308171913027763,
"rewards/cosine_scaled_reward": -0.14001193456351757,
"rewards/format_reward": 0.0,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 1463.5298156738281,
"epoch": 0.372,
"grad_norm": 0.1097106859087944,
"kl": 0.0745849609375,
"learning_rate": 9.798752629550546e-07,
"loss": -0.0497,
"reward": -0.260006383061409,
"reward_std": 0.20789402723312378,
"rewards/cosine_scaled_reward": -0.1300031915307045,
"rewards/format_reward": 0.0,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 1423.0416870117188,
"epoch": 0.376,
"grad_norm": 0.13807816803455353,
"kl": 0.0921630859375,
"learning_rate": 9.78935800506826e-07,
"loss": -0.0664,
"reward": -0.2304685339331627,
"reward_std": 0.20511355623602867,
"rewards/cosine_scaled_reward": -0.11523427255451679,
"rewards/format_reward": 0.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 1439.827392578125,
"epoch": 0.38,
"grad_norm": 0.1847662776708603,
"kl": 0.080810546875,
"learning_rate": 9.779754323328192e-07,
"loss": -0.0574,
"reward": -0.22580492869019508,
"reward_std": 0.19689049944281578,
"rewards/cosine_scaled_reward": -0.11290246807038784,
"rewards/format_reward": 0.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 1450.6666870117188,
"epoch": 0.384,
"grad_norm": 0.14935868978500366,
"kl": 0.0830078125,
"learning_rate": 9.769942052400235e-07,
"loss": -0.0721,
"reward": -0.28848847001791,
"reward_std": 0.22373779118061066,
"rewards/cosine_scaled_reward": -0.144244235008955,
"rewards/format_reward": 0.0,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 1433.7916870117188,
"epoch": 0.388,
"grad_norm": 0.14467309415340424,
"kl": 0.0953369140625,
"learning_rate": 9.759921670520634e-07,
"loss": -0.0565,
"reward": -0.2675531320273876,
"reward_std": 0.19667528942227364,
"rewards/cosine_scaled_reward": -0.1337765622884035,
"rewards/format_reward": 0.0,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1444.4345703125,
"epoch": 0.392,
"grad_norm": 0.14709749817848206,
"kl": 0.092529296875,
"learning_rate": 9.749693666068663e-07,
"loss": -0.0794,
"reward": -0.26461150124669075,
"reward_std": 0.2228638045489788,
"rewards/cosine_scaled_reward": -0.13230575062334538,
"rewards/format_reward": 0.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 1438.6369018554688,
"epoch": 0.396,
"grad_norm": 0.1645117700099945,
"kl": 0.090576171875,
"learning_rate": 9.739258537542835e-07,
"loss": -0.0804,
"reward": -0.2615456096827984,
"reward_std": 0.22437894716858864,
"rewards/cosine_scaled_reward": -0.13077280297875404,
"rewards/format_reward": 0.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 1447.9404907226562,
"epoch": 0.4,
"grad_norm": 0.12536393105983734,
"kl": 0.0902099609375,
"learning_rate": 9.728616793536587e-07,
"loss": -0.0565,
"reward": -0.23606609553098679,
"reward_std": 0.20211446657776833,
"rewards/cosine_scaled_reward": -0.11803305521607399,
"rewards/format_reward": 0.0,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 1461.0238647460938,
"epoch": 0.404,
"grad_norm": 0.1715729534626007,
"kl": 0.0977783203125,
"learning_rate": 9.717768952713511e-07,
"loss": -0.0419,
"reward": -0.25642454996705055,
"reward_std": 0.20571942254900932,
"rewards/cosine_scaled_reward": -0.12821227870881557,
"rewards/format_reward": 0.0,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 1441.0595397949219,
"epoch": 0.408,
"grad_norm": 0.14349091053009033,
"kl": 0.07373046875,
"learning_rate": 9.706715543782064e-07,
"loss": -0.0755,
"reward": -0.25798120722174644,
"reward_std": 0.18921422585844994,
"rewards/cosine_scaled_reward": -0.12899060919880867,
"rewards/format_reward": 0.0,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 1429.2559814453125,
"epoch": 0.412,
"grad_norm": 0.12457617372274399,
"kl": 0.08154296875,
"learning_rate": 9.695457105469804e-07,
"loss": -0.0635,
"reward": -0.2596958056092262,
"reward_std": 0.21758576482534409,
"rewards/cosine_scaled_reward": -0.12984789907932281,
"rewards/format_reward": 0.0,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 1455.0000610351562,
"epoch": 0.416,
"grad_norm": 0.1320362240076065,
"kl": 0.0777587890625,
"learning_rate": 9.683994186497132e-07,
"loss": -0.0587,
"reward": -0.23436888307332993,
"reward_std": 0.18900957331061363,
"rewards/cosine_scaled_reward": -0.11718444526195526,
"rewards/format_reward": 0.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 1421.0952758789062,
"epoch": 0.42,
"grad_norm": 0.13309422135353088,
"kl": 0.0897216796875,
"learning_rate": 9.672327345550543e-07,
"loss": -0.0934,
"reward": -0.25493229925632477,
"reward_std": 0.22001174464821815,
"rewards/cosine_scaled_reward": -0.12746614776551723,
"rewards/format_reward": 0.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 1490.9940795898438,
"epoch": 0.424,
"grad_norm": 0.1491984874010086,
"kl": 0.0736083984375,
"learning_rate": 9.66045715125541e-07,
"loss": -0.0299,
"reward": -0.2312908135354519,
"reward_std": 0.16199098154902458,
"rewards/cosine_scaled_reward": -0.11564541421830654,
"rewards/format_reward": 0.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 1421.0357666015625,
"epoch": 0.428,
"grad_norm": 0.16547471284866333,
"kl": 0.0780029296875,
"learning_rate": 9.648384182148252e-07,
"loss": -0.067,
"reward": -0.27354947850108147,
"reward_std": 0.1951713114976883,
"rewards/cosine_scaled_reward": -0.13677473925054073,
"rewards/format_reward": 0.0,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 1469.0833740234375,
"epoch": 0.432,
"grad_norm": 0.14035455882549286,
"kl": 0.0938720703125,
"learning_rate": 9.636109026648554e-07,
"loss": -0.0248,
"reward": -0.22176991775631905,
"reward_std": 0.1815592534840107,
"rewards/cosine_scaled_reward": -0.11088495329022408,
"rewards/format_reward": 0.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 1440.5238647460938,
"epoch": 0.436,
"grad_norm": 0.11456211656332016,
"kl": 0.077880859375,
"learning_rate": 9.623632283030077e-07,
"loss": -0.0604,
"reward": -0.2620925232768059,
"reward_std": 0.21131489053368568,
"rewards/cosine_scaled_reward": -0.13104625791311264,
"rewards/format_reward": 0.0,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 1479.4583740234375,
"epoch": 0.44,
"grad_norm": 0.11461742222309113,
"kl": 0.07177734375,
"learning_rate": 9.610954559391704e-07,
"loss": -0.0354,
"reward": -0.21340973302721977,
"reward_std": 0.18217052891850471,
"rewards/cosine_scaled_reward": -0.10670486651360989,
"rewards/format_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 1417.2976379394531,
"epoch": 0.444,
"grad_norm": 0.1527003049850464,
"kl": 0.07318115234375,
"learning_rate": 9.598076473627796e-07,
"loss": -0.1136,
"reward": -0.24674446135759354,
"reward_std": 0.19690455496311188,
"rewards/cosine_scaled_reward": -0.12337223440408707,
"rewards/format_reward": 0.0,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 1433.4464721679688,
"epoch": 0.448,
"grad_norm": 0.1355064958333969,
"kl": 0.0743408203125,
"learning_rate": 9.58499865339809e-07,
"loss": -0.0673,
"reward": -0.2527715191245079,
"reward_std": 0.19281791523098946,
"rewards/cosine_scaled_reward": -0.12638575583696365,
"rewards/format_reward": 0.0,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 1506.5714721679688,
"epoch": 0.452,
"grad_norm": 0.11734712868928909,
"kl": 0.0765380859375,
"learning_rate": 9.571721736097088e-07,
"loss": -0.0251,
"reward": -0.22142696008086205,
"reward_std": 0.18005133792757988,
"rewards/cosine_scaled_reward": -0.11071347445249557,
"rewards/format_reward": 0.0,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 1462.7084045410156,
"epoch": 0.456,
"grad_norm": 0.1974153220653534,
"kl": 0.0665283203125,
"learning_rate": 9.55824636882301e-07,
"loss": -0.0632,
"reward": -0.1928116953931749,
"reward_std": 0.19508182629942894,
"rewards/cosine_scaled_reward": -0.09640584839507937,
"rewards/format_reward": 0.0,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 1444.9524230957031,
"epoch": 0.46,
"grad_norm": 0.13885585963726044,
"kl": 0.0650634765625,
"learning_rate": 9.54457320834625e-07,
"loss": -0.0436,
"reward": -0.2363814003765583,
"reward_std": 0.19080934301018715,
"rewards/cosine_scaled_reward": -0.11819070391356945,
"rewards/format_reward": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 1462.9881286621094,
"epoch": 0.464,
"grad_norm": 0.14188507199287415,
"kl": 0.0745849609375,
"learning_rate": 9.530702921077358e-07,
"loss": -0.0326,
"reward": -0.21813786774873734,
"reward_std": 0.17841872572898865,
"rewards/cosine_scaled_reward": -0.10906893201172352,
"rewards/format_reward": 0.0,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 1423.232177734375,
"epoch": 0.468,
"grad_norm": 0.14759542047977448,
"kl": 0.07354736328125,
"learning_rate": 9.516636183034564e-07,
"loss": -0.0955,
"reward": -0.24884852021932602,
"reward_std": 0.20949439704418182,
"rewards/cosine_scaled_reward": -0.12442425638437271,
"rewards/format_reward": 0.0,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 1433.3631286621094,
"epoch": 0.472,
"grad_norm": 0.12356512248516083,
"kl": 0.0736083984375,
"learning_rate": 9.502373679810839e-07,
"loss": -0.0907,
"reward": -0.2389114946126938,
"reward_std": 0.1962103582918644,
"rewards/cosine_scaled_reward": -0.11945574544370174,
"rewards/format_reward": 0.0,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 1425.5476684570312,
"epoch": 0.476,
"grad_norm": 0.13675737380981445,
"kl": 0.080322265625,
"learning_rate": 9.487916106540465e-07,
"loss": -0.0548,
"reward": -0.22732584923505783,
"reward_std": 0.17597166821360588,
"rewards/cosine_scaled_reward": -0.11366293206810951,
"rewards/format_reward": 0.0,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 1407.125,
"epoch": 0.48,
"grad_norm": 0.1944396197795868,
"kl": 0.0782470703125,
"learning_rate": 9.473264167865171e-07,
"loss": -0.1104,
"reward": -0.23876191675662994,
"reward_std": 0.21116216480731964,
"rewards/cosine_scaled_reward": -0.11938095837831497,
"rewards/format_reward": 0.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 1460.9940795898438,
"epoch": 0.484,
"grad_norm": 0.13976925611495972,
"kl": 0.07110595703125,
"learning_rate": 9.458418577899774e-07,
"loss": -0.0623,
"reward": -0.22172370925545692,
"reward_std": 0.17737172171473503,
"rewards/cosine_scaled_reward": -0.11086185090243816,
"rewards/format_reward": 0.0,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 1445.0536193847656,
"epoch": 0.488,
"grad_norm": 0.1271459460258484,
"kl": 0.0771484375,
"learning_rate": 9.443380060197385e-07,
"loss": -0.09,
"reward": -0.234033714979887,
"reward_std": 0.21107907965779305,
"rewards/cosine_scaled_reward": -0.1170168574899435,
"rewards/format_reward": 0.0,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 1461.5833740234375,
"epoch": 0.492,
"grad_norm": 0.1251479536294937,
"kl": 0.07427978515625,
"learning_rate": 9.428149347714143e-07,
"loss": -0.0677,
"reward": -0.21826723590493202,
"reward_std": 0.19678263366222382,
"rewards/cosine_scaled_reward": -0.10913361981511116,
"rewards/format_reward": 0.0,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 1441.7857360839844,
"epoch": 0.496,
"grad_norm": 0.16092143952846527,
"kl": 0.060302734375,
"learning_rate": 9.412727182773486e-07,
"loss": -0.0921,
"reward": -0.22351692616939545,
"reward_std": 0.2029583677649498,
"rewards/cosine_scaled_reward": -0.11175846680998802,
"rewards/format_reward": 0.0,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 1426.7262268066406,
"epoch": 0.5,
"grad_norm": 0.1188863217830658,
"kl": 0.0787353515625,
"learning_rate": 9.397114317029974e-07,
"loss": -0.0431,
"reward": -0.2245512492954731,
"reward_std": 0.17906467244029045,
"rewards/cosine_scaled_reward": -0.11227562837302685,
"rewards/format_reward": 0.0,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1443.7917175292969,
"epoch": 0.504,
"grad_norm": 0.11914021521806717,
"kl": 0.0745849609375,
"learning_rate": 9.381311511432658e-07,
"loss": -0.0781,
"reward": -0.19801979139447212,
"reward_std": 0.1959025263786316,
"rewards/cosine_scaled_reward": -0.09900989942252636,
"rewards/format_reward": 0.0,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 1415.8512268066406,
"epoch": 0.508,
"grad_norm": 0.1357516646385193,
"kl": 0.07177734375,
"learning_rate": 9.36531953618799e-07,
"loss": -0.0865,
"reward": -0.21713878214359283,
"reward_std": 0.18351096659898758,
"rewards/cosine_scaled_reward": -0.10856938920915127,
"rewards/format_reward": 0.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 1472.5000305175781,
"epoch": 0.512,
"grad_norm": 0.1369076818227768,
"kl": 0.05865478515625,
"learning_rate": 9.34913917072228e-07,
"loss": -0.0478,
"reward": -0.21694474667310715,
"reward_std": 0.17777032032608986,
"rewards/cosine_scaled_reward": -0.10847238078713417,
"rewards/format_reward": 0.0,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 1477.2559509277344,
"epoch": 0.516,
"grad_norm": 0.17333942651748657,
"kl": 0.0572509765625,
"learning_rate": 9.332771203643714e-07,
"loss": -0.0249,
"reward": -0.21337437257170677,
"reward_std": 0.18009737133979797,
"rewards/cosine_scaled_reward": -0.10668718256056309,
"rewards/format_reward": 0.0,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 1473.7381286621094,
"epoch": 0.52,
"grad_norm": 0.10983088612556458,
"kl": 0.0703125,
"learning_rate": 9.316216432703916e-07,
"loss": -0.0583,
"reward": -0.19089093804359436,
"reward_std": 0.18549956753849983,
"rewards/cosine_scaled_reward": -0.09544547088444233,
"rewards/format_reward": 0.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 1432.7559814453125,
"epoch": 0.524,
"grad_norm": 0.16586177051067352,
"kl": 0.0791015625,
"learning_rate": 9.299475664759068e-07,
"loss": -0.0994,
"reward": -0.23504262417554855,
"reward_std": 0.21443113684654236,
"rewards/cosine_scaled_reward": -0.11752131581306458,
"rewards/format_reward": 0.0,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 1481.9762268066406,
"epoch": 0.528,
"grad_norm": 0.17732541263103485,
"kl": 0.06622314453125,
"learning_rate": 9.282549715730579e-07,
"loss": -0.0324,
"reward": -0.17852769792079926,
"reward_std": 0.1502333115786314,
"rewards/cosine_scaled_reward": -0.08926384896039963,
"rewards/format_reward": 0.0,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 1476.4048156738281,
"epoch": 0.532,
"grad_norm": 0.15965314209461212,
"kl": 0.060546875,
"learning_rate": 9.265439410565328e-07,
"loss": -0.058,
"reward": -0.22374625876545906,
"reward_std": 0.18331025168299675,
"rewards/cosine_scaled_reward": -0.11187312379479408,
"rewards/format_reward": 0.0,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 1490.8452453613281,
"epoch": 0.536,
"grad_norm": 0.11577272415161133,
"kl": 0.0740966796875,
"learning_rate": 9.248145583195447e-07,
"loss": -0.0366,
"reward": -0.1072634905576706,
"reward_std": 0.15717832930386066,
"rewards/cosine_scaled_reward": -0.05363174341619015,
"rewards/format_reward": 0.0,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 1490.982177734375,
"epoch": 0.54,
"grad_norm": 0.14535216987133026,
"kl": 0.05950927734375,
"learning_rate": 9.230669076497687e-07,
"loss": -0.0312,
"reward": -0.21274039149284363,
"reward_std": 0.15816222876310349,
"rewards/cosine_scaled_reward": -0.10637019760906696,
"rewards/format_reward": 0.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1475.7679138183594,
"epoch": 0.544,
"grad_norm": 0.14940175414085388,
"kl": 0.072265625,
"learning_rate": 9.213010742252327e-07,
"loss": -0.0302,
"reward": -0.20727308467030525,
"reward_std": 0.18025333806872368,
"rewards/cosine_scaled_reward": -0.10363654233515263,
"rewards/format_reward": 0.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 1497.27978515625,
"epoch": 0.548,
"grad_norm": 0.11683686077594757,
"kl": 0.0675048828125,
"learning_rate": 9.195171441101668e-07,
"loss": -0.0354,
"reward": -0.19653399288654327,
"reward_std": 0.17340604588389397,
"rewards/cosine_scaled_reward": -0.09826699271798134,
"rewards/format_reward": 0.0,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 1472.4524230957031,
"epoch": 0.552,
"grad_norm": 0.19184042513370514,
"kl": 0.0703125,
"learning_rate": 9.177152042508077e-07,
"loss": -0.0365,
"reward": -0.20156748220324516,
"reward_std": 0.16703343763947487,
"rewards/cosine_scaled_reward": -0.10078373923897743,
"rewards/format_reward": 0.0,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 1441.5238342285156,
"epoch": 0.556,
"grad_norm": 0.21460425853729248,
"kl": 0.0751953125,
"learning_rate": 9.158953424711624e-07,
"loss": -0.0713,
"reward": -0.1934008002281189,
"reward_std": 0.1858229860663414,
"rewards/cosine_scaled_reward": -0.096700394526124,
"rewards/format_reward": 0.0,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 1477.5536193847656,
"epoch": 0.56,
"grad_norm": 0.1624855250120163,
"kl": 0.07452392578125,
"learning_rate": 9.140576474687263e-07,
"loss": -0.0644,
"reward": -0.20967105776071548,
"reward_std": 0.17497684434056282,
"rewards/cosine_scaled_reward": -0.1048355270177126,
"rewards/format_reward": 0.0,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 1488.3928833007812,
"epoch": 0.564,
"grad_norm": 0.13148972392082214,
"kl": 0.0745849609375,
"learning_rate": 9.122022088101613e-07,
"loss": -0.0365,
"reward": -0.12153960764408112,
"reward_std": 0.15567267499864101,
"rewards/cosine_scaled_reward": -0.06076979637145996,
"rewards/format_reward": 0.0,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 1439.5536193847656,
"epoch": 0.568,
"grad_norm": 0.18604105710983276,
"kl": 0.0751953125,
"learning_rate": 9.103291169269299e-07,
"loss": -0.0679,
"reward": -0.21637247875332832,
"reward_std": 0.19511258974671364,
"rewards/cosine_scaled_reward": -0.10818623751401901,
"rewards/format_reward": 0.0,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 1486.5774230957031,
"epoch": 0.572,
"grad_norm": 0.17263971269130707,
"kl": 0.07757568359375,
"learning_rate": 9.084384631108882e-07,
"loss": -0.0389,
"reward": -0.2009837031364441,
"reward_std": 0.16309702023863792,
"rewards/cosine_scaled_reward": -0.10049185156822205,
"rewards/format_reward": 0.0,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 1478.3988647460938,
"epoch": 0.576,
"grad_norm": 0.14938153326511383,
"kl": 0.079833984375,
"learning_rate": 9.065303395098358e-07,
"loss": -0.0585,
"reward": -0.1797693967819214,
"reward_std": 0.16727757826447487,
"rewards/cosine_scaled_reward": -0.08988469652831554,
"rewards/format_reward": 0.0,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 1460.5417175292969,
"epoch": 0.58,
"grad_norm": 0.18892593681812286,
"kl": 0.0838623046875,
"learning_rate": 9.046048391230247e-07,
"loss": -0.061,
"reward": -0.1960761584341526,
"reward_std": 0.16931083425879478,
"rewards/cosine_scaled_reward": -0.0980380792170763,
"rewards/format_reward": 0.0,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 1495.2916870117188,
"epoch": 0.584,
"grad_norm": 0.20080548524856567,
"kl": 0.0863037109375,
"learning_rate": 9.026620557966279e-07,
"loss": -0.0189,
"reward": -0.18559397384524345,
"reward_std": 0.16799203678965569,
"rewards/cosine_scaled_reward": -0.09279698692262173,
"rewards/format_reward": 0.0,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 1502.8809814453125,
"epoch": 0.588,
"grad_norm": 0.15992848575115204,
"kl": 0.0850830078125,
"learning_rate": 9.007020842191634e-07,
"loss": -0.0201,
"reward": -0.19452324509620667,
"reward_std": 0.1573326252400875,
"rewards/cosine_scaled_reward": -0.09726162627339363,
"rewards/format_reward": 0.0,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 1463.65478515625,
"epoch": 0.592,
"grad_norm": 0.12942340970039368,
"kl": 0.08453369140625,
"learning_rate": 8.987250199168808e-07,
"loss": -0.0368,
"reward": -0.20581213757395744,
"reward_std": 0.1570763811469078,
"rewards/cosine_scaled_reward": -0.10290606319904327,
"rewards/format_reward": 0.0,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 1493.7619323730469,
"epoch": 0.596,
"grad_norm": 0.18132224678993225,
"kl": 0.0960693359375,
"learning_rate": 8.967309592491052e-07,
"loss": -0.023,
"reward": -0.19545432925224304,
"reward_std": 0.1527048945426941,
"rewards/cosine_scaled_reward": -0.09772716276347637,
"rewards/format_reward": 0.0,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 1462.8452758789062,
"epoch": 0.6,
"grad_norm": 0.17014774680137634,
"kl": 0.1109619140625,
"learning_rate": 8.9471999940354e-07,
"loss": -0.0427,
"reward": -0.203117735683918,
"reward_std": 0.18610898405313492,
"rewards/cosine_scaled_reward": -0.10155886970460415,
"rewards/format_reward": 0.0,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 1477.3393249511719,
"epoch": 0.604,
"grad_norm": 0.15196801722049713,
"kl": 0.1031494140625,
"learning_rate": 8.926922383915315e-07,
"loss": -0.0465,
"reward": -0.20808908715844154,
"reward_std": 0.18062585964798927,
"rewards/cosine_scaled_reward": -0.10404454357922077,
"rewards/format_reward": 0.0,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 1479.3333740234375,
"epoch": 0.608,
"grad_norm": 0.13162937760353088,
"kl": 0.0885009765625,
"learning_rate": 8.906477750432903e-07,
"loss": -0.0501,
"reward": -0.21273208782076836,
"reward_std": 0.16958871111273766,
"rewards/cosine_scaled_reward": -0.10636604763567448,
"rewards/format_reward": 0.0,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 1508.1904907226562,
"epoch": 0.612,
"grad_norm": 0.15801787376403809,
"kl": 0.0894775390625,
"learning_rate": 8.88586709003076e-07,
"loss": -0.024,
"reward": -0.18563585355877876,
"reward_std": 0.14705245569348335,
"rewards/cosine_scaled_reward": -0.09281792864203453,
"rewards/format_reward": 0.0,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 1487.75,
"epoch": 0.616,
"grad_norm": 0.19425953924655914,
"kl": 0.1038818359375,
"learning_rate": 8.865091407243394e-07,
"loss": -0.0361,
"reward": -0.18547611683607101,
"reward_std": 0.1538998931646347,
"rewards/cosine_scaled_reward": -0.09273805841803551,
"rewards/format_reward": 0.0,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 1479.5595703125,
"epoch": 0.62,
"grad_norm": 0.13653838634490967,
"kl": 0.1011962890625,
"learning_rate": 8.844151714648274e-07,
"loss": -0.0666,
"reward": -0.20050010830163956,
"reward_std": 0.15926911309361458,
"rewards/cosine_scaled_reward": -0.10025005042552948,
"rewards/format_reward": 0.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1516.6071472167969,
"epoch": 0.624,
"grad_norm": 0.1796942949295044,
"kl": 0.1038818359375,
"learning_rate": 8.823049032816478e-07,
"loss": -0.0216,
"reward": -0.1935092769563198,
"reward_std": 0.17067047394812107,
"rewards/cosine_scaled_reward": -0.09675464034080505,
"rewards/format_reward": 0.0,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 1490.2679138183594,
"epoch": 0.628,
"grad_norm": 0.15652166306972504,
"kl": 0.0980224609375,
"learning_rate": 8.801784390262943e-07,
"loss": -0.0199,
"reward": -0.10874435119330883,
"reward_std": 0.15503624081611633,
"rewards/cosine_scaled_reward": -0.05437217652797699,
"rewards/format_reward": 0.0,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 1478.327392578125,
"epoch": 0.632,
"grad_norm": 0.10631345212459564,
"kl": 0.10205078125,
"learning_rate": 8.780358823396352e-07,
"loss": -0.0558,
"reward": -0.19594154134392738,
"reward_std": 0.18128735944628716,
"rewards/cosine_scaled_reward": -0.09797077253460884,
"rewards/format_reward": 0.0,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 1492.0059814453125,
"epoch": 0.636,
"grad_norm": 0.1802942007780075,
"kl": 0.1143798828125,
"learning_rate": 8.758773376468604e-07,
"loss": -0.045,
"reward": -0.17765655368566513,
"reward_std": 0.16990270093083382,
"rewards/cosine_scaled_reward": -0.08882827498018742,
"rewards/format_reward": 0.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 1490.7798156738281,
"epoch": 0.64,
"grad_norm": 0.09826915711164474,
"kl": 0.1187744140625,
"learning_rate": 8.737029101523929e-07,
"loss": -0.0474,
"reward": -0.15192299336194992,
"reward_std": 0.1335773952305317,
"rewards/cosine_scaled_reward": -0.07596149481832981,
"rewards/format_reward": 0.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 1518.3035888671875,
"epoch": 0.644,
"grad_norm": 0.14415128529071808,
"kl": 0.1134033203125,
"learning_rate": 8.715127058347614e-07,
"loss": -0.0181,
"reward": -0.16958895698189735,
"reward_std": 0.14540697447955608,
"rewards/cosine_scaled_reward": -0.08479447849094868,
"rewards/format_reward": 0.0,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 1478.0714416503906,
"epoch": 0.648,
"grad_norm": 0.13143935799598694,
"kl": 0.1153564453125,
"learning_rate": 8.693068314414344e-07,
"loss": -0.0602,
"reward": -0.17831872776150703,
"reward_std": 0.16582145914435387,
"rewards/cosine_scaled_reward": -0.08915936388075352,
"rewards/format_reward": 0.0,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 1500.261962890625,
"epoch": 0.652,
"grad_norm": 0.2650466561317444,
"kl": 0.1025390625,
"learning_rate": 8.670853944836176e-07,
"loss": -0.0444,
"reward": -0.19617021456360817,
"reward_std": 0.16974329948425293,
"rewards/cosine_scaled_reward": -0.09808510728180408,
"rewards/format_reward": 0.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 1476.3630981445312,
"epoch": 0.656,
"grad_norm": 0.1531592160463333,
"kl": 0.1103515625,
"learning_rate": 8.648485032310144e-07,
"loss": -0.07,
"reward": -0.17511003464460373,
"reward_std": 0.16592327691614628,
"rewards/cosine_scaled_reward": -0.08755501732230186,
"rewards/format_reward": 0.0,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 1491.1845703125,
"epoch": 0.66,
"grad_norm": 0.11288218945264816,
"kl": 0.117919921875,
"learning_rate": 8.625962667065487e-07,
"loss": -0.0371,
"reward": -0.19623659178614616,
"reward_std": 0.15025305189192295,
"rewards/cosine_scaled_reward": -0.09811829589307308,
"rewards/format_reward": 0.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 1496.6607055664062,
"epoch": 0.664,
"grad_norm": 0.1232253834605217,
"kl": 0.115234375,
"learning_rate": 8.603287946810513e-07,
"loss": -0.0475,
"reward": -0.19438259676098824,
"reward_std": 0.1660812869668007,
"rewards/cosine_scaled_reward": -0.09719130024313927,
"rewards/format_reward": 0.0,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 1504.577392578125,
"epoch": 0.668,
"grad_norm": 0.11001460999250412,
"kl": 0.1339111328125,
"learning_rate": 8.580461976679099e-07,
"loss": -0.0191,
"reward": -0.07667672634124756,
"reward_std": 0.14517304301261902,
"rewards/cosine_scaled_reward": -0.03833836503326893,
"rewards/format_reward": 0.0,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 1507.4345397949219,
"epoch": 0.672,
"grad_norm": 0.12665332853794098,
"kl": 0.12939453125,
"learning_rate": 8.557485869176825e-07,
"loss": -0.0262,
"reward": -0.18039095029234886,
"reward_std": 0.14454744383692741,
"rewards/cosine_scaled_reward": -0.09019547514617443,
"rewards/format_reward": 0.0,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 1505.3809814453125,
"epoch": 0.676,
"grad_norm": 0.10181339830160141,
"kl": 0.1201171875,
"learning_rate": 8.534360744126753e-07,
"loss": -0.0296,
"reward": -0.15367550402879715,
"reward_std": 0.154752716422081,
"rewards/cosine_scaled_reward": -0.07683775387704372,
"rewards/format_reward": 0.0,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 1502.2678833007812,
"epoch": 0.68,
"grad_norm": 0.08636263757944107,
"kl": 0.140380859375,
"learning_rate": 8.511087728614862e-07,
"loss": -0.0405,
"reward": -0.14581535942852497,
"reward_std": 0.1418076604604721,
"rewards/cosine_scaled_reward": -0.07290767971426249,
"rewards/format_reward": 0.0,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 1503.3154907226562,
"epoch": 0.684,
"grad_norm": 0.1574849933385849,
"kl": 0.1326904296875,
"learning_rate": 8.487667956935087e-07,
"loss": -0.0257,
"reward": -0.14666364155709743,
"reward_std": 0.14365333877503872,
"rewards/cosine_scaled_reward": -0.07333182357251644,
"rewards/format_reward": 0.0,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 1501.3690490722656,
"epoch": 0.688,
"grad_norm": 0.11679176241159439,
"kl": 0.1322021484375,
"learning_rate": 8.464102570534061e-07,
"loss": -0.0394,
"reward": -0.1798371747136116,
"reward_std": 0.1620156615972519,
"rewards/cosine_scaled_reward": -0.0899185873568058,
"rewards/format_reward": 0.0,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 1489.6845703125,
"epoch": 0.692,
"grad_norm": 0.143167644739151,
"kl": 0.1221923828125,
"learning_rate": 8.440392717955475e-07,
"loss": -0.0454,
"reward": -0.11209908872842789,
"reward_std": 0.15652650594711304,
"rewards/cosine_scaled_reward": -0.05604954622685909,
"rewards/format_reward": 0.0,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 1498.1607360839844,
"epoch": 0.696,
"grad_norm": 0.13327138125896454,
"kl": 0.1173095703125,
"learning_rate": 8.416539554784089e-07,
"loss": -0.0349,
"reward": -0.18605408817529678,
"reward_std": 0.17399531230330467,
"rewards/cosine_scaled_reward": -0.09302704595029354,
"rewards/format_reward": 0.0,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 1500.1309509277344,
"epoch": 0.7,
"grad_norm": 0.12825773656368256,
"kl": 0.1453857421875,
"learning_rate": 8.392544243589427e-07,
"loss": -0.0389,
"reward": -0.18417096138000488,
"reward_std": 0.15830742567777634,
"rewards/cosine_scaled_reward": -0.09208548441529274,
"rewards/format_reward": 0.0,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 1512.1488342285156,
"epoch": 0.704,
"grad_norm": 0.19698651134967804,
"kl": 0.12744140625,
"learning_rate": 8.368407953869103e-07,
"loss": -0.0278,
"reward": -0.1590665504336357,
"reward_std": 0.1343939360231161,
"rewards/cosine_scaled_reward": -0.0795332733541727,
"rewards/format_reward": 0.0,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 1507.4583435058594,
"epoch": 0.708,
"grad_norm": 0.11850512772798538,
"kl": 0.12353515625,
"learning_rate": 8.344131861991828e-07,
"loss": -0.0209,
"reward": -0.17914509028196335,
"reward_std": 0.14407609589397907,
"rewards/cosine_scaled_reward": -0.08957254886627197,
"rewards/format_reward": 0.0,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 1494.6785888671875,
"epoch": 0.712,
"grad_norm": 0.12900114059448242,
"kl": 0.1517333984375,
"learning_rate": 8.319717151140072e-07,
"loss": -0.0476,
"reward": -0.1623889021575451,
"reward_std": 0.1595832072198391,
"rewards/cosine_scaled_reward": -0.08119445107877254,
"rewards/format_reward": 0.0,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 1519.1309509277344,
"epoch": 0.716,
"grad_norm": 0.14418767392635345,
"kl": 0.1248779296875,
"learning_rate": 8.295165011252396e-07,
"loss": -0.0145,
"reward": -0.15135440602898598,
"reward_std": 0.12371071800589561,
"rewards/cosine_scaled_reward": -0.07567720301449299,
"rewards/format_reward": 0.0,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 1498.607177734375,
"epoch": 0.72,
"grad_norm": 0.11457547545433044,
"kl": 0.1297607421875,
"learning_rate": 8.270476638965461e-07,
"loss": -0.0282,
"reward": -0.1590440645813942,
"reward_std": 0.13810313865542412,
"rewards/cosine_scaled_reward": -0.07952203415334225,
"rewards/format_reward": 0.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 1507.702392578125,
"epoch": 0.724,
"grad_norm": 0.16079466044902802,
"kl": 0.1300048828125,
"learning_rate": 8.245653237555705e-07,
"loss": -0.0259,
"reward": -0.14657551050186157,
"reward_std": 0.12384105287492275,
"rewards/cosine_scaled_reward": -0.07328775525093079,
"rewards/format_reward": 0.0,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 1513.0595703125,
"epoch": 0.728,
"grad_norm": 0.12367437779903412,
"kl": 0.13720703125,
"learning_rate": 8.220696016880687e-07,
"loss": -0.0248,
"reward": -0.15343056619167328,
"reward_std": 0.14851927012205124,
"rewards/cosine_scaled_reward": -0.07671528309583664,
"rewards/format_reward": 0.0,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 1522.5297546386719,
"epoch": 0.732,
"grad_norm": 0.10700845718383789,
"kl": 0.128662109375,
"learning_rate": 8.195606193320136e-07,
"loss": -0.0107,
"reward": -0.14938092976808548,
"reward_std": 0.11783361062407494,
"rewards/cosine_scaled_reward": -0.07469046581536531,
"rewards/format_reward": 0.0,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 1519.9226379394531,
"epoch": 0.736,
"grad_norm": 0.13011117279529572,
"kl": 0.1298828125,
"learning_rate": 8.170384989716657e-07,
"loss": -0.0148,
"reward": -0.15839479491114616,
"reward_std": 0.12780537828803062,
"rewards/cosine_scaled_reward": -0.07919739931821823,
"rewards/format_reward": 0.0,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 1519.7797546386719,
"epoch": 0.74,
"grad_norm": 0.08933281898498535,
"kl": 0.152099609375,
"learning_rate": 8.145033635316128e-07,
"loss": -0.0141,
"reward": -0.1524551585316658,
"reward_std": 0.12427524663507938,
"rewards/cosine_scaled_reward": -0.07622758112847805,
"rewards/format_reward": 0.0,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 1498.3809509277344,
"epoch": 0.744,
"grad_norm": 0.12463698536157608,
"kl": 0.1455078125,
"learning_rate": 8.119553365707802e-07,
"loss": -0.0308,
"reward": -0.1026168204843998,
"reward_std": 0.13808677345514297,
"rewards/cosine_scaled_reward": -0.051308413967490196,
"rewards/format_reward": 0.0,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 1466.6845397949219,
"epoch": 0.748,
"grad_norm": 0.11249163746833801,
"kl": 0.13720703125,
"learning_rate": 8.093945422764069e-07,
"loss": -0.0544,
"reward": -0.16347405686974525,
"reward_std": 0.14502743259072304,
"rewards/cosine_scaled_reward": -0.08173702843487263,
"rewards/format_reward": 0.0,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 1492.2976684570312,
"epoch": 0.752,
"grad_norm": 0.08640649914741516,
"kl": 0.151123046875,
"learning_rate": 8.068211054579943e-07,
"loss": -0.0329,
"reward": -0.16077740490436554,
"reward_std": 0.15286827459931374,
"rewards/cosine_scaled_reward": -0.08038870431482792,
"rewards/format_reward": 0.0,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 1507.9643249511719,
"epoch": 0.756,
"grad_norm": 0.10966922342777252,
"kl": 0.1417236328125,
"learning_rate": 8.04235151541222e-07,
"loss": -0.0235,
"reward": -0.1736162230372429,
"reward_std": 0.1491672247648239,
"rewards/cosine_scaled_reward": -0.08680811524391174,
"rewards/format_reward": 0.0,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 1518.5833740234375,
"epoch": 0.76,
"grad_norm": 0.12140454351902008,
"kl": 0.1397705078125,
"learning_rate": 8.01636806561836e-07,
"loss": -0.0182,
"reward": -0.15859121643006802,
"reward_std": 0.13637281768023968,
"rewards/cosine_scaled_reward": -0.07929560728371143,
"rewards/format_reward": 0.0,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 1518.4167175292969,
"epoch": 0.764,
"grad_norm": 0.12270302325487137,
"kl": 0.159423828125,
"learning_rate": 7.990261971595048e-07,
"loss": -0.0061,
"reward": -0.1657138504087925,
"reward_std": 0.1241717990487814,
"rewards/cosine_scaled_reward": -0.08285692892968655,
"rewards/format_reward": 0.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 1498.9345397949219,
"epoch": 0.768,
"grad_norm": 0.12375301867723465,
"kl": 0.162353515625,
"learning_rate": 7.964034505716476e-07,
"loss": -0.0398,
"reward": -0.17186777852475643,
"reward_std": 0.16331871785223484,
"rewards/cosine_scaled_reward": -0.08593388926237822,
"rewards/format_reward": 0.0,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 1497.3452758789062,
"epoch": 0.772,
"grad_norm": 0.08711956441402435,
"kl": 0.152587890625,
"learning_rate": 7.93768694627233e-07,
"loss": -0.0287,
"reward": -0.15793642029166222,
"reward_std": 0.1320202425122261,
"rewards/cosine_scaled_reward": -0.07896821573376656,
"rewards/format_reward": 0.0,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 1517.7976379394531,
"epoch": 0.776,
"grad_norm": 0.0980345830321312,
"kl": 0.14990234375,
"learning_rate": 7.911220577405484e-07,
"loss": -0.015,
"reward": -0.15146314911544323,
"reward_std": 0.13276733830571175,
"rewards/cosine_scaled_reward": -0.07573157269507647,
"rewards/format_reward": 0.0,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 1509.5595397949219,
"epoch": 0.78,
"grad_norm": 0.19111458957195282,
"kl": 0.14404296875,
"learning_rate": 7.884636689049422e-07,
"loss": -0.027,
"reward": -0.1567194815725088,
"reward_std": 0.14454844780266285,
"rewards/cosine_scaled_reward": -0.07835974264889956,
"rewards/format_reward": 0.0,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 1503.6785888671875,
"epoch": 0.784,
"grad_norm": 0.10002721846103668,
"kl": 0.150634765625,
"learning_rate": 7.857936576865356e-07,
"loss": -0.0234,
"reward": -0.17957428470253944,
"reward_std": 0.14822976663708687,
"rewards/cosine_scaled_reward": -0.08978714607656002,
"rewards/format_reward": 0.0,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 1529.0,
"epoch": 0.788,
"grad_norm": 0.1074904203414917,
"kl": 0.15283203125,
"learning_rate": 7.831121542179086e-07,
"loss": -0.0043,
"reward": -0.135637816041708,
"reward_std": 0.10331238061189651,
"rewards/cosine_scaled_reward": -0.06781890522688627,
"rewards/format_reward": 0.0,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 1485.7738342285156,
"epoch": 0.792,
"grad_norm": 0.09117424488067627,
"kl": 0.1513671875,
"learning_rate": 7.804192891917571e-07,
"loss": -0.0363,
"reward": -0.17118510231375694,
"reward_std": 0.16877446696162224,
"rewards/cosine_scaled_reward": -0.08559254929423332,
"rewards/format_reward": 0.0,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 1508.6607055664062,
"epoch": 0.796,
"grad_norm": 0.10829849541187286,
"kl": 0.14306640625,
"learning_rate": 7.777151938545235e-07,
"loss": -0.0227,
"reward": -0.15547415241599083,
"reward_std": 0.12111644446849823,
"rewards/cosine_scaled_reward": -0.07773707807064056,
"rewards/format_reward": 0.0,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 1514.5476379394531,
"epoch": 0.8,
"grad_norm": 0.17340432107448578,
"kl": 0.1441650390625,
"learning_rate": 7.75e-07,
"loss": -0.0141,
"reward": -0.15766149759292603,
"reward_std": 0.13325241580605507,
"rewards/cosine_scaled_reward": -0.07883074693381786,
"rewards/format_reward": 0.0,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 1517.1785888671875,
"epoch": 0.804,
"grad_norm": 0.12855444848537445,
"kl": 0.158447265625,
"learning_rate": 7.72273839962904e-07,
"loss": -0.0166,
"reward": -0.14288493990898132,
"reward_std": 0.1141566876322031,
"rewards/cosine_scaled_reward": -0.07144246716052294,
"rewards/format_reward": 0.0,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 1492.6488342285156,
"epoch": 0.808,
"grad_norm": 0.09727565199136734,
"kl": 0.163818359375,
"learning_rate": 7.695368466124296e-07,
"loss": -0.046,
"reward": -0.1400277316570282,
"reward_std": 0.1363294217735529,
"rewards/cosine_scaled_reward": -0.07001386396586895,
"rewards/format_reward": 0.0,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 1502.5416564941406,
"epoch": 0.812,
"grad_norm": 0.1032966673374176,
"kl": 0.166015625,
"learning_rate": 7.667891533457718e-07,
"loss": -0.0224,
"reward": -0.1764848232269287,
"reward_std": 0.15339024364948273,
"rewards/cosine_scaled_reward": -0.08824240230023861,
"rewards/format_reward": 0.0,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 1516.7202453613281,
"epoch": 0.816,
"grad_norm": 0.15399117767810822,
"kl": 0.175537109375,
"learning_rate": 7.640308940816239e-07,
"loss": -0.0163,
"reward": -0.14551853574812412,
"reward_std": 0.14542756974697113,
"rewards/cosine_scaled_reward": -0.07275926228612661,
"rewards/format_reward": 0.0,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 1509.761962890625,
"epoch": 0.82,
"grad_norm": 0.0828375294804573,
"kl": 0.165283203125,
"learning_rate": 7.612622032536507e-07,
"loss": -0.0178,
"reward": -0.15090281143784523,
"reward_std": 0.13103190064430237,
"rewards/cosine_scaled_reward": -0.07545140571892262,
"rewards/format_reward": 0.0,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 1530.4583435058594,
"epoch": 0.824,
"grad_norm": 0.09723107516765594,
"kl": 0.156005859375,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0019,
"reward": -0.13943813741207123,
"reward_std": 0.12253955751657486,
"rewards/cosine_scaled_reward": -0.06971907056868076,
"rewards/format_reward": 0.0,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 1498.8928833007812,
"epoch": 0.828,
"grad_norm": 0.10988614708185196,
"kl": 0.16162109375,
"learning_rate": 7.556940671764124e-07,
"loss": -0.0272,
"reward": -0.15701762214303017,
"reward_std": 0.1170128583908081,
"rewards/cosine_scaled_reward": -0.07850880734622478,
"rewards/format_reward": 0.0,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 1525.5952453613281,
"epoch": 0.832,
"grad_norm": 0.11009800434112549,
"kl": 0.171142578125,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0156,
"reward": -0.14172504469752312,
"reward_std": 0.11965266987681389,
"rewards/cosine_scaled_reward": -0.07086252421140671,
"rewards/format_reward": 0.0,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 1520.1011962890625,
"epoch": 0.836,
"grad_norm": 0.15401668846607208,
"kl": 0.16455078125,
"learning_rate": 7.500858306332172e-07,
"loss": -0.0106,
"reward": -0.06414606049656868,
"reward_std": 0.12946532107889652,
"rewards/cosine_scaled_reward": -0.032073031179606915,
"rewards/format_reward": 0.0,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 1522.232177734375,
"epoch": 0.84,
"grad_norm": 0.12647868692874908,
"kl": 0.160400390625,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0009,
"reward": -0.12890829890966415,
"reward_std": 0.1113775297999382,
"rewards/cosine_scaled_reward": -0.06445414572954178,
"rewards/format_reward": 0.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 1495.9405212402344,
"epoch": 0.844,
"grad_norm": 0.09751134365797043,
"kl": 0.1451416015625,
"learning_rate": 7.444385869608921e-07,
"loss": -0.0263,
"reward": -0.15089180506765842,
"reward_std": 0.12495366670191288,
"rewards/cosine_scaled_reward": -0.07544590625911951,
"rewards/format_reward": 0.0,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 1517.6845397949219,
"epoch": 0.848,
"grad_norm": 0.10632960498332977,
"kl": 0.172607421875,
"learning_rate": 7.416006812042827e-07,
"loss": 0.0114,
"reward": -0.12592120468616486,
"reward_std": 0.10658309236168861,
"rewards/cosine_scaled_reward": -0.06296060606837273,
"rewards/format_reward": 0.0,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 1516.1726379394531,
"epoch": 0.852,
"grad_norm": 0.08701854944229126,
"kl": 0.173095703125,
"learning_rate": 7.387534371007797e-07,
"loss": -0.0094,
"reward": -0.04494331777095795,
"reward_std": 0.10984733514487743,
"rewards/cosine_scaled_reward": -0.022471658885478973,
"rewards/format_reward": 0.0,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 1514.8392944335938,
"epoch": 0.856,
"grad_norm": 0.10350044071674347,
"kl": 0.1650390625,
"learning_rate": 7.358969934210438e-07,
"loss": -0.0193,
"reward": -0.15005925297737122,
"reward_std": 0.1357622630894184,
"rewards/cosine_scaled_reward": -0.07502962276339531,
"rewards/format_reward": 0.0,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 1500.7738342285156,
"epoch": 0.86,
"grad_norm": 0.08610279858112335,
"kl": 0.17724609375,
"learning_rate": 7.330314893841101e-07,
"loss": -0.0337,
"reward": -0.06414718553423882,
"reward_std": 0.1431224588304758,
"rewards/cosine_scaled_reward": -0.03207359462976456,
"rewards/format_reward": 0.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 1495.6667175292969,
"epoch": 0.864,
"grad_norm": 0.0906975269317627,
"kl": 0.17041015625,
"learning_rate": 7.301570646506027e-07,
"loss": -0.0444,
"reward": -0.14620519801974297,
"reward_std": 0.1489107757806778,
"rewards/cosine_scaled_reward": -0.07310259714722633,
"rewards/format_reward": 0.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 1501.3810119628906,
"epoch": 0.868,
"grad_norm": 0.08579116314649582,
"kl": 0.16357421875,
"learning_rate": 7.27273859315928e-07,
"loss": -0.0184,
"reward": -0.1731514111161232,
"reward_std": 0.16837183013558388,
"rewards/cosine_scaled_reward": -0.0865757018327713,
"rewards/format_reward": 0.0,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 1524.6488342285156,
"epoch": 0.872,
"grad_norm": 0.0806507021188736,
"kl": 0.1669921875,
"learning_rate": 7.243820139034464e-07,
"loss": -0.0038,
"reward": -0.12116867862641811,
"reward_std": 0.10355196706950665,
"rewards/cosine_scaled_reward": -0.060584343038499355,
"rewards/format_reward": 0.0,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 1489.3809814453125,
"epoch": 0.876,
"grad_norm": 0.07953932136297226,
"kl": 0.18017578125,
"learning_rate": 7.214816693576234e-07,
"loss": -0.037,
"reward": -0.15147988684475422,
"reward_std": 0.13851544074714184,
"rewards/cosine_scaled_reward": -0.07573994528502226,
"rewards/format_reward": 0.0,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 1519.6071472167969,
"epoch": 0.88,
"grad_norm": 0.09334682673215866,
"kl": 0.161376953125,
"learning_rate": 7.185729670371604e-07,
"loss": -0.0124,
"reward": -0.14643241465091705,
"reward_std": 0.12924424931406975,
"rewards/cosine_scaled_reward": -0.07321620732545853,
"rewards/format_reward": 0.0,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 1530.5357360839844,
"epoch": 0.884,
"grad_norm": 0.15875208377838135,
"kl": 0.1796875,
"learning_rate": 7.156560487081051e-07,
"loss": -0.0004,
"reward": -0.12105057947337627,
"reward_std": 0.11511022225022316,
"rewards/cosine_scaled_reward": -0.060525291599333286,
"rewards/format_reward": 0.0,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 1471.8214721679688,
"epoch": 0.888,
"grad_norm": 0.09343700110912323,
"kl": 0.166015625,
"learning_rate": 7.127310565369415e-07,
"loss": -0.0241,
"reward": -0.1482422910630703,
"reward_std": 0.13543636724352837,
"rewards/cosine_scaled_reward": -0.07412114553153515,
"rewards/format_reward": 0.0,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 1513.8214416503906,
"epoch": 0.892,
"grad_norm": 0.12307551503181458,
"kl": 0.165771484375,
"learning_rate": 7.097981330836616e-07,
"loss": -0.0248,
"reward": -0.12069513462483883,
"reward_std": 0.13596191070973873,
"rewards/cosine_scaled_reward": -0.060347567312419415,
"rewards/format_reward": 0.0,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 0.896,
"grad_norm": 0.1183975487947464,
"kl": 0.153564453125,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0062,
"reward": -0.14491389319300652,
"reward_std": 0.1196140144020319,
"rewards/cosine_scaled_reward": -0.07245695032179356,
"rewards/format_reward": 0.0,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 1497.4583435058594,
"epoch": 0.9,
"grad_norm": 0.0833095982670784,
"kl": 0.1728515625,
"learning_rate": 7.039090644965509e-07,
"loss": -0.0371,
"reward": -0.13499305956065655,
"reward_std": 0.12412836588919163,
"rewards/cosine_scaled_reward": -0.06749652978032827,
"rewards/format_reward": 0.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 1496.6845397949219,
"epoch": 0.904,
"grad_norm": 0.08985927700996399,
"kl": 0.16650390625,
"learning_rate": 7.009532063876148e-07,
"loss": -0.036,
"reward": -0.13741241209208965,
"reward_std": 0.13803981989622116,
"rewards/cosine_scaled_reward": -0.06870621163398027,
"rewards/format_reward": 0.0,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 1524.4047546386719,
"epoch": 0.908,
"grad_norm": 0.0919218361377716,
"kl": 0.16455078125,
"learning_rate": 6.979899910323624e-07,
"loss": -0.0092,
"reward": -0.1429591029882431,
"reward_std": 0.12002728693187237,
"rewards/cosine_scaled_reward": -0.0714795533567667,
"rewards/format_reward": 0.0,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 1503.6607360839844,
"epoch": 0.912,
"grad_norm": 0.11030226945877075,
"kl": 0.1650390625,
"learning_rate": 6.950195628537299e-07,
"loss": -0.0386,
"reward": -0.13499593548476696,
"reward_std": 0.13771596364676952,
"rewards/cosine_scaled_reward": -0.06749796774238348,
"rewards/format_reward": 0.0,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 1488.8214416503906,
"epoch": 0.916,
"grad_norm": 0.09756813943386078,
"kl": 0.1806640625,
"learning_rate": 6.920420666261961e-07,
"loss": -0.0537,
"reward": -0.13239642046391964,
"reward_std": 0.14560103975236416,
"rewards/cosine_scaled_reward": -0.06619821395725012,
"rewards/format_reward": 0.0,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 1520.875,
"epoch": 0.92,
"grad_norm": 0.13375338912010193,
"kl": 0.183349609375,
"learning_rate": 6.890576474687263e-07,
"loss": -0.0143,
"reward": -0.11103706806898117,
"reward_std": 0.11034407652914524,
"rewards/cosine_scaled_reward": -0.05551853682845831,
"rewards/format_reward": 0.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 1504.6369323730469,
"epoch": 0.924,
"grad_norm": 0.09202142059803009,
"kl": 0.14892578125,
"learning_rate": 6.860664508377001e-07,
"loss": -0.0339,
"reward": -0.14039192162454128,
"reward_std": 0.13506866246461868,
"rewards/cosine_scaled_reward": -0.07019595894962549,
"rewards/format_reward": 0.0,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 1520.5476379394531,
"epoch": 0.928,
"grad_norm": 0.098211869597435,
"kl": 0.1650390625,
"learning_rate": 6.83068622519821e-07,
"loss": -0.0127,
"reward": -0.12785408459603786,
"reward_std": 0.12025357596576214,
"rewards/cosine_scaled_reward": -0.06392704229801893,
"rewards/format_reward": 0.0,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 1519.5000305175781,
"epoch": 0.932,
"grad_norm": 0.1020435318350792,
"kl": 0.172119140625,
"learning_rate": 6.800643086250121e-07,
"loss": -0.0108,
"reward": -0.11733199469745159,
"reward_std": 0.09876838512718678,
"rewards/cosine_scaled_reward": -0.058665999211370945,
"rewards/format_reward": 0.0,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 1514.2500305175781,
"epoch": 0.936,
"grad_norm": 0.09378773719072342,
"kl": 0.154541015625,
"learning_rate": 6.770536555792944e-07,
"loss": -0.005,
"reward": -0.12837751768529415,
"reward_std": 0.12461850047111511,
"rewards/cosine_scaled_reward": -0.06418875977396965,
"rewards/format_reward": 0.0,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 1513.8750305175781,
"epoch": 0.94,
"grad_norm": 0.1036522388458252,
"kl": 0.1611328125,
"learning_rate": 6.740368101176495e-07,
"loss": -0.0073,
"reward": -0.12727641500532627,
"reward_std": 0.11483397521078587,
"rewards/cosine_scaled_reward": -0.06363820657134056,
"rewards/format_reward": 0.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 1522.732177734375,
"epoch": 0.944,
"grad_norm": 0.12251052260398865,
"kl": 0.1640625,
"learning_rate": 6.710139192768694e-07,
"loss": -0.0118,
"reward": -0.11951213330030441,
"reward_std": 0.1262509860098362,
"rewards/cosine_scaled_reward": -0.059756068512797356,
"rewards/format_reward": 0.0,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 1509.75,
"epoch": 0.948,
"grad_norm": 0.17989245057106018,
"kl": 0.156982421875,
"learning_rate": 6.679851303883891e-07,
"loss": -0.0289,
"reward": -0.13234319165349007,
"reward_std": 0.13814006373286247,
"rewards/cosine_scaled_reward": -0.06617159210145473,
"rewards/format_reward": 0.0,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 1512.8333435058594,
"epoch": 0.952,
"grad_norm": 0.12887263298034668,
"kl": 0.1552734375,
"learning_rate": 6.649505910711058e-07,
"loss": -0.0163,
"reward": -0.13513639941811562,
"reward_std": 0.1341167613863945,
"rewards/cosine_scaled_reward": -0.06756819784641266,
"rewards/format_reward": 0.0,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 1507.7202453613281,
"epoch": 0.956,
"grad_norm": 0.10920631885528564,
"kl": 0.163330078125,
"learning_rate": 6.619104492241847e-07,
"loss": -0.0208,
"reward": -0.11296023428440094,
"reward_std": 0.10206396505236626,
"rewards/cosine_scaled_reward": -0.056480118073523045,
"rewards/format_reward": 0.0,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 1501.2917175292969,
"epoch": 0.96,
"grad_norm": 0.10691919177770615,
"kl": 0.173828125,
"learning_rate": 6.588648530198504e-07,
"loss": -0.0251,
"reward": -0.12773141264915466,
"reward_std": 0.12681871838867664,
"rewards/cosine_scaled_reward": -0.06386570446193218,
"rewards/format_reward": 0.0,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 1508.9940795898438,
"epoch": 0.964,
"grad_norm": 0.12696890532970428,
"kl": 0.1640625,
"learning_rate": 6.558139508961654e-07,
"loss": -0.0235,
"reward": -0.12828433699905872,
"reward_std": 0.12480376102030277,
"rewards/cosine_scaled_reward": -0.06414216570556164,
"rewards/format_reward": 0.0,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 1528.40478515625,
"epoch": 0.968,
"grad_norm": 0.19038116931915283,
"kl": 0.166015625,
"learning_rate": 6.527578915497951e-07,
"loss": -0.0025,
"reward": -0.12644540891051292,
"reward_std": 0.11273947730660439,
"rewards/cosine_scaled_reward": -0.06322270352393389,
"rewards/format_reward": 0.0,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 1502.4404907226562,
"epoch": 0.972,
"grad_norm": 0.1264588087797165,
"kl": 0.1728515625,
"learning_rate": 6.496968239287603e-07,
"loss": -0.0168,
"reward": -0.13079960085451603,
"reward_std": 0.10997281037271023,
"rewards/cosine_scaled_reward": -0.06539979483932257,
"rewards/format_reward": 0.0,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 1514.0119323730469,
"epoch": 0.976,
"grad_norm": 0.08865034580230713,
"kl": 0.177734375,
"learning_rate": 6.466308972251785e-07,
"loss": -0.0154,
"reward": -0.13137424178421497,
"reward_std": 0.1251276545226574,
"rewards/cosine_scaled_reward": -0.06568712089210749,
"rewards/format_reward": 0.0,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 1482.8036193847656,
"epoch": 0.98,
"grad_norm": 0.11429865658283234,
"kl": 0.1796875,
"learning_rate": 6.435602608679916e-07,
"loss": -0.0297,
"reward": -0.13589460216462612,
"reward_std": 0.1405064184218645,
"rewards/cosine_scaled_reward": -0.06794730294495821,
"rewards/format_reward": 0.0,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 1502.5178833007812,
"epoch": 0.984,
"grad_norm": 0.06874290853738785,
"kl": 0.1602783203125,
"learning_rate": 6.404850645156841e-07,
"loss": -0.0348,
"reward": -0.0405126977711916,
"reward_std": 0.10752722714096308,
"rewards/cosine_scaled_reward": -0.020256347954273224,
"rewards/format_reward": 0.0,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 1509.2678833007812,
"epoch": 0.988,
"grad_norm": 0.1023624986410141,
"kl": 0.16748046875,
"learning_rate": 6.374054580489873e-07,
"loss": -0.0321,
"reward": -0.13477133214473724,
"reward_std": 0.13717731088399887,
"rewards/cosine_scaled_reward": -0.06738566607236862,
"rewards/format_reward": 0.0,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 1487.4464416503906,
"epoch": 0.992,
"grad_norm": 0.09231995791196823,
"kl": 0.17333984375,
"learning_rate": 6.343215915635761e-07,
"loss": -0.0601,
"reward": -0.12894214503467083,
"reward_std": 0.1458846628665924,
"rewards/cosine_scaled_reward": -0.06447107251733541,
"rewards/format_reward": 0.0,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 1510.107177734375,
"epoch": 0.996,
"grad_norm": 0.08529460430145264,
"kl": 0.171630859375,
"learning_rate": 6.31233615362752e-07,
"loss": -0.0203,
"reward": -0.11800242215394974,
"reward_std": 0.12636969611048698,
"rewards/cosine_scaled_reward": -0.059001206420361996,
"rewards/format_reward": 0.0,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 1524.5001220703125,
"epoch": 1.0,
"grad_norm": 0.08003545552492142,
"kl": 0.176513671875,
"learning_rate": 6.281416799501187e-07,
"loss": -0.0263,
"reward": -0.11569427512586117,
"reward_std": 0.10093419812619686,
"rewards/cosine_scaled_reward": -0.05784713663160801,
"rewards/format_reward": 0.0,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 1513.452392578125,
"epoch": 1.004,
"grad_norm": 0.0908510684967041,
"kl": 0.1728515625,
"learning_rate": 6.25045936022246e-07,
"loss": -0.0223,
"reward": -0.10760886035859585,
"reward_std": 0.10886831395328045,
"rewards/cosine_scaled_reward": -0.053804428316652775,
"rewards/format_reward": 0.0,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 1516.6250305175781,
"epoch": 1.008,
"grad_norm": 0.12224633246660233,
"kl": 0.18212890625,
"learning_rate": 6.219465344613258e-07,
"loss": -0.0196,
"reward": -0.12489514984190464,
"reward_std": 0.12947135604918003,
"rewards/cosine_scaled_reward": -0.062447573989629745,
"rewards/format_reward": 0.0,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 1504.5654907226562,
"epoch": 1.012,
"grad_norm": 0.13140302896499634,
"kl": 0.1845703125,
"learning_rate": 6.188436263278172e-07,
"loss": -0.0149,
"reward": -0.12439864501357079,
"reward_std": 0.12002300284802914,
"rewards/cosine_scaled_reward": -0.06219932623207569,
"rewards/format_reward": 0.0,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 1510.0357360839844,
"epoch": 1.016,
"grad_norm": 0.11975951492786407,
"kl": 0.17724609375,
"learning_rate": 6.157373628530852e-07,
"loss": -0.0206,
"reward": -0.1344124898314476,
"reward_std": 0.12755578383803368,
"rewards/cosine_scaled_reward": -0.0672062449157238,
"rewards/format_reward": 0.0,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 1510.5,
"epoch": 1.02,
"grad_norm": 0.12696978449821472,
"kl": 0.18701171875,
"learning_rate": 6.126278954320294e-07,
"loss": -0.0138,
"reward": -0.13142068311572075,
"reward_std": 0.13189822621643543,
"rewards/cosine_scaled_reward": -0.0657103369012475,
"rewards/format_reward": 0.0,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 1505.4642944335938,
"epoch": 1.024,
"grad_norm": 0.11200578510761261,
"kl": 0.177001953125,
"learning_rate": 6.095153756157051e-07,
"loss": -0.036,
"reward": -0.12037310004234314,
"reward_std": 0.11811047606170177,
"rewards/cosine_scaled_reward": -0.06018654815852642,
"rewards/format_reward": 0.0,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 1499.672607421875,
"epoch": 1.028,
"grad_norm": 0.1471128910779953,
"kl": 0.172607421875,
"learning_rate": 6.06399955103937e-07,
"loss": -0.0249,
"reward": -0.12174288742244244,
"reward_std": 0.1309817060828209,
"rewards/cosine_scaled_reward": -0.06087144184857607,
"rewards/format_reward": 0.0,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 1520.2559814453125,
"epoch": 1.032,
"grad_norm": 0.10926475375890732,
"kl": 0.170166015625,
"learning_rate": 6.032817857379256e-07,
"loss": -0.0154,
"reward": -0.13306790590286255,
"reward_std": 0.1421681884676218,
"rewards/cosine_scaled_reward": -0.06653395667672157,
"rewards/format_reward": 0.0,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 1523.1666870117188,
"epoch": 1.036,
"grad_norm": 0.15364930033683777,
"kl": 0.1787109375,
"learning_rate": 6.001610194928464e-07,
"loss": -0.0083,
"reward": -0.11868332512676716,
"reward_std": 0.11832969635725021,
"rewards/cosine_scaled_reward": -0.059341663494706154,
"rewards/format_reward": 0.0,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 1504.9583435058594,
"epoch": 1.04,
"grad_norm": 0.10218587517738342,
"kl": 0.163330078125,
"learning_rate": 5.97037808470444e-07,
"loss": -0.0278,
"reward": -0.12172269076108932,
"reward_std": 0.120206318795681,
"rewards/cosine_scaled_reward": -0.06086134351789951,
"rewards/format_reward": 0.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 1503.9880981445312,
"epoch": 1.044,
"grad_norm": 0.11966069042682648,
"kl": 0.16552734375,
"learning_rate": 5.939123048916173e-07,
"loss": -0.0226,
"reward": -0.11986922658979893,
"reward_std": 0.1164070088416338,
"rewards/cosine_scaled_reward": -0.05993461608886719,
"rewards/format_reward": 0.0,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 1498.857177734375,
"epoch": 1.048,
"grad_norm": 0.11268185824155807,
"kl": 0.161865234375,
"learning_rate": 5.907846610890011e-07,
"loss": -0.037,
"reward": -0.11277966573834419,
"reward_std": 0.11542832851409912,
"rewards/cosine_scaled_reward": -0.05638983380049467,
"rewards/format_reward": 0.0,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 1522.702392578125,
"epoch": 1.052,
"grad_norm": 0.1593422144651413,
"kl": 0.1494140625,
"learning_rate": 5.87655029499542e-07,
"loss": -0.0086,
"reward": -0.11897107027471066,
"reward_std": 0.12276079133152962,
"rewards/cosine_scaled_reward": -0.0594855360686779,
"rewards/format_reward": 0.0,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 1520.1904907226562,
"epoch": 1.056,
"grad_norm": 0.09266054630279541,
"kl": 0.1494140625,
"learning_rate": 5.845235626570683e-07,
"loss": -0.0117,
"reward": -0.10893694311380386,
"reward_std": 0.11326288990676403,
"rewards/cosine_scaled_reward": -0.05446847062557936,
"rewards/format_reward": 0.0,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 1499.922607421875,
"epoch": 1.06,
"grad_norm": 0.10393787920475006,
"kl": 0.157470703125,
"learning_rate": 5.813904131848564e-07,
"loss": -0.0333,
"reward": -0.12131388112902641,
"reward_std": 0.1239698026329279,
"rewards/cosine_scaled_reward": -0.06065694149583578,
"rewards/format_reward": 0.0,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 1513.3511962890625,
"epoch": 1.064,
"grad_norm": 0.0964948907494545,
"kl": 0.164794921875,
"learning_rate": 5.78255733788191e-07,
"loss": -0.0206,
"reward": -0.10519931092858315,
"reward_std": 0.11078912951052189,
"rewards/cosine_scaled_reward": -0.05259965639561415,
"rewards/format_reward": 0.0,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 1510.2798156738281,
"epoch": 1.068,
"grad_norm": 0.1041577160358429,
"kl": 0.167724609375,
"learning_rate": 5.751196772469237e-07,
"loss": -0.028,
"reward": -0.11491566710174084,
"reward_std": 0.13411646336317062,
"rewards/cosine_scaled_reward": -0.05745783355087042,
"rewards/format_reward": 0.0,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 1527.4285888671875,
"epoch": 1.072,
"grad_norm": 0.12076977640390396,
"kl": 0.172119140625,
"learning_rate": 5.71982396408026e-07,
"loss": -0.0013,
"reward": -0.09519998729228973,
"reward_std": 0.09795477986335754,
"rewards/cosine_scaled_reward": -0.04759999457746744,
"rewards/format_reward": 0.0,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 1502.1964721679688,
"epoch": 1.076,
"grad_norm": 0.15113261342048645,
"kl": 0.16552734375,
"learning_rate": 5.688440441781398e-07,
"loss": -0.0375,
"reward": -0.11081114411354065,
"reward_std": 0.12273510918021202,
"rewards/cosine_scaled_reward": -0.05540557578206062,
"rewards/format_reward": 0.0,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.08,
"grad_norm": 0.14382889866828918,
"kl": 0.154296875,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0062,
"reward": -0.09718998149037361,
"reward_std": 0.09087535366415977,
"rewards/cosine_scaled_reward": -0.04859498701989651,
"rewards/format_reward": 0.0,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 1500.0,
"epoch": 1.084,
"grad_norm": 0.16920308768749237,
"kl": 0.1630859375,
"learning_rate": 5.625647374256061e-07,
"loss": -0.021,
"reward": -0.1035971287637949,
"reward_std": 0.11180637776851654,
"rewards/cosine_scaled_reward": -0.051798563450574875,
"rewards/format_reward": 0.0,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 1490.7143249511719,
"epoch": 1.088,
"grad_norm": 0.13314908742904663,
"kl": 0.157470703125,
"learning_rate": 5.594240889475106e-07,
"loss": -0.037,
"reward": -0.11400851234793663,
"reward_std": 0.12938306666910648,
"rewards/cosine_scaled_reward": -0.05700425896793604,
"rewards/format_reward": 0.0,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 1504.9464721679688,
"epoch": 1.092,
"grad_norm": 0.07967788726091385,
"kl": 0.166748046875,
"learning_rate": 5.562829811526154e-07,
"loss": -0.0369,
"reward": -0.10690408386290073,
"reward_std": 0.12804840318858624,
"rewards/cosine_scaled_reward": -0.053452043794095516,
"rewards/format_reward": 0.0,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 1529.7440795898438,
"epoch": 1.096,
"grad_norm": 0.09101837128400803,
"kl": 0.1650390625,
"learning_rate": 5.531415671340826e-07,
"loss": -0.0001,
"reward": -0.10337771661579609,
"reward_std": 0.10489914752542973,
"rewards/cosine_scaled_reward": -0.051688858307898045,
"rewards/format_reward": 0.0,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 1527.9404907226562,
"epoch": 1.1,
"grad_norm": 0.16794751584529877,
"kl": 0.168701171875,
"learning_rate": 5.5e-07,
"loss": -0.0041,
"reward": -0.08096367586404085,
"reward_std": 0.08801300823688507,
"rewards/cosine_scaled_reward": -0.040481837932020426,
"rewards/format_reward": 0.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 1512.3274230957031,
"epoch": 1.104,
"grad_norm": 0.0922553613781929,
"kl": 0.161376953125,
"learning_rate": 5.468584328659172e-07,
"loss": -0.0232,
"reward": -0.1129224356263876,
"reward_std": 0.12950855493545532,
"rewards/cosine_scaled_reward": -0.056461221538484097,
"rewards/format_reward": 0.0,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 1504.65478515625,
"epoch": 1.108,
"grad_norm": 0.1329527050256729,
"kl": 0.167236328125,
"learning_rate": 5.437170188473847e-07,
"loss": -0.0302,
"reward": -0.11213578283786774,
"reward_std": 0.11713657341897488,
"rewards/cosine_scaled_reward": -0.05606789421290159,
"rewards/format_reward": 0.0,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 1506.0,
"epoch": 1.112,
"grad_norm": 0.10364029556512833,
"kl": 0.163818359375,
"learning_rate": 5.405759110524894e-07,
"loss": -0.0298,
"reward": -0.10770672746002674,
"reward_std": 0.11716131307184696,
"rewards/cosine_scaled_reward": -0.053853364661335945,
"rewards/format_reward": 0.0,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 1474.4702453613281,
"epoch": 1.116,
"grad_norm": 0.1940528154373169,
"kl": 0.17138671875,
"learning_rate": 5.37435262574394e-07,
"loss": -0.0549,
"reward": -0.06423771567642689,
"reward_std": 0.14231404848396778,
"rewards/cosine_scaled_reward": -0.03211886156350374,
"rewards/format_reward": 0.0,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 1515.9821472167969,
"epoch": 1.12,
"grad_norm": 0.09174785017967224,
"kl": 0.165283203125,
"learning_rate": 5.342952264838747e-07,
"loss": -0.0167,
"reward": -0.12564732693135738,
"reward_std": 0.12320295721292496,
"rewards/cosine_scaled_reward": -0.06282366160303354,
"rewards/format_reward": 0.0,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 1525.8214416503906,
"epoch": 1.124,
"grad_norm": 0.11484113335609436,
"kl": 0.16650390625,
"learning_rate": 5.311559558218603e-07,
"loss": -0.0015,
"reward": -0.09635571762919426,
"reward_std": 0.09338909015059471,
"rewards/cosine_scaled_reward": -0.048177859745919704,
"rewards/format_reward": 0.0,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 1502.0714721679688,
"epoch": 1.1280000000000001,
"grad_norm": 0.1825956404209137,
"kl": 0.158203125,
"learning_rate": 5.28017603591974e-07,
"loss": -0.0234,
"reward": -0.11337108165025711,
"reward_std": 0.1339975707232952,
"rewards/cosine_scaled_reward": -0.05668553803116083,
"rewards/format_reward": 0.0,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 1506.6964416503906,
"epoch": 1.1320000000000001,
"grad_norm": 0.1683872640132904,
"kl": 0.185546875,
"learning_rate": 5.248803227530763e-07,
"loss": -0.0202,
"reward": -0.1038710568100214,
"reward_std": 0.1097688339650631,
"rewards/cosine_scaled_reward": -0.051935529336333275,
"rewards/format_reward": 0.0,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 1528.0595397949219,
"epoch": 1.1360000000000001,
"grad_norm": 0.10006389766931534,
"kl": 0.1591796875,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0012,
"reward": -0.12082774937152863,
"reward_std": 0.11856056936085224,
"rewards/cosine_scaled_reward": -0.06041387468576431,
"rewards/format_reward": 0.0,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 1505.7916870117188,
"epoch": 1.1400000000000001,
"grad_norm": 0.1140761449933052,
"kl": 0.156982421875,
"learning_rate": 5.186095868151436e-07,
"loss": -0.035,
"reward": -0.11102944053709507,
"reward_std": 0.143316388130188,
"rewards/cosine_scaled_reward": -0.05551472119987011,
"rewards/format_reward": 0.0,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 1529.6309509277344,
"epoch": 1.144,
"grad_norm": 0.09991593658924103,
"kl": 0.158447265625,
"learning_rate": 5.154764373429315e-07,
"loss": -0.0016,
"reward": -0.11252638325095177,
"reward_std": 0.11945481784641743,
"rewards/cosine_scaled_reward": -0.05626319348812103,
"rewards/format_reward": 0.0,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 1520.4821472167969,
"epoch": 1.148,
"grad_norm": 0.12035401910543442,
"kl": 0.17529296875,
"learning_rate": 5.123449705004581e-07,
"loss": -0.0112,
"reward": -0.1297306139022112,
"reward_std": 0.13188758678734303,
"rewards/cosine_scaled_reward": -0.0648653069511056,
"rewards/format_reward": 0.0,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 1514.7083740234375,
"epoch": 1.152,
"grad_norm": 0.11799659579992294,
"kl": 0.169189453125,
"learning_rate": 5.09215338910999e-07,
"loss": -0.0217,
"reward": -0.10410146042704582,
"reward_std": 0.10413151048123837,
"rewards/cosine_scaled_reward": -0.05205072835087776,
"rewards/format_reward": 0.0,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 1496.8512268066406,
"epoch": 1.156,
"grad_norm": 0.1316956877708435,
"kl": 0.15869140625,
"learning_rate": 5.060876951083828e-07,
"loss": -0.0207,
"reward": -0.11279321648180485,
"reward_std": 0.11689519137144089,
"rewards/cosine_scaled_reward": -0.05639660730957985,
"rewards/format_reward": 0.0,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 1513.6369323730469,
"epoch": 1.16,
"grad_norm": 0.07206040620803833,
"kl": 0.165283203125,
"learning_rate": 5.02962191529556e-07,
"loss": -0.0101,
"reward": -0.11338524892926216,
"reward_std": 0.10832424648106098,
"rewards/cosine_scaled_reward": -0.05669262260198593,
"rewards/format_reward": 0.0,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 1514.3511962890625,
"epoch": 1.164,
"grad_norm": 0.20136146247386932,
"kl": 0.164794921875,
"learning_rate": 4.998389805071536e-07,
"loss": -0.0202,
"reward": -0.12464358657598495,
"reward_std": 0.10932311788201332,
"rewards/cosine_scaled_reward": -0.06232179142534733,
"rewards/format_reward": 0.0,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 1510.1547546386719,
"epoch": 1.168,
"grad_norm": 0.10925502330064774,
"kl": 0.185791015625,
"learning_rate": 4.967182142620745e-07,
"loss": 0.003,
"reward": -0.09491665475070477,
"reward_std": 0.08971338160336018,
"rewards/cosine_scaled_reward": -0.04745833110064268,
"rewards/format_reward": 0.0,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 1501.0535888671875,
"epoch": 1.172,
"grad_norm": 0.09527327865362167,
"kl": 0.173828125,
"learning_rate": 4.93600044896063e-07,
"loss": -0.0318,
"reward": -0.13249907828867435,
"reward_std": 0.1209456454962492,
"rewards/cosine_scaled_reward": -0.06624954286962748,
"rewards/format_reward": 0.0,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 1516.3869323730469,
"epoch": 1.176,
"grad_norm": 0.13041535019874573,
"kl": 0.188720703125,
"learning_rate": 4.904846243842949e-07,
"loss": -0.015,
"reward": -0.11728105135262012,
"reward_std": 0.11388706415891647,
"rewards/cosine_scaled_reward": -0.058640528470277786,
"rewards/format_reward": 0.0,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 1516.1488342285156,
"epoch": 1.18,
"grad_norm": 0.13715095818042755,
"kl": 0.184814453125,
"learning_rate": 4.873721045679706e-07,
"loss": -0.0191,
"reward": -0.10163358226418495,
"reward_std": 0.10592619515955448,
"rewards/cosine_scaled_reward": -0.05081678926944733,
"rewards/format_reward": 0.0,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 1517.1548156738281,
"epoch": 1.184,
"grad_norm": 0.10485463589429855,
"kl": 0.172119140625,
"learning_rate": 4.842626371469149e-07,
"loss": -0.0183,
"reward": -0.11159243248403072,
"reward_std": 0.11549564823508263,
"rewards/cosine_scaled_reward": -0.05579621531069279,
"rewards/format_reward": 0.0,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 1505.4702453613281,
"epoch": 1.188,
"grad_norm": 0.16054122149944305,
"kl": 0.19091796875,
"learning_rate": 4.811563736721829e-07,
"loss": -0.0199,
"reward": -0.10421715676784515,
"reward_std": 0.10152514837682247,
"rewards/cosine_scaled_reward": -0.05210857838392258,
"rewards/format_reward": 0.0,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 1510.8630981445312,
"epoch": 1.192,
"grad_norm": 0.1103135496377945,
"kl": 0.167236328125,
"learning_rate": 4.780534655386743e-07,
"loss": -0.0185,
"reward": -0.11138204857707024,
"reward_std": 0.12202793546020985,
"rewards/cosine_scaled_reward": -0.05569102708250284,
"rewards/format_reward": 0.0,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 1498.1905212402344,
"epoch": 1.196,
"grad_norm": 0.13167892396450043,
"kl": 0.165283203125,
"learning_rate": 4.749540639777539e-07,
"loss": -0.0422,
"reward": -0.12753658182919025,
"reward_std": 0.15048057585954666,
"rewards/cosine_scaled_reward": -0.06376829091459513,
"rewards/format_reward": 0.0,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 1504.3333740234375,
"epoch": 1.2,
"grad_norm": 0.11807087808847427,
"kl": 0.1787109375,
"learning_rate": 4.7185832004988133e-07,
"loss": -0.0233,
"reward": -0.10466483794152737,
"reward_std": 0.09747852385044098,
"rewards/cosine_scaled_reward": -0.05233241897076368,
"rewards/format_reward": 0.0,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 1504.6369018554688,
"epoch": 1.204,
"grad_norm": 0.10298826545476913,
"kl": 0.17041015625,
"learning_rate": 4.68766384637248e-07,
"loss": -0.0369,
"reward": -0.030609130859375,
"reward_std": 0.1283973567187786,
"rewards/cosine_scaled_reward": -0.0153045654296875,
"rewards/format_reward": 0.0,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 1512.0119323730469,
"epoch": 1.208,
"grad_norm": 0.11705794930458069,
"kl": 0.16259765625,
"learning_rate": 4.656784084364238e-07,
"loss": -0.0288,
"reward": -0.1029995009303093,
"reward_std": 0.1215117834508419,
"rewards/cosine_scaled_reward": -0.0514997486025095,
"rewards/format_reward": 0.0,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 1512.077392578125,
"epoch": 1.212,
"grad_norm": 0.13367965817451477,
"kl": 0.186279296875,
"learning_rate": 4.6259454195101267e-07,
"loss": -0.024,
"reward": -0.1126109603792429,
"reward_std": 0.11090057343244553,
"rewards/cosine_scaled_reward": -0.05630548112094402,
"rewards/format_reward": 0.0,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 1516.8511962890625,
"epoch": 1.216,
"grad_norm": 0.1228313073515892,
"kl": 0.17578125,
"learning_rate": 4.59514935484316e-07,
"loss": -0.0105,
"reward": -0.08529405388981104,
"reward_std": 0.08704109024256468,
"rewards/cosine_scaled_reward": -0.042647027876228094,
"rewards/format_reward": 0.0,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 1480.511962890625,
"epoch": 1.22,
"grad_norm": 0.12493617087602615,
"kl": 0.1513671875,
"learning_rate": 4.5643973913200837e-07,
"loss": -0.0485,
"reward": -0.12973922118544579,
"reward_std": 0.15159142762422562,
"rewards/cosine_scaled_reward": -0.06486961059272289,
"rewards/format_reward": 0.0,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 1525.3988342285156,
"epoch": 1.224,
"grad_norm": 0.08764777332544327,
"kl": 0.165283203125,
"learning_rate": 4.5336910277482155e-07,
"loss": -0.006,
"reward": -0.08825473487377167,
"reward_std": 0.10441902838647366,
"rewards/cosine_scaled_reward": -0.04412736464291811,
"rewards/format_reward": 0.0,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 1529.3869018554688,
"epoch": 1.228,
"grad_norm": 0.08289927244186401,
"kl": 0.169677734375,
"learning_rate": 4.503031760712397e-07,
"loss": -0.003,
"reward": -0.07872879132628441,
"reward_std": 0.07979759760200977,
"rewards/cosine_scaled_reward": -0.03936439473181963,
"rewards/format_reward": 0.0,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 1513.3630981445312,
"epoch": 1.232,
"grad_norm": 0.14125800132751465,
"kl": 0.17578125,
"learning_rate": 4.4724210845020494e-07,
"loss": -0.0177,
"reward": -0.08932580798864365,
"reward_std": 0.08838274143636227,
"rewards/cosine_scaled_reward": -0.04466290678828955,
"rewards/format_reward": 0.0,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 1499.9642944335938,
"epoch": 1.236,
"grad_norm": 0.1731095314025879,
"kl": 0.146484375,
"learning_rate": 4.441860491038345e-07,
"loss": -0.0359,
"reward": -0.10843771509826183,
"reward_std": 0.11413087695837021,
"rewards/cosine_scaled_reward": -0.05421885754913092,
"rewards/format_reward": 0.0,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 1494.4702758789062,
"epoch": 1.24,
"grad_norm": 0.2260572761297226,
"kl": 0.164306640625,
"learning_rate": 4.4113514698014953e-07,
"loss": -0.0351,
"reward": -0.11317398212850094,
"reward_std": 0.124705346301198,
"rewards/cosine_scaled_reward": -0.05658698920160532,
"rewards/format_reward": 0.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 1519.1428833007812,
"epoch": 1.244,
"grad_norm": 0.10579942166805267,
"kl": 0.17333984375,
"learning_rate": 4.3808955077581546e-07,
"loss": -0.0133,
"reward": -0.10701234266161919,
"reward_std": 0.11289746686816216,
"rewards/cosine_scaled_reward": -0.05350616853684187,
"rewards/format_reward": 0.0,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 1499.4940795898438,
"epoch": 1.248,
"grad_norm": 0.12452121824026108,
"kl": 0.151611328125,
"learning_rate": 4.350494089288943e-07,
"loss": -0.0416,
"reward": -0.12627964280545712,
"reward_std": 0.1439381241798401,
"rewards/cosine_scaled_reward": -0.06313982233405113,
"rewards/format_reward": 0.0,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 1519.7559814453125,
"epoch": 1.252,
"grad_norm": 0.14036187529563904,
"kl": 0.164306640625,
"learning_rate": 4.3201486961161093e-07,
"loss": -0.0152,
"reward": -0.08883011713624,
"reward_std": 0.09118240885436535,
"rewards/cosine_scaled_reward": -0.0444150622934103,
"rewards/format_reward": 0.0,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 1525.2440795898438,
"epoch": 1.256,
"grad_norm": 0.1239011213183403,
"kl": 0.165283203125,
"learning_rate": 4.2898608072313045e-07,
"loss": -0.0058,
"reward": -0.10498439148068428,
"reward_std": 0.11687885224819183,
"rewards/cosine_scaled_reward": -0.05249219387769699,
"rewards/format_reward": 0.0,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 1490.8631286621094,
"epoch": 1.26,
"grad_norm": 0.09535997360944748,
"kl": 0.159423828125,
"learning_rate": 4.2596318988235037e-07,
"loss": -0.0165,
"reward": -0.02371996082365513,
"reward_std": 0.09539724607020617,
"rewards/cosine_scaled_reward": -0.011859980411827564,
"rewards/format_reward": 0.0,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 1494.3988342285156,
"epoch": 1.264,
"grad_norm": 0.11306377500295639,
"kl": 0.161865234375,
"learning_rate": 4.2294634442070553e-07,
"loss": -0.0466,
"reward": -0.03322407230734825,
"reward_std": 0.12497628107666969,
"rewards/cosine_scaled_reward": -0.0166120370849967,
"rewards/format_reward": 0.0,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 1530.4166870117188,
"epoch": 1.268,
"grad_norm": 0.11471430212259293,
"kl": 0.160888671875,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0016,
"reward": -0.08301959745585918,
"reward_std": 0.08951563201844692,
"rewards/cosine_scaled_reward": -0.04150979872792959,
"rewards/format_reward": 0.0,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 1523.1607055664062,
"epoch": 1.272,
"grad_norm": 0.18867182731628418,
"kl": 0.155029296875,
"learning_rate": 4.1693137748017915e-07,
"loss": -0.0106,
"reward": -0.10158013552427292,
"reward_std": 0.10436173714697361,
"rewards/cosine_scaled_reward": -0.050790068693459034,
"rewards/format_reward": 0.0,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 1492.1548156738281,
"epoch": 1.276,
"grad_norm": 0.1889064460992813,
"kl": 0.16552734375,
"learning_rate": 4.1393354916230005e-07,
"loss": -0.0519,
"reward": -0.1043181549757719,
"reward_std": 0.13521616160869598,
"rewards/cosine_scaled_reward": -0.0521590793505311,
"rewards/format_reward": 0.0,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 1516.2261962890625,
"epoch": 1.28,
"grad_norm": 0.12251739203929901,
"kl": 0.18359375,
"learning_rate": 4.1094235253127374e-07,
"loss": -0.0218,
"reward": -0.0895642340183258,
"reward_std": 0.11007728800177574,
"rewards/cosine_scaled_reward": -0.04478211794048548,
"rewards/format_reward": 0.0,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 1486.0059814453125,
"epoch": 1.284,
"grad_norm": 0.11772434413433075,
"kl": 0.152587890625,
"learning_rate": 4.079579333738039e-07,
"loss": -0.0484,
"reward": -0.1089986227452755,
"reward_std": 0.12628877721726894,
"rewards/cosine_scaled_reward": -0.0544993132352829,
"rewards/format_reward": 0.0,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 1519.9940490722656,
"epoch": 1.288,
"grad_norm": 0.11078700423240662,
"kl": 0.1650390625,
"learning_rate": 4.0498043714627006e-07,
"loss": -0.0131,
"reward": -0.0795932961627841,
"reward_std": 0.08838632330298424,
"rewards/cosine_scaled_reward": -0.03979664808139205,
"rewards/format_reward": 0.0,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 1493.6488342285156,
"epoch": 1.292,
"grad_norm": 0.11576636880636215,
"kl": 0.16064453125,
"learning_rate": 4.020100089676376e-07,
"loss": -0.0516,
"reward": -0.09283129125833511,
"reward_std": 0.11138802394270897,
"rewards/cosine_scaled_reward": -0.04641564283519983,
"rewards/format_reward": 0.0,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 1513.1904907226562,
"epoch": 1.296,
"grad_norm": 0.0929921567440033,
"kl": 0.163818359375,
"learning_rate": 3.9904679361238526e-07,
"loss": -0.0256,
"reward": -0.11852945201098919,
"reward_std": 0.14786842092871666,
"rewards/cosine_scaled_reward": -0.059264726005494595,
"rewards/format_reward": 0.0,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 1513.8809814453125,
"epoch": 1.3,
"grad_norm": 0.10295694321393967,
"kl": 0.151123046875,
"learning_rate": 3.9609093550344907e-07,
"loss": -0.025,
"reward": -0.09280366078019142,
"reward_std": 0.1096403207629919,
"rewards/cosine_scaled_reward": -0.046401829458773136,
"rewards/format_reward": 0.0,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 1526.6428833007812,
"epoch": 1.304,
"grad_norm": 0.08783736079931259,
"kl": 0.15576171875,
"learning_rate": 3.931425787051832e-07,
"loss": -0.0069,
"reward": -0.10956737771630287,
"reward_std": 0.11006363853812218,
"rewards/cosine_scaled_reward": -0.054783688858151436,
"rewards/format_reward": 0.0,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 1495.7024230957031,
"epoch": 1.308,
"grad_norm": 0.10409428924322128,
"kl": 0.15234375,
"learning_rate": 3.902018669163384e-07,
"loss": -0.0457,
"reward": -0.10653090476989746,
"reward_std": 0.12193662486970425,
"rewards/cosine_scaled_reward": -0.053265451453626156,
"rewards/format_reward": 0.0,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 1530.3988037109375,
"epoch": 1.312,
"grad_norm": 0.09973278641700745,
"kl": 0.151123046875,
"learning_rate": 3.872689434630585e-07,
"loss": -0.0022,
"reward": -0.08937697485089302,
"reward_std": 0.09377033449709415,
"rewards/cosine_scaled_reward": -0.04468849208205938,
"rewards/format_reward": 0.0,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 1499.6607360839844,
"epoch": 1.316,
"grad_norm": 0.13606639206409454,
"kl": 0.166259765625,
"learning_rate": 3.843439512918949e-07,
"loss": -0.0237,
"reward": -0.11537123657763004,
"reward_std": 0.1290461514145136,
"rewards/cosine_scaled_reward": -0.05768561642616987,
"rewards/format_reward": 0.0,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 1519.7857666015625,
"epoch": 1.32,
"grad_norm": 0.10613211989402771,
"kl": 0.167724609375,
"learning_rate": 3.8142703296283953e-07,
"loss": -0.0159,
"reward": -0.09533977694809437,
"reward_std": 0.10748440586030483,
"rewards/cosine_scaled_reward": -0.04766988940536976,
"rewards/format_reward": 0.0,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 1511.4464721679688,
"epoch": 1.324,
"grad_norm": 0.17454038560390472,
"kl": 0.170166015625,
"learning_rate": 3.785183306423767e-07,
"loss": -0.0282,
"reward": -0.015690762549638748,
"reward_std": 0.0955708883702755,
"rewards/cosine_scaled_reward": -0.007845382206141949,
"rewards/format_reward": 0.0,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 1499.7024230957031,
"epoch": 1.328,
"grad_norm": 0.10816742479801178,
"kl": 0.153564453125,
"learning_rate": 3.7561798609655373e-07,
"loss": -0.0399,
"reward": -0.09764312580227852,
"reward_std": 0.10405797138810158,
"rewards/cosine_scaled_reward": -0.04882156103849411,
"rewards/format_reward": 0.0,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 1513.2738037109375,
"epoch": 1.332,
"grad_norm": 0.09580235928297043,
"kl": 0.1630859375,
"learning_rate": 3.72726140684072e-07,
"loss": -0.0238,
"reward": -0.0930531919002533,
"reward_std": 0.10378883965313435,
"rewards/cosine_scaled_reward": -0.0465265978127718,
"rewards/format_reward": 0.0,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 1497.4762268066406,
"epoch": 1.336,
"grad_norm": 0.18943195044994354,
"kl": 0.186767578125,
"learning_rate": 3.6984293534939737e-07,
"loss": -0.0458,
"reward": -0.09320422261953354,
"reward_std": 0.11783652380108833,
"rewards/cosine_scaled_reward": -0.04660210944712162,
"rewards/format_reward": 0.0,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 1511.4583740234375,
"epoch": 1.34,
"grad_norm": 0.11527442187070847,
"kl": 0.169677734375,
"learning_rate": 3.6696851061588994e-07,
"loss": -0.0222,
"reward": -0.09490611962974072,
"reward_std": 0.106621278449893,
"rewards/cosine_scaled_reward": -0.04745305888354778,
"rewards/format_reward": 0.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 1510.3333435058594,
"epoch": 1.3439999999999999,
"grad_norm": 0.1746179610490799,
"kl": 0.15966796875,
"learning_rate": 3.641030065789562e-07,
"loss": -0.0303,
"reward": -0.0963439904153347,
"reward_std": 0.11076842434704304,
"rewards/cosine_scaled_reward": -0.04817199241369963,
"rewards/format_reward": 0.0,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 1512.5893249511719,
"epoch": 1.3479999999999999,
"grad_norm": 0.1353609561920166,
"kl": 0.16748046875,
"learning_rate": 3.612465628992203e-07,
"loss": -0.0247,
"reward": -0.09672348201274872,
"reward_std": 0.1137369517236948,
"rewards/cosine_scaled_reward": -0.04836174100637436,
"rewards/format_reward": 0.0,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 1506.3809509277344,
"epoch": 1.3519999999999999,
"grad_norm": 0.15681445598602295,
"kl": 0.156982421875,
"learning_rate": 3.5839931879571725e-07,
"loss": -0.0329,
"reward": -0.09822369925677776,
"reward_std": 0.11475454457104206,
"rewards/cosine_scaled_reward": -0.04911184962838888,
"rewards/format_reward": 0.0,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 1526.2619323730469,
"epoch": 1.3559999999999999,
"grad_norm": 0.12026900053024292,
"kl": 0.15185546875,
"learning_rate": 3.555614130391079e-07,
"loss": -0.0027,
"reward": -0.09019140899181366,
"reward_std": 0.08315368928015232,
"rewards/cosine_scaled_reward": -0.045095707289874554,
"rewards/format_reward": 0.0,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 1519.3988342285156,
"epoch": 1.3599999999999999,
"grad_norm": 0.189870685338974,
"kl": 0.185791015625,
"learning_rate": 3.5273298394491515e-07,
"loss": -0.0085,
"reward": -0.08764730766415596,
"reward_std": 0.08974755555391312,
"rewards/cosine_scaled_reward": -0.043823654763400555,
"rewards/format_reward": 0.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 1526.1309509277344,
"epoch": 1.3639999999999999,
"grad_norm": 0.11866843700408936,
"kl": 0.1502685546875,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0082,
"reward": -0.10054401028901339,
"reward_std": 0.091705821454525,
"rewards/cosine_scaled_reward": -0.05027200886979699,
"rewards/format_reward": 0.0,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 1504.6190795898438,
"epoch": 1.3679999999999999,
"grad_norm": 0.07965697348117828,
"kl": 0.160888671875,
"learning_rate": 3.471051066897562e-07,
"loss": -0.0327,
"reward": -0.03098013624548912,
"reward_std": 0.10922298207879066,
"rewards/cosine_scaled_reward": -0.015490064397454262,
"rewards/format_reward": 0.0,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 1515.702392578125,
"epoch": 1.3719999999999999,
"grad_norm": 0.12758509814739227,
"kl": 0.172607421875,
"learning_rate": 3.4430593282358777e-07,
"loss": -0.0202,
"reward": -0.09887174144387245,
"reward_std": 0.11539069190621376,
"rewards/cosine_scaled_reward": -0.049435872584581375,
"rewards/format_reward": 0.0,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 1519.0535888671875,
"epoch": 1.376,
"grad_norm": 0.09368550777435303,
"kl": 0.18115234375,
"learning_rate": 3.4151678419606233e-07,
"loss": -0.0143,
"reward": -0.09874763153493404,
"reward_std": 0.0960962763056159,
"rewards/cosine_scaled_reward": -0.049373818561434746,
"rewards/format_reward": 0.0,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 1505.8631286621094,
"epoch": 1.38,
"grad_norm": 0.10917885601520538,
"kl": 0.177001953125,
"learning_rate": 3.387377967463493e-07,
"loss": -0.0331,
"reward": -0.10075951926410198,
"reward_std": 0.11745327524840832,
"rewards/cosine_scaled_reward": -0.050379764288663864,
"rewards/format_reward": 0.0,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 1508.7976379394531,
"epoch": 1.384,
"grad_norm": 0.08625519275665283,
"kl": 0.1611328125,
"learning_rate": 3.359691059183761e-07,
"loss": -0.0277,
"reward": -0.11206395924091339,
"reward_std": 0.13379977643489838,
"rewards/cosine_scaled_reward": -0.05603197868913412,
"rewards/format_reward": 0.0,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 1514.77978515625,
"epoch": 1.388,
"grad_norm": 0.09115591645240784,
"kl": 0.164306640625,
"learning_rate": 3.3321084665422803e-07,
"loss": -0.0129,
"reward": -0.08784853294491768,
"reward_std": 0.09035127516835928,
"rewards/cosine_scaled_reward": -0.043924265541136265,
"rewards/format_reward": 0.0,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 1508.4702453613281,
"epoch": 1.392,
"grad_norm": 0.12537740170955658,
"kl": 0.155517578125,
"learning_rate": 3.3046315338757026e-07,
"loss": -0.0323,
"reward": -0.11530621163547039,
"reward_std": 0.1266392320394516,
"rewards/cosine_scaled_reward": -0.05765310861170292,
"rewards/format_reward": 0.0,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 1496.6726379394531,
"epoch": 1.396,
"grad_norm": 0.11020371317863464,
"kl": 0.158935546875,
"learning_rate": 3.2772616003709616e-07,
"loss": -0.0268,
"reward": -0.10983618721365929,
"reward_std": 0.10708382353186607,
"rewards/cosine_scaled_reward": -0.054918091744184494,
"rewards/format_reward": 0.0,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 1530.0416564941406,
"epoch": 1.4,
"grad_norm": 0.17068089544773102,
"kl": 0.163818359375,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0002,
"reward": -0.08832419849932194,
"reward_std": 0.09396599233150482,
"rewards/cosine_scaled_reward": -0.04416209738701582,
"rewards/format_reward": 0.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 1515.4345397949219,
"epoch": 1.404,
"grad_norm": 0.1316055804491043,
"kl": 0.1689453125,
"learning_rate": 3.222848061454764e-07,
"loss": -0.0178,
"reward": -0.08349752612411976,
"reward_std": 0.08428733702749014,
"rewards/cosine_scaled_reward": -0.041748762130737305,
"rewards/format_reward": 0.0,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 1501.7976379394531,
"epoch": 1.408,
"grad_norm": 0.1486114114522934,
"kl": 0.16552734375,
"learning_rate": 3.195807108082429e-07,
"loss": -0.0412,
"reward": -0.0832710936665535,
"reward_std": 0.10713749751448631,
"rewards/cosine_scaled_reward": -0.04163554683327675,
"rewards/format_reward": 0.0,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 1510.6428833007812,
"epoch": 1.412,
"grad_norm": 0.12983661890029907,
"kl": 0.163330078125,
"learning_rate": 3.168878457820915e-07,
"loss": -0.0096,
"reward": -0.10671682469546795,
"reward_std": 0.11679115891456604,
"rewards/cosine_scaled_reward": -0.0533584114164114,
"rewards/format_reward": 0.0,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 1497.7976379394531,
"epoch": 1.416,
"grad_norm": 0.09482970088720322,
"kl": 0.163818359375,
"learning_rate": 3.142063423134644e-07,
"loss": -0.0377,
"reward": -0.10173431225121021,
"reward_std": 0.11217576451599598,
"rewards/cosine_scaled_reward": -0.050867156125605106,
"rewards/format_reward": 0.0,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 1511.9226379394531,
"epoch": 1.42,
"grad_norm": 0.11015576124191284,
"kl": 0.16796875,
"learning_rate": 3.115363310950578e-07,
"loss": -0.027,
"reward": -0.10424264334142208,
"reward_std": 0.10744853690266609,
"rewards/cosine_scaled_reward": -0.05212132353335619,
"rewards/format_reward": 0.0,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 1507.7143249511719,
"epoch": 1.424,
"grad_norm": 0.1039690375328064,
"kl": 0.15771484375,
"learning_rate": 3.0887794225945143e-07,
"loss": -0.0104,
"reward": -0.11364280618727207,
"reward_std": 0.11577463708817959,
"rewards/cosine_scaled_reward": -0.05682140402495861,
"rewards/format_reward": 0.0,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 1522.547607421875,
"epoch": 1.428,
"grad_norm": 0.13563141226768494,
"kl": 0.16943359375,
"learning_rate": 3.062313053727671e-07,
"loss": -0.0127,
"reward": -0.09091841243207455,
"reward_std": 0.1005500927567482,
"rewards/cosine_scaled_reward": -0.04545920621603727,
"rewards/format_reward": 0.0,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 1508.1607360839844,
"epoch": 1.432,
"grad_norm": 0.12330485880374908,
"kl": 0.175048828125,
"learning_rate": 3.0359654942835247e-07,
"loss": -0.0276,
"reward": -0.09949876181781292,
"reward_std": 0.10788233578205109,
"rewards/cosine_scaled_reward": -0.049749381840229034,
"rewards/format_reward": 0.0,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 1526.1785888671875,
"epoch": 1.436,
"grad_norm": 0.1008228212594986,
"kl": 0.172607421875,
"learning_rate": 3.0097380284049523e-07,
"loss": -0.0072,
"reward": -0.08119065128266811,
"reward_std": 0.09274793975055218,
"rewards/cosine_scaled_reward": -0.04059532564133406,
"rewards/format_reward": 0.0,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 1507.6964721679688,
"epoch": 1.44,
"grad_norm": 0.11536505818367004,
"kl": 0.1640625,
"learning_rate": 2.9836319343816397e-07,
"loss": -0.0305,
"reward": -0.10905157402157784,
"reward_std": 0.11038926243782043,
"rewards/cosine_scaled_reward": -0.05452578607946634,
"rewards/format_reward": 0.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 1518.827392578125,
"epoch": 1.444,
"grad_norm": 0.12276989966630936,
"kl": 0.1514892578125,
"learning_rate": 2.9576484845877793e-07,
"loss": -0.0175,
"reward": -0.08610734064131975,
"reward_std": 0.09063750877976418,
"rewards/cosine_scaled_reward": -0.043053670320659876,
"rewards/format_reward": 0.0,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 1519.4702453613281,
"epoch": 1.448,
"grad_norm": 0.08738084882497787,
"kl": 0.18017578125,
"learning_rate": 2.931788945420058e-07,
"loss": -0.0112,
"reward": -0.09291153028607368,
"reward_std": 0.09842956997454166,
"rewards/cosine_scaled_reward": -0.046455767937004566,
"rewards/format_reward": 0.0,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 1509.0774230957031,
"epoch": 1.452,
"grad_norm": 0.11346267908811569,
"kl": 0.175048828125,
"learning_rate": 2.9060545772359305e-07,
"loss": -0.0277,
"reward": -0.11039301194250584,
"reward_std": 0.12665076181292534,
"rewards/cosine_scaled_reward": -0.05519650410860777,
"rewards/format_reward": 0.0,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 1524.1964416503906,
"epoch": 1.456,
"grad_norm": 0.14776764810085297,
"kl": 0.150634765625,
"learning_rate": 2.8804466342921987e-07,
"loss": -0.006,
"reward": -0.022786946967244148,
"reward_std": 0.10106383822858334,
"rewards/cosine_scaled_reward": -0.011393471620976925,
"rewards/format_reward": 0.0,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 1493.607177734375,
"epoch": 1.46,
"grad_norm": 0.09510252624750137,
"kl": 0.153076171875,
"learning_rate": 2.854966364683872e-07,
"loss": -0.0487,
"reward": -0.10556191392242908,
"reward_std": 0.10932666808366776,
"rewards/cosine_scaled_reward": -0.05278095696121454,
"rewards/format_reward": 0.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 1529.875,
"epoch": 1.464,
"grad_norm": 0.15466201305389404,
"kl": 0.15966796875,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0007,
"reward": -0.09042776376008987,
"reward_std": 0.10296636447310448,
"rewards/cosine_scaled_reward": -0.04521388094872236,
"rewards/format_reward": 0.0,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 1504.6786193847656,
"epoch": 1.468,
"grad_norm": 0.08847711980342865,
"kl": 0.17041015625,
"learning_rate": 2.8043938066798645e-07,
"loss": -0.0311,
"reward": -0.10542780347168446,
"reward_std": 0.11852787062525749,
"rewards/cosine_scaled_reward": -0.05271390173584223,
"rewards/format_reward": 0.0,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 1498.7083435058594,
"epoch": 1.472,
"grad_norm": 0.1147918626666069,
"kl": 0.17041015625,
"learning_rate": 2.7793039831193133e-07,
"loss": -0.0404,
"reward": -0.10119456797838211,
"reward_std": 0.1359020471572876,
"rewards/cosine_scaled_reward": -0.050597282126545906,
"rewards/format_reward": 0.0,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 1507.5774230957031,
"epoch": 1.476,
"grad_norm": 0.0867527574300766,
"kl": 0.154052734375,
"learning_rate": 2.7543467624442956e-07,
"loss": -0.0327,
"reward": -0.09615712240338326,
"reward_std": 0.11924017407000065,
"rewards/cosine_scaled_reward": -0.0480785621330142,
"rewards/format_reward": 0.0,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 1515.5357360839844,
"epoch": 1.48,
"grad_norm": 0.07760825008153915,
"kl": 0.172607421875,
"learning_rate": 2.729523361034538e-07,
"loss": -0.012,
"reward": -0.09595928713679314,
"reward_std": 0.10662926360964775,
"rewards/cosine_scaled_reward": -0.04797964543104172,
"rewards/format_reward": 0.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 1510.3809814453125,
"epoch": 1.484,
"grad_norm": 0.1310672163963318,
"kl": 0.1671142578125,
"learning_rate": 2.7048349887476037e-07,
"loss": -0.0266,
"reward": -0.08946863748133183,
"reward_std": 0.0914797130972147,
"rewards/cosine_scaled_reward": -0.04473431780934334,
"rewards/format_reward": 0.0,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 1491.5179138183594,
"epoch": 1.488,
"grad_norm": 0.08744286000728607,
"kl": 0.156494140625,
"learning_rate": 2.6802828488599294e-07,
"loss": -0.045,
"reward": -0.1184717956930399,
"reward_std": 0.13941991329193115,
"rewards/cosine_scaled_reward": -0.059235901571810246,
"rewards/format_reward": 0.0,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 1521.0238342285156,
"epoch": 1.492,
"grad_norm": 0.1646253615617752,
"kl": 0.16357421875,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0142,
"reward": -0.084196537733078,
"reward_std": 0.07485349848866463,
"rewards/cosine_scaled_reward": -0.042098269797861576,
"rewards/format_reward": 0.0,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 1523.375,
"epoch": 1.496,
"grad_norm": 0.11430079489946365,
"kl": 0.172607421875,
"learning_rate": 2.631592046130896e-07,
"loss": -0.0099,
"reward": -0.07816067058593035,
"reward_std": 0.08419617265462875,
"rewards/cosine_scaled_reward": -0.03908033389598131,
"rewards/format_reward": 0.0,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 1506.4524230957031,
"epoch": 1.5,
"grad_norm": 0.14677973091602325,
"kl": 0.18603515625,
"learning_rate": 2.6074557564105724e-07,
"loss": -0.0194,
"reward": -0.08955581299960613,
"reward_std": 0.09336170181632042,
"rewards/cosine_scaled_reward": -0.04477790556848049,
"rewards/format_reward": 0.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 1527.3988342285156,
"epoch": 1.504,
"grad_norm": 0.12783505022525787,
"kl": 0.159912109375,
"learning_rate": 2.583460445215911e-07,
"loss": -0.0049,
"reward": -0.0952699575573206,
"reward_std": 0.09568927250802517,
"rewards/cosine_scaled_reward": -0.047634975984692574,
"rewards/format_reward": 0.0,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 1520.7678833007812,
"epoch": 1.508,
"grad_norm": 0.1176699697971344,
"kl": 0.1650390625,
"learning_rate": 2.5596072820445254e-07,
"loss": -0.011,
"reward": -0.019147060811519623,
"reward_std": 0.09721549972891808,
"rewards/cosine_scaled_reward": -0.009573530405759811,
"rewards/format_reward": 0.0,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 1519.297607421875,
"epoch": 1.512,
"grad_norm": 0.11060648411512375,
"kl": 0.1650390625,
"learning_rate": 2.5358974294659373e-07,
"loss": -0.0134,
"reward": -0.09460222348570824,
"reward_std": 0.1032384280115366,
"rewards/cosine_scaled_reward": -0.04730111453682184,
"rewards/format_reward": 0.0,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 1524.2916870117188,
"epoch": 1.516,
"grad_norm": 0.12652094662189484,
"kl": 0.16064453125,
"learning_rate": 2.512332043064913e-07,
"loss": -0.0078,
"reward": -0.07960367575287819,
"reward_std": 0.08834364637732506,
"rewards/cosine_scaled_reward": -0.03980184067040682,
"rewards/format_reward": 0.0,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 1524.8750305175781,
"epoch": 1.52,
"grad_norm": 0.10227189213037491,
"kl": 0.16748046875,
"learning_rate": 2.488912271385139e-07,
"loss": -0.0064,
"reward": -0.08977451547980309,
"reward_std": 0.1080553438514471,
"rewards/cosine_scaled_reward": -0.04488725960254669,
"rewards/format_reward": 0.0,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 1506.3511962890625,
"epoch": 1.524,
"grad_norm": 0.12043853104114532,
"kl": 0.17138671875,
"learning_rate": 2.465639255873246e-07,
"loss": -0.035,
"reward": -0.11090395227074623,
"reward_std": 0.12006122805178165,
"rewards/cosine_scaled_reward": -0.05545197706669569,
"rewards/format_reward": 0.0,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 1523.5178833007812,
"epoch": 1.528,
"grad_norm": 0.13229811191558838,
"kl": 0.170654296875,
"learning_rate": 2.4425141308231765e-07,
"loss": -0.0068,
"reward": -0.09728248044848442,
"reward_std": 0.107889199629426,
"rewards/cosine_scaled_reward": -0.048641239292919636,
"rewards/format_reward": 0.0,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 1527.2261962890625,
"epoch": 1.532,
"grad_norm": 0.10695023834705353,
"kl": 0.1630859375,
"learning_rate": 2.4195380233209006e-07,
"loss": -0.0022,
"reward": -0.09213725849986076,
"reward_std": 0.10676849260926247,
"rewards/cosine_scaled_reward": -0.046068630181252956,
"rewards/format_reward": 0.0,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 1510.7083740234375,
"epoch": 1.536,
"grad_norm": 0.13451160490512848,
"kl": 0.150390625,
"learning_rate": 2.3967120531894857e-07,
"loss": -0.0256,
"reward": -0.10359417460858822,
"reward_std": 0.12065772153437138,
"rewards/cosine_scaled_reward": -0.05179708730429411,
"rewards/format_reward": 0.0,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 1501.4940795898438,
"epoch": 1.54,
"grad_norm": 0.1391247659921646,
"kl": 0.16357421875,
"learning_rate": 2.374037332934512e-07,
"loss": -0.0409,
"reward": -0.09379393607378006,
"reward_std": 0.10166217759251595,
"rewards/cosine_scaled_reward": -0.04689696989953518,
"rewards/format_reward": 0.0,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 1516.2500305175781,
"epoch": 1.544,
"grad_norm": 0.1330152153968811,
"kl": 0.166748046875,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0087,
"reward": -0.07589279673993587,
"reward_std": 0.09089674055576324,
"rewards/cosine_scaled_reward": -0.03794640023261309,
"rewards/format_reward": 0.0,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 1488.6131286621094,
"epoch": 1.548,
"grad_norm": 0.10263092815876007,
"kl": 0.1630859375,
"learning_rate": 2.3291460551638237e-07,
"loss": -0.0521,
"reward": -0.12465786561369896,
"reward_std": 0.1609484814107418,
"rewards/cosine_scaled_reward": -0.06232893466949463,
"rewards/format_reward": 0.0,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 1516.8928833007812,
"epoch": 1.552,
"grad_norm": 0.09812143445014954,
"kl": 0.15966796875,
"learning_rate": 2.306931685585657e-07,
"loss": -0.015,
"reward": -0.0796813191846013,
"reward_std": 0.08767454512417316,
"rewards/cosine_scaled_reward": -0.039840660989284515,
"rewards/format_reward": 0.0,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 1505.2738342285156,
"epoch": 1.556,
"grad_norm": 0.16943664848804474,
"kl": 0.1591796875,
"learning_rate": 2.2848729416523859e-07,
"loss": -0.0254,
"reward": -0.11296515539288521,
"reward_std": 0.12935122102499008,
"rewards/cosine_scaled_reward": -0.05648257676512003,
"rewards/format_reward": 0.0,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 1511.9345397949219,
"epoch": 1.56,
"grad_norm": 0.1270017921924591,
"kl": 0.1513671875,
"learning_rate": 2.2629708984760706e-07,
"loss": -0.0186,
"reward": -0.08384528011083603,
"reward_std": 0.08424858003854752,
"rewards/cosine_scaled_reward": -0.04192264098674059,
"rewards/format_reward": 0.0,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 1525.982177734375,
"epoch": 1.564,
"grad_norm": 0.16950343549251556,
"kl": 0.169921875,
"learning_rate": 2.2412266235313973e-07,
"loss": -0.0058,
"reward": -0.08042520564049482,
"reward_std": 0.08201098442077637,
"rewards/cosine_scaled_reward": -0.04021260142326355,
"rewards/format_reward": 0.0,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 1506.6666564941406,
"epoch": 1.568,
"grad_norm": 0.13040253520011902,
"kl": 0.14990234375,
"learning_rate": 2.2196411766036487e-07,
"loss": -0.0288,
"reward": -0.10378818027675152,
"reward_std": 0.1260694395750761,
"rewards/cosine_scaled_reward": -0.05189409013837576,
"rewards/format_reward": 0.0,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 1518.4047546386719,
"epoch": 1.572,
"grad_norm": 0.11275047063827515,
"kl": 0.1572265625,
"learning_rate": 2.1982156097370557e-07,
"loss": -0.0157,
"reward": -0.09913922101259232,
"reward_std": 0.10591815412044525,
"rewards/cosine_scaled_reward": -0.04956961143761873,
"rewards/format_reward": 0.0,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 1524.8035888671875,
"epoch": 1.576,
"grad_norm": 0.11497998982667923,
"kl": 0.17578125,
"learning_rate": 2.1769509671835223e-07,
"loss": -0.009,
"reward": -0.08646929264068604,
"reward_std": 0.09624841343611479,
"rewards/cosine_scaled_reward": -0.04323464818298817,
"rewards/format_reward": 0.0,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 1518.5416870117188,
"epoch": 1.58,
"grad_norm": 0.19012141227722168,
"kl": 0.148193359375,
"learning_rate": 2.1558482853517253e-07,
"loss": -0.0063,
"reward": -0.09366242028772831,
"reward_std": 0.1069308090955019,
"rewards/cosine_scaled_reward": -0.046831210143864155,
"rewards/format_reward": 0.0,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 1508.1488342285156,
"epoch": 1.584,
"grad_norm": 0.1060405820608139,
"kl": 0.156982421875,
"learning_rate": 2.134908592756607e-07,
"loss": -0.0309,
"reward": -0.10527068562805653,
"reward_std": 0.12328575551509857,
"rewards/cosine_scaled_reward": -0.05263534560799599,
"rewards/format_reward": 0.0,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 1506.7916870117188,
"epoch": 1.588,
"grad_norm": 0.10801802575588226,
"kl": 0.140869140625,
"learning_rate": 2.1141329099692406e-07,
"loss": -0.0205,
"reward": -0.11307091265916824,
"reward_std": 0.123080899938941,
"rewards/cosine_scaled_reward": -0.05653545819222927,
"rewards/format_reward": 0.0,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 1504.5595703125,
"epoch": 1.592,
"grad_norm": 0.08320983499288559,
"kl": 0.1591796875,
"learning_rate": 2.0935222495670968e-07,
"loss": -0.037,
"reward": -0.09146481472998857,
"reward_std": 0.09883083030581474,
"rewards/cosine_scaled_reward": -0.04573240736499429,
"rewards/format_reward": 0.0,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 1531.6607360839844,
"epoch": 1.596,
"grad_norm": 0.09601892530918121,
"kl": 0.1533203125,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0006,
"reward": -0.08569015190005302,
"reward_std": 0.0903671607375145,
"rewards/cosine_scaled_reward": -0.04284507688134909,
"rewards/format_reward": 0.0,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 1501.8928833007812,
"epoch": 1.6,
"grad_norm": 0.12060719728469849,
"kl": 0.16943359375,
"learning_rate": 2.0528000059645995e-07,
"loss": -0.0212,
"reward": -0.09470336884260178,
"reward_std": 0.11146636307239532,
"rewards/cosine_scaled_reward": -0.04735168442130089,
"rewards/format_reward": 0.0,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 1523.4940795898438,
"epoch": 1.604,
"grad_norm": 0.10291819274425507,
"kl": 0.1591796875,
"learning_rate": 2.032690407508949e-07,
"loss": -0.0098,
"reward": -0.09152790158987045,
"reward_std": 0.11163719370961189,
"rewards/cosine_scaled_reward": -0.045763951260596514,
"rewards/format_reward": 0.0,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 1515.0952453613281,
"epoch": 1.608,
"grad_norm": 0.13543279469013214,
"kl": 0.1689453125,
"learning_rate": 2.0127498008311922e-07,
"loss": -0.0193,
"reward": -0.08428375516086817,
"reward_std": 0.08265121094882488,
"rewards/cosine_scaled_reward": -0.042141877580434084,
"rewards/format_reward": 0.0,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 1514.3154907226562,
"epoch": 1.612,
"grad_norm": 0.10554395616054535,
"kl": 0.15380859375,
"learning_rate": 1.9929791578083655e-07,
"loss": -0.0233,
"reward": -0.09138609375804663,
"reward_std": 0.09994357451796532,
"rewards/cosine_scaled_reward": -0.04569304594770074,
"rewards/format_reward": 0.0,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 1503.452392578125,
"epoch": 1.616,
"grad_norm": 0.10063979774713516,
"kl": 0.155517578125,
"learning_rate": 1.9733794420337213e-07,
"loss": -0.0392,
"reward": -0.100379329174757,
"reward_std": 0.12372113950550556,
"rewards/cosine_scaled_reward": -0.05018966645002365,
"rewards/format_reward": 0.0,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 1516.607177734375,
"epoch": 1.62,
"grad_norm": 0.090563103556633,
"kl": 0.163818359375,
"learning_rate": 1.9539516087697517e-07,
"loss": -0.0215,
"reward": -0.08255079202353954,
"reward_std": 0.09494246542453766,
"rewards/cosine_scaled_reward": -0.041275396943092346,
"rewards/format_reward": 0.0,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 1524.3988342285156,
"epoch": 1.624,
"grad_norm": 0.10463332384824753,
"kl": 0.154541015625,
"learning_rate": 1.934696604901642e-07,
"loss": -0.0101,
"reward": -0.09653126262128353,
"reward_std": 0.11365084536373615,
"rewards/cosine_scaled_reward": -0.04826563224196434,
"rewards/format_reward": 0.0,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 1504.4345703125,
"epoch": 1.6280000000000001,
"grad_norm": 0.18202035129070282,
"kl": 0.164794921875,
"learning_rate": 1.915615368891117e-07,
"loss": -0.0156,
"reward": -0.10028301551938057,
"reward_std": 0.12200421467423439,
"rewards/cosine_scaled_reward": -0.050141509622335434,
"rewards/format_reward": 0.0,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 1501.7857360839844,
"epoch": 1.6320000000000001,
"grad_norm": 0.10041651129722595,
"kl": 0.1552734375,
"learning_rate": 1.8967088307307e-07,
"loss": -0.0405,
"reward": -0.1025087870657444,
"reward_std": 0.12081354483962059,
"rewards/cosine_scaled_reward": -0.051254394464194775,
"rewards/format_reward": 0.0,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 1507.65478515625,
"epoch": 1.6360000000000001,
"grad_norm": 0.0929652526974678,
"kl": 0.164306640625,
"learning_rate": 1.8779779118983867e-07,
"loss": -0.0336,
"reward": -0.10521730966866016,
"reward_std": 0.12063234858214855,
"rewards/cosine_scaled_reward": -0.05260865669697523,
"rewards/format_reward": 0.0,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 1489.9404907226562,
"epoch": 1.6400000000000001,
"grad_norm": 0.08884437382221222,
"kl": 0.16552734375,
"learning_rate": 1.8594235253127372e-07,
"loss": -0.0292,
"reward": -0.09861567430198193,
"reward_std": 0.10845682211220264,
"rewards/cosine_scaled_reward": -0.04930783715099096,
"rewards/format_reward": 0.0,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 1514.0476379394531,
"epoch": 1.6440000000000001,
"grad_norm": 0.1006086990237236,
"kl": 0.156494140625,
"learning_rate": 1.8410465752883758e-07,
"loss": -0.021,
"reward": -0.09761104919016361,
"reward_std": 0.10258225724101067,
"rewards/cosine_scaled_reward": -0.048805526457726955,
"rewards/format_reward": 0.0,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 1519.90478515625,
"epoch": 1.6480000000000001,
"grad_norm": 0.11515481770038605,
"kl": 0.16943359375,
"learning_rate": 1.822847957491922e-07,
"loss": -0.016,
"reward": -0.08585721254348755,
"reward_std": 0.10039913840591908,
"rewards/cosine_scaled_reward": -0.042928608134388924,
"rewards/format_reward": 0.0,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 1510.357177734375,
"epoch": 1.6520000000000001,
"grad_norm": 0.09629681706428528,
"kl": 0.159423828125,
"learning_rate": 1.804828558898332e-07,
"loss": -0.0283,
"reward": -0.08894845098257065,
"reward_std": 0.10276514105498791,
"rewards/cosine_scaled_reward": -0.04447422595694661,
"rewards/format_reward": 0.0,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 1511.0357360839844,
"epoch": 1.6560000000000001,
"grad_norm": 0.12116753309965134,
"kl": 0.169189453125,
"learning_rate": 1.7869892577476722e-07,
"loss": -0.0245,
"reward": -0.10262815281748772,
"reward_std": 0.12108992040157318,
"rewards/cosine_scaled_reward": -0.051314075477421284,
"rewards/format_reward": 0.0,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 1521.9345397949219,
"epoch": 1.6600000000000001,
"grad_norm": 0.10831650346517563,
"kl": 0.163818359375,
"learning_rate": 1.7693309235023127e-07,
"loss": -0.0085,
"reward": -0.08154256083071232,
"reward_std": 0.09072042256593704,
"rewards/cosine_scaled_reward": -0.04077128041535616,
"rewards/format_reward": 0.0,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 1508.0059814453125,
"epoch": 1.6640000000000001,
"grad_norm": 0.10683077573776245,
"kl": 0.175537109375,
"learning_rate": 1.7518544168045524e-07,
"loss": -0.0242,
"reward": -0.1116462592035532,
"reward_std": 0.10574496164917946,
"rewards/cosine_scaled_reward": -0.05582312494516373,
"rewards/format_reward": 0.0,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 1515.3393249511719,
"epoch": 1.6680000000000001,
"grad_norm": 0.14900319278240204,
"kl": 0.1650390625,
"learning_rate": 1.7345605894346726e-07,
"loss": -0.021,
"reward": -0.08745052106678486,
"reward_std": 0.11200828477740288,
"rewards/cosine_scaled_reward": -0.04372526053339243,
"rewards/format_reward": 0.0,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 1523.3690490722656,
"epoch": 1.6720000000000002,
"grad_norm": 0.11940804123878479,
"kl": 0.154541015625,
"learning_rate": 1.7174502842694212e-07,
"loss": -0.0124,
"reward": -0.0070722997188568115,
"reward_std": 0.09923059120774269,
"rewards/cosine_scaled_reward": -0.0035361526533961296,
"rewards/format_reward": 0.0,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 1485.9583435058594,
"epoch": 1.6760000000000002,
"grad_norm": 0.08529967814683914,
"kl": 0.172607421875,
"learning_rate": 1.7005243352409333e-07,
"loss": -0.0651,
"reward": -0.04097301326692104,
"reward_std": 0.15967968851327896,
"rewards/cosine_scaled_reward": -0.02048650663346052,
"rewards/format_reward": 0.0,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 1504.0059814453125,
"epoch": 1.6800000000000002,
"grad_norm": 0.0939546748995781,
"kl": 0.16015625,
"learning_rate": 1.6837835672960831e-07,
"loss": -0.0345,
"reward": -0.091935895383358,
"reward_std": 0.11023806594312191,
"rewards/cosine_scaled_reward": -0.04596794489771128,
"rewards/format_reward": 0.0,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 1502.7500305175781,
"epoch": 1.6840000000000002,
"grad_norm": 0.114561066031456,
"kl": 0.17724609375,
"learning_rate": 1.6672287963562852e-07,
"loss": -0.0193,
"reward": -0.07856714259833097,
"reward_std": 0.08897042460739613,
"rewards/cosine_scaled_reward": -0.03928357409313321,
"rewards/format_reward": 0.0,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 1496.7261962890625,
"epoch": 1.688,
"grad_norm": 0.11227195709943771,
"kl": 0.160888671875,
"learning_rate": 1.6508608292777203e-07,
"loss": -0.0359,
"reward": -0.09738295152783394,
"reward_std": 0.11914198100566864,
"rewards/cosine_scaled_reward": -0.048691474832594395,
"rewards/format_reward": 0.0,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 1511.1964416503906,
"epoch": 1.692,
"grad_norm": 0.13162577152252197,
"kl": 0.181640625,
"learning_rate": 1.6346804638120098e-07,
"loss": -0.0245,
"reward": -0.07754436880350113,
"reward_std": 0.10734674707055092,
"rewards/cosine_scaled_reward": -0.0387721830047667,
"rewards/format_reward": 0.0,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 1508.1012268066406,
"epoch": 1.696,
"grad_norm": 0.10524528473615646,
"kl": 0.164306640625,
"learning_rate": 1.6186884885673413e-07,
"loss": -0.024,
"reward": -0.08680723141878843,
"reward_std": 0.0982758505269885,
"rewards/cosine_scaled_reward": -0.04340361384674907,
"rewards/format_reward": 0.0,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 1505.3154907226562,
"epoch": 1.7,
"grad_norm": 0.10563742369413376,
"kl": 0.16015625,
"learning_rate": 1.6028856829700258e-07,
"loss": -0.0092,
"reward": -0.08459902927279472,
"reward_std": 0.09910181537270546,
"rewards/cosine_scaled_reward": -0.04229951370507479,
"rewards/format_reward": 0.0,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 1520.6785888671875,
"epoch": 1.704,
"grad_norm": 0.08786718547344208,
"kl": 0.161865234375,
"learning_rate": 1.5872728172265146e-07,
"loss": -0.0165,
"reward": -0.07912362925708294,
"reward_std": 0.08175937831401825,
"rewards/cosine_scaled_reward": -0.03956181462854147,
"rewards/format_reward": 0.0,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 1498.9762573242188,
"epoch": 1.708,
"grad_norm": 0.08499140292406082,
"kl": 0.15625,
"learning_rate": 1.5718506522858572e-07,
"loss": -0.0364,
"reward": -0.0896658506244421,
"reward_std": 0.10279479995369911,
"rewards/cosine_scaled_reward": -0.0448329309001565,
"rewards/format_reward": 0.0,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 1508.3035583496094,
"epoch": 1.712,
"grad_norm": 0.08925153315067291,
"kl": 0.14453125,
"learning_rate": 1.5566199398026147e-07,
"loss": -0.0309,
"reward": -0.09496857039630413,
"reward_std": 0.1123510580509901,
"rewards/cosine_scaled_reward": -0.04748428799211979,
"rewards/format_reward": 0.0,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 1523.2916870117188,
"epoch": 1.716,
"grad_norm": 0.10608566552400589,
"kl": 0.167724609375,
"learning_rate": 1.5415814221002265e-07,
"loss": -0.0113,
"reward": -0.09526684321463108,
"reward_std": 0.0988641269505024,
"rewards/cosine_scaled_reward": -0.047633420675992966,
"rewards/format_reward": 0.0,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 1498.2261962890625,
"epoch": 1.72,
"grad_norm": 0.10655763745307922,
"kl": 0.16162109375,
"learning_rate": 1.5267358321348285e-07,
"loss": -0.0414,
"reward": -0.10186839010566473,
"reward_std": 0.13254049234092236,
"rewards/cosine_scaled_reward": -0.05093420064076781,
"rewards/format_reward": 0.0,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 1528.0476379394531,
"epoch": 1.724,
"grad_norm": 0.1068165972828865,
"kl": 0.15673828125,
"learning_rate": 1.5120838934595337e-07,
"loss": -0.0041,
"reward": -0.09541826322674751,
"reward_std": 0.1032972726970911,
"rewards/cosine_scaled_reward": -0.04770912975072861,
"rewards/format_reward": 0.0,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 1522.4583435058594,
"epoch": 1.728,
"grad_norm": 0.12408644706010818,
"kl": 0.16796875,
"learning_rate": 1.4976263201891613e-07,
"loss": -0.013,
"reward": -0.08289302699267864,
"reward_std": 0.08351449854671955,
"rewards/cosine_scaled_reward": -0.041446512565016747,
"rewards/format_reward": 0.0,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 1497.9107360839844,
"epoch": 1.732,
"grad_norm": 0.07734204828739166,
"kl": 0.173095703125,
"learning_rate": 1.483363816965435e-07,
"loss": -0.0391,
"reward": -0.10260258801281452,
"reward_std": 0.12828159891068935,
"rewards/cosine_scaled_reward": -0.051301293075084686,
"rewards/format_reward": 0.0,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 1513.7559814453125,
"epoch": 1.736,
"grad_norm": 0.12413759529590607,
"kl": 0.17919921875,
"learning_rate": 1.469297078922642e-07,
"loss": -0.0244,
"reward": -0.015836404636502266,
"reward_std": 0.09649943746626377,
"rewards/cosine_scaled_reward": -0.007918204180896282,
"rewards/format_reward": 0.0,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 1522.1190795898438,
"epoch": 1.74,
"grad_norm": 0.12912577390670776,
"kl": 0.16650390625,
"learning_rate": 1.4554267916537495e-07,
"loss": -0.0112,
"reward": -0.08571217954158783,
"reward_std": 0.10151237808167934,
"rewards/cosine_scaled_reward": -0.042856089770793915,
"rewards/format_reward": 0.0,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 1474.952392578125,
"epoch": 1.744,
"grad_norm": 0.08318183571100235,
"kl": 0.1689453125,
"learning_rate": 1.4417536311769885e-07,
"loss": -0.0637,
"reward": -0.09841407462954521,
"reward_std": 0.1172296404838562,
"rewards/cosine_scaled_reward": -0.049207039177417755,
"rewards/format_reward": 0.0,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 1481.3095397949219,
"epoch": 1.748,
"grad_norm": 0.0786975845694542,
"kl": 0.156005859375,
"learning_rate": 1.4282782639029128e-07,
"loss": -0.0386,
"reward": -0.08532883040606976,
"reward_std": 0.09728906117379665,
"rewards/cosine_scaled_reward": -0.04266441613435745,
"rewards/format_reward": 0.0,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 1500.7440490722656,
"epoch": 1.752,
"grad_norm": 0.0900636538863182,
"kl": 0.161376953125,
"learning_rate": 1.4150013466019114e-07,
"loss": -0.0316,
"reward": -0.08545132167637348,
"reward_std": 0.10151121858507395,
"rewards/cosine_scaled_reward": -0.042725661769509315,
"rewards/format_reward": 0.0,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 1506.9404907226562,
"epoch": 1.756,
"grad_norm": 0.11020209640264511,
"kl": 0.1640625,
"learning_rate": 1.4019235263722034e-07,
"loss": -0.0259,
"reward": -0.08197178691625595,
"reward_std": 0.09423052612692118,
"rewards/cosine_scaled_reward": -0.040985893458127975,
"rewards/format_reward": 0.0,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 1517.482177734375,
"epoch": 1.76,
"grad_norm": 0.08999020606279373,
"kl": 0.165283203125,
"learning_rate": 1.3890454406082956e-07,
"loss": -0.017,
"reward": -0.07763329334557056,
"reward_std": 0.08629796095192432,
"rewards/cosine_scaled_reward": -0.03881664574146271,
"rewards/format_reward": 0.0,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 1510.8511962890625,
"epoch": 1.764,
"grad_norm": 0.13127504289150238,
"kl": 0.15234375,
"learning_rate": 1.3763677169699217e-07,
"loss": -0.0232,
"reward": -0.08330708928406239,
"reward_std": 0.09235509857535362,
"rewards/cosine_scaled_reward": -0.04165354464203119,
"rewards/format_reward": 0.0,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 1507.7619323730469,
"epoch": 1.768,
"grad_norm": 0.14613445103168488,
"kl": 0.152099609375,
"learning_rate": 1.3638909733514452e-07,
"loss": -0.0284,
"reward": -0.09447834640741348,
"reward_std": 0.09266001731157303,
"rewards/cosine_scaled_reward": -0.04723917320370674,
"rewards/format_reward": 0.0,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 1516.875,
"epoch": 1.772,
"grad_norm": 0.18538497388362885,
"kl": 0.15966796875,
"learning_rate": 1.351615817851748e-07,
"loss": -0.0153,
"reward": -0.08249685540795326,
"reward_std": 0.09769860841333866,
"rewards/cosine_scaled_reward": -0.04124843003228307,
"rewards/format_reward": 0.0,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 1501.8155212402344,
"epoch": 1.776,
"grad_norm": 0.1319953352212906,
"kl": 0.155029296875,
"learning_rate": 1.3395428487445914e-07,
"loss": -0.039,
"reward": -0.09754344820976257,
"reward_std": 0.11035412549972534,
"rewards/cosine_scaled_reward": -0.04877172317355871,
"rewards/format_reward": 0.0,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 1511.9821472167969,
"epoch": 1.78,
"grad_norm": 0.1029873788356781,
"kl": 0.1484375,
"learning_rate": 1.3276726544494571e-07,
"loss": -0.0251,
"reward": -0.0899391695857048,
"reward_std": 0.10835397988557816,
"rewards/cosine_scaled_reward": -0.04496958386152983,
"rewards/format_reward": 0.0,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 1469.40478515625,
"epoch": 1.784,
"grad_norm": 0.08266568928956985,
"kl": 0.158935546875,
"learning_rate": 1.316005813502869e-07,
"loss": -0.0788,
"reward": -0.10532401315867901,
"reward_std": 0.12493490241467953,
"rewards/cosine_scaled_reward": -0.05266200751066208,
"rewards/format_reward": 0.0,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 1505.5952453613281,
"epoch": 1.788,
"grad_norm": 0.13063663244247437,
"kl": 0.15576171875,
"learning_rate": 1.3045428945301953e-07,
"loss": -0.0317,
"reward": -0.022020583972334862,
"reward_std": 0.1138888020068407,
"rewards/cosine_scaled_reward": -0.011010290123522282,
"rewards/format_reward": 0.0,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 1493.7142944335938,
"epoch": 1.792,
"grad_norm": 0.1297776997089386,
"kl": 0.17236328125,
"learning_rate": 1.2932844562179352e-07,
"loss": -0.0294,
"reward": -0.08543841261416674,
"reward_std": 0.09460597112774849,
"rewards/cosine_scaled_reward": -0.042719203513115644,
"rewards/format_reward": 0.0,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 1521.1071472167969,
"epoch": 1.796,
"grad_norm": 0.1407863050699234,
"kl": 0.16357421875,
"learning_rate": 1.2822310472864885e-07,
"loss": -0.0138,
"reward": -0.10117548704147339,
"reward_std": 0.12595792300999165,
"rewards/cosine_scaled_reward": -0.05058774631470442,
"rewards/format_reward": 0.0,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 1507.1964721679688,
"epoch": 1.8,
"grad_norm": 0.16165214776992798,
"kl": 0.1787109375,
"learning_rate": 1.2713832064634125e-07,
"loss": -0.0099,
"reward": -0.08527638856321573,
"reward_std": 0.09594122413545847,
"rewards/cosine_scaled_reward": -0.04263819335028529,
"rewards/format_reward": 0.0,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 1511.7202758789062,
"epoch": 1.804,
"grad_norm": 0.1142469272017479,
"kl": 0.163818359375,
"learning_rate": 1.260741462457165e-07,
"loss": -0.023,
"reward": -0.09475222788751125,
"reward_std": 0.10312853008508682,
"rewards/cosine_scaled_reward": -0.04737611673772335,
"rewards/format_reward": 0.0,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 1527.5654907226562,
"epoch": 1.808,
"grad_norm": 0.1640588343143463,
"kl": 0.171142578125,
"learning_rate": 1.2503063339313356e-07,
"loss": -0.0028,
"reward": -0.09184761717915535,
"reward_std": 0.09996213018894196,
"rewards/cosine_scaled_reward": -0.0459238076582551,
"rewards/format_reward": 0.0,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 1493.8036193847656,
"epoch": 1.812,
"grad_norm": 0.07719198614358902,
"kl": 0.177978515625,
"learning_rate": 1.2400783294793668e-07,
"loss": -0.0428,
"reward": -0.10916751623153687,
"reward_std": 0.12887151166796684,
"rewards/cosine_scaled_reward": -0.054583752527832985,
"rewards/format_reward": 0.0,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 1504.3631286621094,
"epoch": 1.8159999999999998,
"grad_norm": 0.12536108493804932,
"kl": 0.152587890625,
"learning_rate": 1.2300579475997657e-07,
"loss": -0.038,
"reward": -0.1067353542894125,
"reward_std": 0.12835400737822056,
"rewards/cosine_scaled_reward": -0.05336767714470625,
"rewards/format_reward": 0.0,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 1507.77978515625,
"epoch": 1.8199999999999998,
"grad_norm": 0.1128176897764206,
"kl": 0.16259765625,
"learning_rate": 1.220245676671809e-07,
"loss": -0.0177,
"reward": -0.10911162942647934,
"reward_std": 0.13343517668545246,
"rewards/cosine_scaled_reward": -0.054555815644562244,
"rewards/format_reward": 0.0,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 1536.0,
"epoch": 1.8239999999999998,
"grad_norm": 0.13748064637184143,
"kl": 0.172119140625,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0069,
"reward": -0.07034523971378803,
"reward_std": 0.07846208661794662,
"rewards/cosine_scaled_reward": -0.03517262078821659,
"rewards/format_reward": 0.0,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 1492.8690490722656,
"epoch": 1.8279999999999998,
"grad_norm": 0.1389494091272354,
"kl": 0.152099609375,
"learning_rate": 1.2012473704494537e-07,
"loss": -0.0455,
"reward": -0.11908807791769505,
"reward_std": 0.13069945573806763,
"rewards/cosine_scaled_reward": -0.059544037096202374,
"rewards/format_reward": 0.0,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 1499.9464721679688,
"epoch": 1.8319999999999999,
"grad_norm": 0.1537049114704132,
"kl": 0.1683349609375,
"learning_rate": 1.1920622611056974e-07,
"loss": -0.0307,
"reward": -0.08774650190025568,
"reward_std": 0.10440967045724392,
"rewards/cosine_scaled_reward": -0.043873251881450415,
"rewards/format_reward": 0.0,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 1524.2083435058594,
"epoch": 1.8359999999999999,
"grad_norm": 0.13081440329551697,
"kl": 0.176513671875,
"learning_rate": 1.1830871145697412e-07,
"loss": -0.0066,
"reward": -0.08506089821457863,
"reward_std": 0.09712946228682995,
"rewards/cosine_scaled_reward": -0.04253045003861189,
"rewards/format_reward": 0.0,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 1502.9583435058594,
"epoch": 1.8399999999999999,
"grad_norm": 0.09031596034765244,
"kl": 0.154296875,
"learning_rate": 1.1743223682775649e-07,
"loss": -0.0358,
"reward": -0.09923446178436279,
"reward_std": 0.11484255269169807,
"rewards/cosine_scaled_reward": -0.049617230892181396,
"rewards/format_reward": 0.0,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 1514.5059509277344,
"epoch": 1.8439999999999999,
"grad_norm": 0.15131881833076477,
"kl": 0.161376953125,
"learning_rate": 1.1657684494105386e-07,
"loss": -0.0215,
"reward": -0.09375773929059505,
"reward_std": 0.1126671563833952,
"rewards/cosine_scaled_reward": -0.04687886871397495,
"rewards/format_reward": 0.0,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 1515.2678833007812,
"epoch": 1.8479999999999999,
"grad_norm": 0.08426119387149811,
"kl": 0.15576171875,
"learning_rate": 1.1574257748745986e-07,
"loss": -0.0228,
"reward": -0.09476478770375252,
"reward_std": 0.10140549577772617,
"rewards/cosine_scaled_reward": -0.04738239198923111,
"rewards/format_reward": 0.0,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 1512.8333435058594,
"epoch": 1.8519999999999999,
"grad_norm": 0.08592584729194641,
"kl": 0.17333984375,
"learning_rate": 1.1492947512799328e-07,
"loss": -0.0256,
"reward": -0.011012900620698929,
"reward_std": 0.08133355341851711,
"rewards/cosine_scaled_reward": -0.005506448447704315,
"rewards/format_reward": 0.0,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 1515.3511962890625,
"epoch": 1.8559999999999999,
"grad_norm": 0.1047179102897644,
"kl": 0.17529296875,
"learning_rate": 1.1413757749211602e-07,
"loss": -0.0208,
"reward": -0.08088574931025505,
"reward_std": 0.09925234131515026,
"rewards/cosine_scaled_reward": -0.040442874655127525,
"rewards/format_reward": 0.0,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 1506.4642944335938,
"epoch": 1.8599999999999999,
"grad_norm": 0.1791323721408844,
"kl": 0.177734375,
"learning_rate": 1.1336692317580158e-07,
"loss": -0.0299,
"reward": -0.09140351600944996,
"reward_std": 0.1100204586982727,
"rewards/cosine_scaled_reward": -0.045701757073402405,
"rewards/format_reward": 0.0,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 1522.0535583496094,
"epoch": 1.8639999999999999,
"grad_norm": 0.13129960000514984,
"kl": 0.1676025390625,
"learning_rate": 1.1261754973965422e-07,
"loss": -0.0129,
"reward": -0.08949675410985947,
"reward_std": 0.09439942799508572,
"rewards/cosine_scaled_reward": -0.04474837705492973,
"rewards/format_reward": 0.0,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 1484.0535888671875,
"epoch": 1.8679999999999999,
"grad_norm": 0.13218103349208832,
"kl": 0.15966796875,
"learning_rate": 1.1188949370707787e-07,
"loss": -0.0593,
"reward": -0.11925767548382282,
"reward_std": 0.15814346075057983,
"rewards/cosine_scaled_reward": -0.05962884332984686,
"rewards/format_reward": 0.0,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 1496.4702453613281,
"epoch": 1.8719999999999999,
"grad_norm": 0.09863686561584473,
"kl": 0.146484375,
"learning_rate": 1.1118279056249653e-07,
"loss": -0.0405,
"reward": -0.0985277071595192,
"reward_std": 0.11279423907399178,
"rewards/cosine_scaled_reward": -0.04926385171711445,
"rewards/format_reward": 0.0,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 1515.8869323730469,
"epoch": 1.876,
"grad_norm": 0.09514996409416199,
"kl": 0.158203125,
"learning_rate": 1.1049747474962444e-07,
"loss": -0.0164,
"reward": -0.08131754398345947,
"reward_std": 0.0914676021784544,
"rewards/cosine_scaled_reward": -0.04065877292305231,
"rewards/format_reward": 0.0,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 1525.4464416503906,
"epoch": 1.88,
"grad_norm": 0.11126792430877686,
"kl": 0.158447265625,
"learning_rate": 1.0983357966978745e-07,
"loss": -0.0064,
"reward": -0.0882963128387928,
"reward_std": 0.09903069026768208,
"rewards/cosine_scaled_reward": -0.04414815828204155,
"rewards/format_reward": 0.0,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 1502.3392944335938,
"epoch": 1.884,
"grad_norm": 0.12479417026042938,
"kl": 0.168701171875,
"learning_rate": 1.0919113768029517e-07,
"loss": -0.0409,
"reward": -0.07126700505614281,
"reward_std": 0.08810876682400703,
"rewards/cosine_scaled_reward": -0.035633502528071404,
"rewards/format_reward": 0.0,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 1523.3511962890625,
"epoch": 1.888,
"grad_norm": 0.15230634808540344,
"kl": 0.1669921875,
"learning_rate": 1.0857018009286381e-07,
"loss": -0.0107,
"reward": -0.07650433294475079,
"reward_std": 0.09276540018618107,
"rewards/cosine_scaled_reward": -0.03825216554105282,
"rewards/format_reward": 0.0,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 1529.2916564941406,
"epoch": 1.892,
"grad_norm": 0.10235889256000519,
"kl": 0.163818359375,
"learning_rate": 1.0797073717209013e-07,
"loss": -0.0031,
"reward": -0.08031682576984167,
"reward_std": 0.08574636466801167,
"rewards/cosine_scaled_reward": -0.0401584105566144,
"rewards/format_reward": 0.0,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 1514.4643249511719,
"epoch": 1.896,
"grad_norm": 0.24448621273040771,
"kl": 0.17724609375,
"learning_rate": 1.0739283813397639e-07,
"loss": -0.0168,
"reward": -0.09807473048567772,
"reward_std": 0.11232626810669899,
"rewards/cosine_scaled_reward": -0.049037366174161434,
"rewards/format_reward": 0.0,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 1508.8155212402344,
"epoch": 1.9,
"grad_norm": 0.11423542350530624,
"kl": 0.15625,
"learning_rate": 1.068365111445064e-07,
"loss": -0.0138,
"reward": -0.09042352437973022,
"reward_std": 0.10206466354429722,
"rewards/cosine_scaled_reward": -0.04521176405251026,
"rewards/format_reward": 0.0,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 1506.4404907226562,
"epoch": 1.904,
"grad_norm": 0.1770845502614975,
"kl": 0.150390625,
"learning_rate": 1.063017833182728e-07,
"loss": -0.0315,
"reward": -0.1031611617654562,
"reward_std": 0.11722332611680031,
"rewards/cosine_scaled_reward": -0.051580581814050674,
"rewards/format_reward": 0.0,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 1512.5178527832031,
"epoch": 1.908,
"grad_norm": 0.1035989373922348,
"kl": 0.155517578125,
"learning_rate": 1.0578868071715544e-07,
"loss": -0.0263,
"reward": -0.10269530303776264,
"reward_std": 0.13116441946476698,
"rewards/cosine_scaled_reward": -0.05134765151888132,
"rewards/format_reward": 0.0,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 1506.7083435058594,
"epoch": 1.912,
"grad_norm": 0.1195254847407341,
"kl": 0.162841796875,
"learning_rate": 1.0529722834905125e-07,
"loss": -0.0243,
"reward": -0.10078963078558445,
"reward_std": 0.11876899935305119,
"rewards/cosine_scaled_reward": -0.050394815392792225,
"rewards/format_reward": 0.0,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 1515.0952453613281,
"epoch": 1.916,
"grad_norm": 0.1497546136379242,
"kl": 0.1513671875,
"learning_rate": 1.0482745016665526e-07,
"loss": -0.0204,
"reward": -0.10379143245518208,
"reward_std": 0.11940331198275089,
"rewards/cosine_scaled_reward": -0.05189571529626846,
"rewards/format_reward": 0.0,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 1522.8690490722656,
"epoch": 1.92,
"grad_norm": 0.12198466807603836,
"kl": 0.169189453125,
"learning_rate": 1.0437936906629334e-07,
"loss": -0.0117,
"reward": -0.0045996010303497314,
"reward_std": 0.09211089462041855,
"rewards/cosine_scaled_reward": -0.002299800980836153,
"rewards/format_reward": 0.0,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 1498.5833740234375,
"epoch": 1.924,
"grad_norm": 0.10352538526058197,
"kl": 0.14697265625,
"learning_rate": 1.0395300688680625e-07,
"loss": -0.0433,
"reward": -0.1318805105984211,
"reward_std": 0.15495008416473866,
"rewards/cosine_scaled_reward": -0.06594025250524282,
"rewards/format_reward": 0.0,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 1517.107177734375,
"epoch": 1.928,
"grad_norm": 0.08769793808460236,
"kl": 0.14990234375,
"learning_rate": 1.0354838440848501e-07,
"loss": -0.0207,
"reward": -0.10253190249204636,
"reward_std": 0.121914217248559,
"rewards/cosine_scaled_reward": -0.05126595124602318,
"rewards/format_reward": 0.0,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 1523.0059814453125,
"epoch": 1.932,
"grad_norm": 0.09510423243045807,
"kl": 0.16064453125,
"learning_rate": 1.0316552135205837e-07,
"loss": -0.0073,
"reward": -0.09257967211306095,
"reward_std": 0.09288883674889803,
"rewards/cosine_scaled_reward": -0.0462898388504982,
"rewards/format_reward": 0.0,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 1522.1666870117188,
"epoch": 1.936,
"grad_norm": 0.11582231521606445,
"kl": 0.159912109375,
"learning_rate": 1.0280443637773163e-07,
"loss": -0.013,
"reward": -0.09562139585614204,
"reward_std": 0.10689939372241497,
"rewards/cosine_scaled_reward": -0.04781070165336132,
"rewards/format_reward": 0.0,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 1527.6428527832031,
"epoch": 1.94,
"grad_norm": 0.1510264277458191,
"kl": 0.168701171875,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0057,
"reward": -0.07166448421776295,
"reward_std": 0.06708121951669455,
"rewards/cosine_scaled_reward": -0.03583224397152662,
"rewards/format_reward": 0.0,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 1508.6607360839844,
"epoch": 1.944,
"grad_norm": 0.10827223211526871,
"kl": 0.153076171875,
"learning_rate": 1.0214767000817596e-07,
"loss": -0.0092,
"reward": -0.077840281650424,
"reward_std": 0.09841375425457954,
"rewards/cosine_scaled_reward": -0.038920141756534576,
"rewards/format_reward": 0.0,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 1515.2857360839844,
"epoch": 1.948,
"grad_norm": 0.08925757557153702,
"kl": 0.1376953125,
"learning_rate": 1.0185202062281336e-07,
"loss": -0.0088,
"reward": -0.0860859602689743,
"reward_std": 0.10117382928729057,
"rewards/cosine_scaled_reward": -0.04304297920316458,
"rewards/format_reward": 0.0,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 1510.1130981445312,
"epoch": 1.952,
"grad_norm": 0.1778467744588852,
"kl": 0.160400390625,
"learning_rate": 1.0157821333772304e-07,
"loss": -0.0269,
"reward": -0.08845487236976624,
"reward_std": 0.09980816766619682,
"rewards/cosine_scaled_reward": -0.04422743525356054,
"rewards/format_reward": 0.0,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 1523.482177734375,
"epoch": 1.956,
"grad_norm": 0.09875297546386719,
"kl": 0.162841796875,
"learning_rate": 1.013262614978859e-07,
"loss": -0.0109,
"reward": -0.08624122757464647,
"reward_std": 0.09759997017681599,
"rewards/cosine_scaled_reward": -0.04312061285600066,
"rewards/format_reward": 0.0,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 1488.8929138183594,
"epoch": 1.96,
"grad_norm": 0.10072106122970581,
"kl": 0.16064453125,
"learning_rate": 1.0109617738307911e-07,
"loss": -0.0561,
"reward": -0.1024992810562253,
"reward_std": 0.1395698133856058,
"rewards/cosine_scaled_reward": -0.051249639596790075,
"rewards/format_reward": 0.0,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 1507.6190795898438,
"epoch": 1.964,
"grad_norm": 0.14594227075576782,
"kl": 0.170166015625,
"learning_rate": 1.0088797220727779e-07,
"loss": -0.0329,
"reward": -0.09847836010158062,
"reward_std": 0.12085962668061256,
"rewards/cosine_scaled_reward": -0.049239179119467735,
"rewards/format_reward": 0.0,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 1515.52978515625,
"epoch": 1.968,
"grad_norm": 0.1781827211380005,
"kl": 0.171875,
"learning_rate": 1.0070165611810855e-07,
"loss": -0.0183,
"reward": -0.09458879381418228,
"reward_std": 0.10490395873785019,
"rewards/cosine_scaled_reward": -0.047294397838413715,
"rewards/format_reward": 0.0,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 1510.4464416503906,
"epoch": 1.972,
"grad_norm": 0.14723263680934906,
"kl": 0.166259765625,
"learning_rate": 1.005372381963547e-07,
"loss": -0.0245,
"reward": -0.08563583716750145,
"reward_std": 0.09580126218497753,
"rewards/cosine_scaled_reward": -0.04281791765242815,
"rewards/format_reward": 0.0,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 1508.9822082519531,
"epoch": 1.976,
"grad_norm": 0.08827134966850281,
"kl": 0.171142578125,
"learning_rate": 1.0039472645551372e-07,
"loss": -0.0314,
"reward": -0.09158815257251263,
"reward_std": 0.11639940552413464,
"rewards/cosine_scaled_reward": -0.04579407814890146,
"rewards/format_reward": 0.0,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 1508.8095397949219,
"epoch": 1.98,
"grad_norm": 0.09312310069799423,
"kl": 0.15380859375,
"learning_rate": 1.002741278414069e-07,
"loss": -0.0125,
"reward": -0.08922230452299118,
"reward_std": 0.09334707166999578,
"rewards/cosine_scaled_reward": -0.044611155055463314,
"rewards/format_reward": 0.0,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 1511.452392578125,
"epoch": 1.984,
"grad_norm": 0.10467734187841415,
"kl": 0.14697265625,
"learning_rate": 1.0017544823184055e-07,
"loss": -0.0302,
"reward": -0.09572159126400948,
"reward_std": 0.11608831025660038,
"rewards/cosine_scaled_reward": -0.04786079656332731,
"rewards/format_reward": 0.0,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 1488.7262573242188,
"epoch": 1.988,
"grad_norm": 0.11462484300136566,
"kl": 0.152099609375,
"learning_rate": 1.0009869243631952e-07,
"loss": -0.0413,
"reward": -0.10070546343922615,
"reward_std": 0.12976408563554287,
"rewards/cosine_scaled_reward": -0.05035272892564535,
"rewards/format_reward": 0.0,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 1513.2143249511719,
"epoch": 1.992,
"grad_norm": 0.09213641285896301,
"kl": 0.135498046875,
"learning_rate": 1.000438641958131e-07,
"loss": -0.0224,
"reward": -0.09420822747051716,
"reward_std": 0.1108301505446434,
"rewards/cosine_scaled_reward": -0.04710411373525858,
"rewards/format_reward": 0.0,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 1523.3511962890625,
"epoch": 1.996,
"grad_norm": 0.10646738111972809,
"kl": 0.1552734375,
"learning_rate": 1.0001096618257236e-07,
"loss": -0.0112,
"reward": -0.10537549015134573,
"reward_std": 0.11287760734558105,
"rewards/cosine_scaled_reward": -0.052687746938318014,
"rewards/format_reward": 0.0,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 1513.0893859863281,
"epoch": 2.0,
"grad_norm": 0.12409133464097977,
"kl": 0.166015625,
"learning_rate": 1e-07,
"loss": -0.0134,
"reward": -0.08198001235723495,
"reward_std": 0.0940225888043642,
"rewards/cosine_scaled_reward": -0.04099000431597233,
"rewards/format_reward": 0.0,
"step": 500
},
{
"epoch": 2.0,
"step": 500,
"total_flos": 0.0,
"train_loss": -0.028187389324251855,
"train_runtime": 65552.2916,
"train_samples_per_second": 1.281,
"train_steps_per_second": 0.008
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}