OpenRS-GRPO / trainer_state.json
KKKKKON's picture
Model save
5408bb6 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4285408185129634,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 1990.0278625488281,
"epoch": 0.0008570816370259267,
"grad_norm": 0.7143716812133789,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.0768,
"reward": 0.7364962100982666,
"reward_std": 0.8344592750072479,
"rewards/cosine_scaled_reward": 0.3335258811712265,
"rewards/format_reward": 0.0694444477558136,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 2242.5694885253906,
"epoch": 0.0017141632740518534,
"grad_norm": 0.3701298236846924,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": 0.0188,
"reward": 0.2869503181427717,
"reward_std": 0.7584073394536972,
"rewards/cosine_scaled_reward": 0.10180849023163319,
"rewards/format_reward": 0.08333333488553762,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 2240.2083740234375,
"epoch": 0.0025712449110777804,
"grad_norm": 0.289852112531662,
"kl": -4.544854164123535e-06,
"learning_rate": 6e-08,
"loss": 0.0887,
"reward": 0.19659454189240932,
"reward_std": 0.6517409235239029,
"rewards/cosine_scaled_reward": 0.04274171104407287,
"rewards/format_reward": 0.11111111287027597,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 1926.1667175292969,
"epoch": 0.0034283265481037067,
"grad_norm": 1.4475386142730713,
"kl": 5.131587386131287e-06,
"learning_rate": 8e-08,
"loss": 0.1401,
"reward": 0.6266543348319829,
"reward_std": 0.7619214951992035,
"rewards/cosine_scaled_reward": 0.25082714576274157,
"rewards/format_reward": 0.12500000186264515,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2068.638916015625,
"epoch": 0.004285408185129634,
"grad_norm": 0.492279052734375,
"kl": -6.013549864292145e-06,
"learning_rate": 1e-07,
"loss": 0.0131,
"reward": 0.8806147426366806,
"reward_std": 0.7828683108091354,
"rewards/cosine_scaled_reward": 0.3500296138226986,
"rewards/format_reward": 0.18055556155741215,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 2029.3888854980469,
"epoch": 0.005142489822155561,
"grad_norm": 0.5861327052116394,
"kl": 4.366040229797363e-06,
"learning_rate": 1.2e-07,
"loss": -0.0572,
"reward": 0.7398289144039154,
"reward_std": 0.8924512416124344,
"rewards/cosine_scaled_reward": 0.2796366587281227,
"rewards/format_reward": 0.18055556062608957,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 1942.5416564941406,
"epoch": 0.005999571459181487,
"grad_norm": 1.1912319660186768,
"kl": 4.488509148359299e-06,
"learning_rate": 1.4e-07,
"loss": 0.0059,
"reward": 0.23400097712874413,
"reward_std": 0.7279467135667801,
"rewards/cosine_scaled_reward": 0.07533382624387741,
"rewards/format_reward": 0.08333333395421505,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 2180.8750610351562,
"epoch": 0.0068566530962074134,
"grad_norm": 0.6393517851829529,
"kl": -4.702596925199032e-06,
"learning_rate": 1.6e-07,
"loss": -0.0338,
"reward": 0.6042829677462578,
"reward_std": 0.8216739147901535,
"rewards/cosine_scaled_reward": 0.26047481037676334,
"rewards/format_reward": 0.08333333395421505,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 2086.1944885253906,
"epoch": 0.00771373473323334,
"grad_norm": 0.8930106163024902,
"kl": 5.990266799926758e-06,
"learning_rate": 1.8e-07,
"loss": -0.1895,
"reward": 0.5918732397258282,
"reward_std": 0.6417871415615082,
"rewards/cosine_scaled_reward": 0.2264921732712537,
"rewards/format_reward": 0.13888888992369175,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 2342.5416870117188,
"epoch": 0.008570816370259268,
"grad_norm": 0.20637141168117523,
"kl": 1.8426217138767242e-06,
"learning_rate": 2e-07,
"loss": -0.0414,
"reward": 0.5427078725770116,
"reward_std": 0.6480782926082611,
"rewards/cosine_scaled_reward": 0.22968729073181748,
"rewards/format_reward": 0.08333333395421505,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2251.083282470703,
"epoch": 0.009427898007285194,
"grad_norm": 0.13938356935977936,
"kl": -3.427267074584961e-07,
"learning_rate": 2.1999999999999998e-07,
"loss": -0.0399,
"reward": 0.4674246795475483,
"reward_std": 0.9071184694766998,
"rewards/cosine_scaled_reward": 0.20593456458300352,
"rewards/format_reward": 0.0555555559694767,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 2086.4306030273438,
"epoch": 0.010284979644311121,
"grad_norm": 0.17967204749584198,
"kl": 4.600733518600464e-07,
"learning_rate": 2.4e-07,
"loss": 0.1108,
"reward": 0.7090050615370274,
"reward_std": 0.9076867997646332,
"rewards/cosine_scaled_reward": 0.31978030502796173,
"rewards/format_reward": 0.06944444589316845,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 2177.7500610351562,
"epoch": 0.011142061281337047,
"grad_norm": 0.6705565452575684,
"kl": -9.059906005859375e-06,
"learning_rate": 2.6e-07,
"loss": -0.107,
"reward": 0.31611130852252245,
"reward_std": 0.700124979019165,
"rewards/cosine_scaled_reward": 0.10250008956063539,
"rewards/format_reward": 0.1111111119389534,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 1980.0971984863281,
"epoch": 0.011999142918362973,
"grad_norm": 0.4753912091255188,
"kl": 1.8067657947540283e-06,
"learning_rate": 2.8e-07,
"loss": -0.1518,
"reward": 0.5453062728047371,
"reward_std": 0.7382813096046448,
"rewards/cosine_scaled_reward": 0.21709759440273046,
"rewards/format_reward": 0.1111111119389534,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 2061.277801513672,
"epoch": 0.012856224555388901,
"grad_norm": 1.0317578315734863,
"kl": -4.398170858621597e-06,
"learning_rate": 3e-07,
"loss": 0.1252,
"reward": 0.7395287081599236,
"reward_std": 0.9806090295314789,
"rewards/cosine_scaled_reward": 0.3350421413779259,
"rewards/format_reward": 0.06944444589316845,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 1978.9167175292969,
"epoch": 0.013713306192414827,
"grad_norm": 0.8236006498336792,
"kl": 9.63360071182251e-06,
"learning_rate": 3.2e-07,
"loss": 0.0218,
"reward": 0.6508458182215691,
"reward_std": 0.8475509732961655,
"rewards/cosine_scaled_reward": 0.2768117773812264,
"rewards/format_reward": 0.0972222238779068,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 1865.6111602783203,
"epoch": 0.014570387829440755,
"grad_norm": 0.5406652092933655,
"kl": 7.767230272293091e-07,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0741,
"reward": 0.8166351541876793,
"reward_std": 0.9249791204929352,
"rewards/cosine_scaled_reward": 0.33887312887236476,
"rewards/format_reward": 0.13888889085501432,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 2117.500030517578,
"epoch": 0.01542746946646668,
"grad_norm": 0.34515851736068726,
"kl": 2.3096799850463867e-06,
"learning_rate": 3.6e-07,
"loss": 0.1115,
"reward": 0.7321371585130692,
"reward_std": 0.6702713221311569,
"rewards/cosine_scaled_reward": 0.3105130400508642,
"rewards/format_reward": 0.11111111473292112,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 1973.0972290039062,
"epoch": 0.016284551103492608,
"grad_norm": 1.3012654781341553,
"kl": 4.159286618232727e-06,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.1323,
"reward": 0.28933676797896624,
"reward_std": 0.875898152589798,
"rewards/cosine_scaled_reward": 0.10994616383686662,
"rewards/format_reward": 0.06944444496184587,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 2246.500030517578,
"epoch": 0.017141632740518536,
"grad_norm": 0.23211120069026947,
"kl": -1.0263174772262573e-05,
"learning_rate": 4e-07,
"loss": 0.0164,
"reward": 0.12983610574156046,
"reward_std": 0.7194785475730896,
"rewards/cosine_scaled_reward": -0.004526391625404358,
"rewards/format_reward": 0.1388888917863369,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1989.7916870117188,
"epoch": 0.01799871437754446,
"grad_norm": 0.2483261376619339,
"kl": -4.772096872329712e-06,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0647,
"reward": 0.2600990349892527,
"reward_std": 0.7444702684879303,
"rewards/cosine_scaled_reward": 0.06754951924085617,
"rewards/format_reward": 0.12500000279396772,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 2344.8472900390625,
"epoch": 0.018855796014570388,
"grad_norm": 0.531119704246521,
"kl": -5.930662155151367e-06,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0059,
"reward": 0.31322263344191015,
"reward_std": 0.7910206019878387,
"rewards/cosine_scaled_reward": 0.10105575760826468,
"rewards/format_reward": 0.11111111287027597,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 2217.40283203125,
"epoch": 0.019712877651596315,
"grad_norm": 0.4045103192329407,
"kl": -9.08970832824707e-06,
"learning_rate": 4.6e-07,
"loss": -0.0425,
"reward": 0.28447722643613815,
"reward_std": 0.7937969118356705,
"rewards/cosine_scaled_reward": 0.12140527740120888,
"rewards/format_reward": 0.041666666977107525,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 1954.6805725097656,
"epoch": 0.020569959288622243,
"grad_norm": 1.1069881916046143,
"kl": 3.439374268054962e-06,
"learning_rate": 4.8e-07,
"loss": -0.04,
"reward": 0.61127008497715,
"reward_std": 0.621691383421421,
"rewards/cosine_scaled_reward": 0.22924613999202847,
"rewards/format_reward": 0.15277778077870607,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1864.0972595214844,
"epoch": 0.021427040925648167,
"grad_norm": 0.13546203076839447,
"kl": 1.4692544937133789e-05,
"learning_rate": 5e-07,
"loss": -0.0367,
"reward": 0.8168461695313454,
"reward_std": 0.8645190075039864,
"rewards/cosine_scaled_reward": 0.33897863049060106,
"rewards/format_reward": 0.1388888917863369,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 2138.2638549804688,
"epoch": 0.022284122562674095,
"grad_norm": 0.23594234883785248,
"kl": -1.7136335372924805e-06,
"learning_rate": 5.2e-07,
"loss": -0.0366,
"reward": 0.2813246757723391,
"reward_std": 0.6457278877496719,
"rewards/cosine_scaled_reward": 0.09899566043168306,
"rewards/format_reward": 0.08333333488553762,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 2089.0556030273438,
"epoch": 0.023141204199700022,
"grad_norm": 1.0738513469696045,
"kl": 5.405396223068237e-06,
"learning_rate": 5.4e-07,
"loss": 0.1218,
"reward": 0.6343957483768463,
"reward_std": 0.7526431530714035,
"rewards/cosine_scaled_reward": 0.24775342142675072,
"rewards/format_reward": 0.1388888917863369,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 2116.638916015625,
"epoch": 0.023998285836725947,
"grad_norm": 1.4345226287841797,
"kl": 3.216089680790901e-06,
"learning_rate": 5.6e-07,
"loss": 0.0098,
"reward": 0.5014659259468317,
"reward_std": 0.8205202594399452,
"rewards/cosine_scaled_reward": 0.20212182961404324,
"rewards/format_reward": 0.09722222294658422,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 1885.0278015136719,
"epoch": 0.024855367473751874,
"grad_norm": 0.29304057359695435,
"kl": 4.43682074546814e-06,
"learning_rate": 5.8e-07,
"loss": 0.0183,
"reward": 0.4122583381831646,
"reward_std": 0.7856339067220688,
"rewards/cosine_scaled_reward": 0.1644625086337328,
"rewards/format_reward": 0.08333333395421505,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 2169.583282470703,
"epoch": 0.025712449110777802,
"grad_norm": 0.21029269695281982,
"kl": 1.8927734345197678e-05,
"learning_rate": 6e-07,
"loss": 0.0248,
"reward": 0.484335083514452,
"reward_std": 0.7596095502376556,
"rewards/cosine_scaled_reward": 0.18661198392510414,
"rewards/format_reward": 0.11111111380159855,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 1893.3333129882812,
"epoch": 0.02656953074780373,
"grad_norm": 0.18814511597156525,
"kl": 0.00010308623313903809,
"learning_rate": 6.2e-07,
"loss": -0.0622,
"reward": 0.408107977360487,
"reward_std": 0.7381277531385422,
"rewards/cosine_scaled_reward": 0.14849842991679907,
"rewards/format_reward": 0.11111111380159855,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 2393.0694580078125,
"epoch": 0.027426612384829654,
"grad_norm": 0.4075649380683899,
"kl": 3.24249267578125e-05,
"learning_rate": 6.4e-07,
"loss": -0.1087,
"reward": 0.22196191549301147,
"reward_std": 0.7661005556583405,
"rewards/cosine_scaled_reward": 0.06236985884606838,
"rewards/format_reward": 0.09722222294658422,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 2272.3194580078125,
"epoch": 0.02828369402185558,
"grad_norm": 0.21960391104221344,
"kl": 7.766485214233398e-05,
"learning_rate": 6.6e-07,
"loss": 0.0847,
"reward": 0.4620617777109146,
"reward_std": 0.8374988958239555,
"rewards/cosine_scaled_reward": 0.1268642134964466,
"rewards/format_reward": 0.20833334047347307,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 2146.1527709960938,
"epoch": 0.02914077565888151,
"grad_norm": 0.21705187857151031,
"kl": 0.00015848875045776367,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0208,
"reward": 0.228340620175004,
"reward_std": 0.7516879141330719,
"rewards/cosine_scaled_reward": 0.09333697834517807,
"rewards/format_reward": 0.041666666977107525,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 2046.4166870117188,
"epoch": 0.029997857295907437,
"grad_norm": 0.32378724217414856,
"kl": 7.605552673339844e-05,
"learning_rate": 7e-07,
"loss": 0.0799,
"reward": 0.47813151963055134,
"reward_std": 0.6827547252178192,
"rewards/cosine_scaled_reward": 0.19739909376949072,
"rewards/format_reward": 0.08333333395421505,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 2243.15283203125,
"epoch": 0.03085493893293336,
"grad_norm": 0.3061051070690155,
"kl": 0.0001518726348876953,
"learning_rate": 7.2e-07,
"loss": 0.0024,
"reward": 0.5015687884879299,
"reward_std": 0.8517486453056335,
"rewards/cosine_scaled_reward": 0.21606217822409235,
"rewards/format_reward": 0.06944444589316845,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 2279.791717529297,
"epoch": 0.03171202056995929,
"grad_norm": 0.6476448178291321,
"kl": 0.0002086162567138672,
"learning_rate": 7.4e-07,
"loss": 0.1123,
"reward": 0.4819689504802227,
"reward_std": 0.8821997940540314,
"rewards/cosine_scaled_reward": 0.1715400367975235,
"rewards/format_reward": 0.13888889085501432,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 2446.9166870117188,
"epoch": 0.032569102206985216,
"grad_norm": 0.4492949843406677,
"kl": 0.00011676549911499023,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0675,
"reward": 0.4931858219206333,
"reward_std": 0.8364229053258896,
"rewards/cosine_scaled_reward": 0.1702040210366249,
"rewards/format_reward": 0.15277778171002865,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 2038.7083740234375,
"epoch": 0.033426183844011144,
"grad_norm": 0.1742168515920639,
"kl": 0.00026722252368927,
"learning_rate": 7.799999999999999e-07,
"loss": 0.1608,
"reward": 0.03227643854916096,
"reward_std": 0.6348245367407799,
"rewards/cosine_scaled_reward": -0.018583996687084436,
"rewards/format_reward": 0.06944444496184587,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 2093.7361450195312,
"epoch": 0.03428326548103707,
"grad_norm": 0.3163558542728424,
"kl": 0.0002613067626953125,
"learning_rate": 8e-07,
"loss": -0.0057,
"reward": 0.6369250733405352,
"reward_std": 0.8408300578594208,
"rewards/cosine_scaled_reward": 0.29068473260849714,
"rewards/format_reward": 0.0555555559694767,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 2250.875,
"epoch": 0.03514034711806299,
"grad_norm": 0.24971450865268707,
"kl": 0.0001220703125,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0645,
"reward": 0.6921100318431854,
"reward_std": 0.6361059248447418,
"rewards/cosine_scaled_reward": 0.3043883480131626,
"rewards/format_reward": 0.08333333395421505,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 2280.763916015625,
"epoch": 0.03599742875508892,
"grad_norm": 0.2735660672187805,
"kl": 0.0002765655517578125,
"learning_rate": 8.399999999999999e-07,
"loss": -0.0169,
"reward": 0.6382394358515739,
"reward_std": 0.6768579035997391,
"rewards/cosine_scaled_reward": 0.2982863746583462,
"rewards/format_reward": 0.041666666977107525,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 2201.75,
"epoch": 0.03685451039211485,
"grad_norm": 0.28408563137054443,
"kl": 0.0007734298706054688,
"learning_rate": 8.599999999999999e-07,
"loss": -0.0836,
"reward": 0.2587485685944557,
"reward_std": 0.8071333467960358,
"rewards/cosine_scaled_reward": 0.10159650258719921,
"rewards/format_reward": 0.055555556900799274,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 2117.277801513672,
"epoch": 0.037711592029140775,
"grad_norm": 0.16344650089740753,
"kl": 0.00028318166732788086,
"learning_rate": 8.799999999999999e-07,
"loss": -0.1207,
"reward": 0.14954735711216927,
"reward_std": 0.654334545135498,
"rewards/cosine_scaled_reward": 0.04699590289965272,
"rewards/format_reward": 0.0555555559694767,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 2220.430633544922,
"epoch": 0.0385686736661667,
"grad_norm": 0.5771039128303528,
"kl": 0.0004525184631347656,
"learning_rate": 9e-07,
"loss": -0.1835,
"reward": 0.45511680841445923,
"reward_std": 0.8355356454849243,
"rewards/cosine_scaled_reward": 0.17894728854298592,
"rewards/format_reward": 0.0972222238779068,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 2210.3472595214844,
"epoch": 0.03942575530319263,
"grad_norm": 0.16971856355667114,
"kl": 0.0004546046257019043,
"learning_rate": 9.2e-07,
"loss": 0.1153,
"reward": 0.9467306435108185,
"reward_std": 0.7745302617549896,
"rewards/cosine_scaled_reward": 0.40392088890075684,
"rewards/format_reward": 0.13888889271765947,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 2326.6666870117188,
"epoch": 0.04028283694021856,
"grad_norm": 0.18288320302963257,
"kl": 0.00079345703125,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0068,
"reward": 0.4090319825336337,
"reward_std": 0.8255032747983932,
"rewards/cosine_scaled_reward": 0.17673821188509464,
"rewards/format_reward": 0.0555555559694767,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 2395.9027709960938,
"epoch": 0.041139918577244486,
"grad_norm": 0.11357901245355606,
"kl": 0.0008918642997741699,
"learning_rate": 9.6e-07,
"loss": -0.0441,
"reward": 0.5232048779726028,
"reward_std": 0.824892595410347,
"rewards/cosine_scaled_reward": 0.22688022814691067,
"rewards/format_reward": 0.06944444589316845,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2259.8333740234375,
"epoch": 0.04199700021427041,
"grad_norm": 0.32720884680747986,
"kl": 0.0004296302795410156,
"learning_rate": 9.8e-07,
"loss": -0.056,
"reward": 0.26625396870076656,
"reward_std": 0.7466485947370529,
"rewards/cosine_scaled_reward": 0.08451587241142988,
"rewards/format_reward": 0.0972222238779068,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 2333.138885498047,
"epoch": 0.042854081851296334,
"grad_norm": 0.417357474565506,
"kl": 0.0006256103515625,
"learning_rate": 1e-06,
"loss": 0.0745,
"reward": 0.7284820526838303,
"reward_std": 0.6617946848273277,
"rewards/cosine_scaled_reward": 0.3017410282045603,
"rewards/format_reward": 0.12500000093132257,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 2022.5278625488281,
"epoch": 0.04371116348832226,
"grad_norm": 0.3659718930721283,
"kl": 0.0008034706115722656,
"learning_rate": 9.999890338174275e-07,
"loss": -0.1326,
"reward": 0.9250798672437668,
"reward_std": 0.7151706963777542,
"rewards/cosine_scaled_reward": 0.3861510306596756,
"rewards/format_reward": 0.15277778171002865,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 2212.513885498047,
"epoch": 0.04456824512534819,
"grad_norm": 0.5044128894805908,
"kl": 0.0008015632629394531,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0979,
"reward": 0.6630913875997066,
"reward_std": 0.7525846064090729,
"rewards/cosine_scaled_reward": 0.30376790650188923,
"rewards/format_reward": 0.0555555559694767,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 2055.7222290039062,
"epoch": 0.04542532676237412,
"grad_norm": 0.24211224913597107,
"kl": 0.0007920265197753906,
"learning_rate": 9.999013075636804e-07,
"loss": 0.12,
"reward": 0.7963576763868332,
"reward_std": 0.8260948657989502,
"rewards/cosine_scaled_reward": 0.3495677448809147,
"rewards/format_reward": 0.0972222238779068,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 2191.02783203125,
"epoch": 0.046282408399400045,
"grad_norm": 0.34720614552497864,
"kl": 0.0012145042419433594,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0331,
"reward": 0.477291576564312,
"reward_std": 0.7308211028575897,
"rewards/cosine_scaled_reward": 0.19697911106050014,
"rewards/format_reward": 0.08333333488553762,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 2249.4444885253906,
"epoch": 0.04713949003642597,
"grad_norm": 0.37687548995018005,
"kl": 0.0005116462707519531,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0667,
"reward": 0.4002426005899906,
"reward_std": 0.8839976191520691,
"rewards/cosine_scaled_reward": 0.1515101813711226,
"rewards/format_reward": 0.0972222238779068,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 2593.388916015625,
"epoch": 0.04799657167345189,
"grad_norm": 0.3094277083873749,
"kl": 0.0008883476257324219,
"learning_rate": 9.996052735444862e-07,
"loss": -0.0481,
"reward": 0.4086095951497555,
"reward_std": 0.9278567582368851,
"rewards/cosine_scaled_reward": 0.17652700655162334,
"rewards/format_reward": 0.0555555559694767,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 2042.125,
"epoch": 0.04885365331047782,
"grad_norm": 0.218702495098114,
"kl": 0.0008041709661483765,
"learning_rate": 9.994627618036452e-07,
"loss": -0.0412,
"reward": 0.6693701073527336,
"reward_std": 0.7297275960445404,
"rewards/cosine_scaled_reward": 0.2930183745920658,
"rewards/format_reward": 0.08333333395421505,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 1938.15283203125,
"epoch": 0.04971073494750375,
"grad_norm": 0.19452664256095886,
"kl": 0.0034373998641967773,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0073,
"reward": 0.7400819137692451,
"reward_std": 0.7548571228981018,
"rewards/cosine_scaled_reward": 0.34226314071565866,
"rewards/format_reward": 0.0555555559694767,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 1864.9027404785156,
"epoch": 0.050567816584529676,
"grad_norm": 0.18900378048419952,
"kl": 0.0017142295837402344,
"learning_rate": 9.991120277927223e-07,
"loss": -0.0249,
"reward": 0.5964205050840974,
"reward_std": 0.6683285385370255,
"rewards/cosine_scaled_reward": 0.2426547077484429,
"rewards/format_reward": 0.11111111380159855,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 2128.15283203125,
"epoch": 0.051424898221555604,
"grad_norm": 0.8275086283683777,
"kl": 0.0007976293563842773,
"learning_rate": 9.989038226169207e-07,
"loss": -0.0511,
"reward": 0.31469447165727615,
"reward_std": 0.7699690908193588,
"rewards/cosine_scaled_reward": 0.09484723675996065,
"rewards/format_reward": 0.12500000186264515,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 1909.6667175292969,
"epoch": 0.05228197985858153,
"grad_norm": 0.3160114288330078,
"kl": 0.0005016326904296875,
"learning_rate": 9.98673738502114e-07,
"loss": 0.1518,
"reward": 0.7029329240322113,
"reward_std": 0.7837346494197845,
"rewards/cosine_scaled_reward": 0.2542442447738722,
"rewards/format_reward": 0.19444444961845875,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 2232.986114501953,
"epoch": 0.05313906149560746,
"grad_norm": 0.15240426361560822,
"kl": 0.0007839202880859375,
"learning_rate": 9.98421786662277e-07,
"loss": -0.0462,
"reward": 0.5141148939728737,
"reward_std": 0.871205598115921,
"rewards/cosine_scaled_reward": 0.2223352324217558,
"rewards/format_reward": 0.06944444496184587,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 2244.6944885253906,
"epoch": 0.05399614313263338,
"grad_norm": 0.3563172519207001,
"kl": 0.00048482418060302734,
"learning_rate": 9.981479793771866e-07,
"loss": -0.092,
"reward": 0.32696417067199945,
"reward_std": 0.7440320923924446,
"rewards/cosine_scaled_reward": 0.10792652331292629,
"rewards/format_reward": 0.11111111287027597,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 2177.111083984375,
"epoch": 0.05485322476965931,
"grad_norm": 0.1857367753982544,
"kl": 0.0009160041809082031,
"learning_rate": 9.97852329991824e-07,
"loss": -0.1367,
"reward": 0.8723283112049103,
"reward_std": 0.8291849941015244,
"rewards/cosine_scaled_reward": 0.36671971902251244,
"rewards/format_reward": 0.13888889085501432,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 2447.9861450195312,
"epoch": 0.055710306406685235,
"grad_norm": 0.4035802185535431,
"kl": 0.0016689300537109375,
"learning_rate": 9.975348529157229e-07,
"loss": -0.1549,
"reward": 0.1848380509763956,
"reward_std": 0.7055136561393738,
"rewards/cosine_scaled_reward": 0.050752353854477406,
"rewards/format_reward": 0.08333333395421505,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 2348.5416870117188,
"epoch": 0.05656738804371116,
"grad_norm": 0.1111924946308136,
"kl": 0.0013647079467773438,
"learning_rate": 9.971955636222684e-07,
"loss": -0.046,
"reward": 0.3021825775504112,
"reward_std": 0.5473092719912529,
"rewards/cosine_scaled_reward": 0.09553573839366436,
"rewards/format_reward": 0.11111111380159855,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 2172.625030517578,
"epoch": 0.05742446968073709,
"grad_norm": 0.3930043578147888,
"kl": 0.005237579345703125,
"learning_rate": 9.968344786479415e-07,
"loss": 0.1107,
"reward": 0.6916175857186317,
"reward_std": 0.932863637804985,
"rewards/cosine_scaled_reward": 0.24858656898140907,
"rewards/format_reward": 0.19444444961845875,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 1801.1666564941406,
"epoch": 0.05828155131776302,
"grad_norm": 0.22752800583839417,
"kl": 0.0023860931396484375,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0344,
"reward": 0.47825843654572964,
"reward_std": 0.7419339567422867,
"rewards/cosine_scaled_reward": 0.16968477331101894,
"rewards/format_reward": 0.13888889085501432,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 2261.625030517578,
"epoch": 0.059138632954788946,
"grad_norm": 0.16897499561309814,
"kl": 0.0013341903686523438,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0432,
"reward": 0.35771574825048447,
"reward_std": 0.7713551372289658,
"rewards/cosine_scaled_reward": 0.12330232141539454,
"rewards/format_reward": 0.11111111287027597,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 2128.111114501953,
"epoch": 0.059995714591814873,
"grad_norm": 0.9206687808036804,
"kl": 0.0018476247787475586,
"learning_rate": 9.956206309337066e-07,
"loss": -0.1611,
"reward": 0.5072405263781548,
"reward_std": 0.8621459007263184,
"rewards/cosine_scaled_reward": 0.22584249824285507,
"rewards/format_reward": 0.0555555559694767,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 2014.263916015625,
"epoch": 0.060852796228840794,
"grad_norm": 0.3131060004234314,
"kl": 0.0014543533325195312,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0172,
"reward": 0.4196384996175766,
"reward_std": 0.6068210601806641,
"rewards/cosine_scaled_reward": 0.17509703256655484,
"rewards/format_reward": 0.06944444589316845,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 2465.2916870117188,
"epoch": 0.06170987786586672,
"grad_norm": 0.187669575214386,
"kl": 0.0011992454528808594,
"learning_rate": 9.947027716509488e-07,
"loss": -0.0272,
"reward": 0.5738496109843254,
"reward_std": 0.8109806478023529,
"rewards/cosine_scaled_reward": 0.2244247980415821,
"rewards/format_reward": 0.12500000279396772,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 2012.3888854980469,
"epoch": 0.06256695950289265,
"grad_norm": 0.3393029570579529,
"kl": 0.0021200180053710938,
"learning_rate": 9.942113192828444e-07,
"loss": -0.1397,
"reward": 0.32288938760757446,
"reward_std": 0.6824958473443985,
"rewards/cosine_scaled_reward": 0.10588914155960083,
"rewards/format_reward": 0.1111111119389534,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 2256.7084045410156,
"epoch": 0.06342404113991858,
"grad_norm": 0.4278959035873413,
"kl": 0.0016908645629882812,
"learning_rate": 9.93698216681727e-07,
"loss": 0.1105,
"reward": 0.7894450277090073,
"reward_std": 0.6593053936958313,
"rewards/cosine_scaled_reward": 0.32527804747223854,
"rewards/format_reward": 0.13888888992369175,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 2068.9722290039062,
"epoch": 0.0642811227769445,
"grad_norm": 0.4622882008552551,
"kl": 0.0036230087280273438,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0005,
"reward": 0.22274947352707386,
"reward_std": 0.8964805155992508,
"rewards/cosine_scaled_reward": 0.09748584777116776,
"rewards/format_reward": 0.02777777798473835,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 2233.750030517578,
"epoch": 0.06513820441397043,
"grad_norm": 0.3396798074245453,
"kl": 0.0010144710540771484,
"learning_rate": 9.926071618660237e-07,
"loss": -0.0669,
"reward": 0.39245418552309275,
"reward_std": 0.7776346653699875,
"rewards/cosine_scaled_reward": 0.15456042252480984,
"rewards/format_reward": 0.08333333674818277,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 2399.513916015625,
"epoch": 0.06599528605099636,
"grad_norm": 0.2183726727962494,
"kl": 0.0007622241973876953,
"learning_rate": 9.9202926282791e-07,
"loss": -0.0456,
"reward": 0.2495843954384327,
"reward_std": 0.5640047863125801,
"rewards/cosine_scaled_reward": 0.07618108215683606,
"rewards/format_reward": 0.0972222238779068,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 1918.75,
"epoch": 0.06685236768802229,
"grad_norm": 0.2223767340183258,
"kl": 0.0014109611511230469,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0196,
"reward": 0.32135773450136185,
"reward_std": 0.7068247720599174,
"rewards/cosine_scaled_reward": 0.10512331128120422,
"rewards/format_reward": 0.11111111287027597,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 2230.125030517578,
"epoch": 0.06770944932504822,
"grad_norm": 0.4025353789329529,
"kl": 0.0010633468627929688,
"learning_rate": 9.908088623197048e-07,
"loss": 0.0663,
"reward": 0.38789599807932973,
"reward_std": 0.703848659992218,
"rewards/cosine_scaled_reward": 0.13839244283735752,
"rewards/format_reward": 0.11111111287027597,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 2083.40283203125,
"epoch": 0.06856653096207414,
"grad_norm": 0.17173290252685547,
"kl": 0.0009837150573730469,
"learning_rate": 9.901664203302124e-07,
"loss": -0.0862,
"reward": 0.5672617349773645,
"reward_std": 0.7471679449081421,
"rewards/cosine_scaled_reward": 0.23501975380349904,
"rewards/format_reward": 0.09722222294658422,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 2022.3889465332031,
"epoch": 0.06942361259910007,
"grad_norm": 0.188516765832901,
"kl": 0.00244903564453125,
"learning_rate": 9.895025252503755e-07,
"loss": 0.021,
"reward": 0.6240249052643776,
"reward_std": 0.7871060222387314,
"rewards/cosine_scaled_reward": 0.2703457809984684,
"rewards/format_reward": 0.08333333488553762,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 2090.4583740234375,
"epoch": 0.07028069423612598,
"grad_norm": 0.28751835227012634,
"kl": 0.0048160552978515625,
"learning_rate": 9.888172094375033e-07,
"loss": 0.1485,
"reward": 0.20903612580150366,
"reward_std": 0.8013690561056137,
"rewards/cosine_scaled_reward": 0.05590694583952427,
"rewards/format_reward": 0.0972222238779068,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 2388.4584350585938,
"epoch": 0.07113777587315191,
"grad_norm": 0.22767475247383118,
"kl": 0.0034203529357910156,
"learning_rate": 9.881105062929221e-07,
"loss": -0.0868,
"reward": 0.11441808566451073,
"reward_std": 0.6552191823720932,
"rewards/cosine_scaled_reward": 0.04332014673855156,
"rewards/format_reward": 0.02777777798473835,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 2293.0694274902344,
"epoch": 0.07199485751017784,
"grad_norm": 0.11945953965187073,
"kl": 0.0014848709106445312,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0848,
"reward": 0.5547876954078674,
"reward_std": 0.7587804347276688,
"rewards/cosine_scaled_reward": 0.17322717513889074,
"rewards/format_reward": 0.20833333674818277,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 2540.3611450195312,
"epoch": 0.07285193914720377,
"grad_norm": 0.178083136677742,
"kl": 0.002269744873046875,
"learning_rate": 9.866330768241983e-07,
"loss": -0.1125,
"reward": -0.029575519263744354,
"reward_std": 0.563959889113903,
"rewards/cosine_scaled_reward": -0.05645443079993129,
"rewards/format_reward": 0.08333333395421505,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 2571.7361450195312,
"epoch": 0.0737090207842297,
"grad_norm": 0.1585242599248886,
"kl": 0.0007100105285644531,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0231,
"reward": 0.3958965800702572,
"reward_std": 0.8284079432487488,
"rewards/cosine_scaled_reward": 0.12155939312651753,
"rewards/format_reward": 0.15277778077870607,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 2367.3333740234375,
"epoch": 0.07456610242125562,
"grad_norm": 0.32309991121292114,
"kl": 0.0017614364624023438,
"learning_rate": 9.850705248720068e-07,
"loss": 0.1136,
"reward": 0.7444435358047485,
"reward_std": 0.7368911355733871,
"rewards/cosine_scaled_reward": 0.28888843953609467,
"rewards/format_reward": 0.1666666679084301,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 2257.9583129882812,
"epoch": 0.07542318405828155,
"grad_norm": 0.22389107942581177,
"kl": 0.0009946823120117188,
"learning_rate": 9.8425742251254e-07,
"loss": -0.0003,
"reward": 0.7726655453443527,
"reward_std": 0.7912376075983047,
"rewards/cosine_scaled_reward": 0.3099438678473234,
"rewards/format_reward": 0.15277777891606092,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 2160.1944580078125,
"epoch": 0.07628026569530748,
"grad_norm": 0.22130398452281952,
"kl": 0.005789756774902344,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0076,
"reward": 0.47880110889673233,
"reward_std": 0.8113372325897217,
"rewards/cosine_scaled_reward": 0.19773388467729092,
"rewards/format_reward": 0.08333333488553762,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 2317.486114501953,
"epoch": 0.0771373473323334,
"grad_norm": 0.25060081481933594,
"kl": 0.002841949462890625,
"learning_rate": 9.825677631722435e-07,
"loss": -0.0693,
"reward": 0.17479625344276428,
"reward_std": 0.7589741870760918,
"rewards/cosine_scaled_reward": 0.011009246576577425,
"rewards/format_reward": 0.15277778171002865,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 2067.597198486328,
"epoch": 0.07799442896935933,
"grad_norm": 0.6746274828910828,
"kl": 0.0038204193115234375,
"learning_rate": 9.816912885430258e-07,
"loss": 0.093,
"reward": 0.5547708794474602,
"reward_std": 0.7499261125922203,
"rewards/cosine_scaled_reward": 0.2148854248225689,
"rewards/format_reward": 0.12500000093132257,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 2108.736114501953,
"epoch": 0.07885151060638526,
"grad_norm": 0.2025013267993927,
"kl": 0.0023174285888671875,
"learning_rate": 9.807937738894303e-07,
"loss": -0.0409,
"reward": 0.5829055476933718,
"reward_std": 0.8402788192033768,
"rewards/cosine_scaled_reward": 0.21506388299167156,
"rewards/format_reward": 0.15277778077870607,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 2121.388885498047,
"epoch": 0.07970859224341119,
"grad_norm": 0.20141884684562683,
"kl": 0.0013599395751953125,
"learning_rate": 9.798752629550546e-07,
"loss": 0.1446,
"reward": 0.4946107156574726,
"reward_std": 0.7894033789634705,
"rewards/cosine_scaled_reward": 0.16397201921790838,
"rewards/format_reward": 0.16666667070239782,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 1983.5138854980469,
"epoch": 0.08056567388043712,
"grad_norm": 0.6650830507278442,
"kl": 0.0022716522216796875,
"learning_rate": 9.78935800506826e-07,
"loss": 0.2632,
"reward": 0.7651779092848301,
"reward_std": 0.8691636770963669,
"rewards/cosine_scaled_reward": 0.25758895510807633,
"rewards/format_reward": 0.25000000558793545,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 1863.0833435058594,
"epoch": 0.08142275551746304,
"grad_norm": 0.16325955092906952,
"kl": 0.0027484893798828125,
"learning_rate": 9.779754323328192e-07,
"loss": -0.0665,
"reward": 0.6303411349654198,
"reward_std": 0.8782893866300583,
"rewards/cosine_scaled_reward": 0.2179483361542225,
"rewards/format_reward": 0.19444444961845875,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 1843.5833435058594,
"epoch": 0.08227983715448897,
"grad_norm": 0.29014191031455994,
"kl": 0.0071086883544921875,
"learning_rate": 9.769942052400235e-07,
"loss": -0.1035,
"reward": 0.8213257193565369,
"reward_std": 0.7900742888450623,
"rewards/cosine_scaled_reward": 0.3551072867412586,
"rewards/format_reward": 0.11111111287027597,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 2286.7361450195312,
"epoch": 0.08313691879151489,
"grad_norm": 0.1988731324672699,
"kl": 0.0035772323608398438,
"learning_rate": 9.759921670520634e-07,
"loss": -0.0885,
"reward": -0.018717994913458824,
"reward_std": 0.6726399958133698,
"rewards/cosine_scaled_reward": -0.07185898721218109,
"rewards/format_reward": 0.12500000465661287,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1786.0972595214844,
"epoch": 0.08399400042854081,
"grad_norm": 0.49428296089172363,
"kl": 0.008466720581054688,
"learning_rate": 9.749693666068663e-07,
"loss": 0.166,
"reward": 0.6488583460450172,
"reward_std": 0.7391397655010223,
"rewards/cosine_scaled_reward": 0.2966513857245445,
"rewards/format_reward": 0.055555556900799274,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 2219.3055725097656,
"epoch": 0.08485108206556674,
"grad_norm": 0.30233073234558105,
"kl": 0.003276824951171875,
"learning_rate": 9.739258537542835e-07,
"loss": -0.0317,
"reward": 0.06553506385535002,
"reward_std": 0.608645610511303,
"rewards/cosine_scaled_reward": -0.029732469469308853,
"rewards/format_reward": 0.12500000279396772,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 2252.916717529297,
"epoch": 0.08570816370259267,
"grad_norm": 0.16652679443359375,
"kl": 0.007282257080078125,
"learning_rate": 9.728616793536587e-07,
"loss": -0.0111,
"reward": 1.0242944061756134,
"reward_std": 0.8552243709564209,
"rewards/cosine_scaled_reward": 0.4079805314540863,
"rewards/format_reward": 0.20833334140479565,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 2132.361114501953,
"epoch": 0.0865652453396186,
"grad_norm": 0.3907296061515808,
"kl": 0.006062507629394531,
"learning_rate": 9.717768952713511e-07,
"loss": 0.2013,
"reward": 0.8774868845939636,
"reward_std": 0.8741513937711716,
"rewards/cosine_scaled_reward": 0.3623545281589031,
"rewards/format_reward": 0.15277778077870607,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 1591.3611450195312,
"epoch": 0.08742232697664452,
"grad_norm": 0.20045924186706543,
"kl": 0.020427703857421875,
"learning_rate": 9.706715543782064e-07,
"loss": 0.067,
"reward": 0.6824215389788151,
"reward_std": 0.8814049959182739,
"rewards/cosine_scaled_reward": 0.27871076576411724,
"rewards/format_reward": 0.12500000093132257,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 2065.1944885253906,
"epoch": 0.08827940861367045,
"grad_norm": 0.39964577555656433,
"kl": 0.00489044189453125,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0617,
"reward": 0.3749300390481949,
"reward_std": 0.8525811061263084,
"rewards/cosine_scaled_reward": 0.11802058666944504,
"rewards/format_reward": 0.13888889364898205,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 1969.375,
"epoch": 0.08913649025069638,
"grad_norm": 0.4705803394317627,
"kl": 0.0055637359619140625,
"learning_rate": 9.683994186497132e-07,
"loss": 0.1758,
"reward": 1.0127343982458115,
"reward_std": 0.6061429902911186,
"rewards/cosine_scaled_reward": 0.3744227262213826,
"rewards/format_reward": 0.2638888955116272,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 2124.2916564941406,
"epoch": 0.0899935718877223,
"grad_norm": 0.19006462395191193,
"kl": 0.0029754638671875,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0454,
"reward": 0.9216031394898891,
"reward_std": 0.9004587382078171,
"rewards/cosine_scaled_reward": 0.3635793221183121,
"rewards/format_reward": 0.19444444868713617,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 1721.5694885253906,
"epoch": 0.09085065352474823,
"grad_norm": 0.18808282911777496,
"kl": 0.008443832397460938,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0263,
"reward": 0.6266486272215843,
"reward_std": 0.6180888190865517,
"rewards/cosine_scaled_reward": 0.2994354497641325,
"rewards/format_reward": 0.02777777798473835,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 2028.7083740234375,
"epoch": 0.09170773516177416,
"grad_norm": 0.24120453000068665,
"kl": 0.01983642578125,
"learning_rate": 9.648384182148252e-07,
"loss": 0.1064,
"reward": 0.5810213461518288,
"reward_std": 0.5696274787187576,
"rewards/cosine_scaled_reward": 0.2002328964881599,
"rewards/format_reward": 0.18055556248873472,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 2217.90283203125,
"epoch": 0.09256481679880009,
"grad_norm": 0.3850785493850708,
"kl": 0.0053253173828125,
"learning_rate": 9.636109026648554e-07,
"loss": -0.0305,
"reward": 0.6920746862888336,
"reward_std": 0.9560717344284058,
"rewards/cosine_scaled_reward": 0.25575956143438816,
"rewards/format_reward": 0.18055556248873472,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 2110.7222595214844,
"epoch": 0.09342189843582602,
"grad_norm": 0.167547345161438,
"kl": 0.01023101806640625,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0664,
"reward": 0.9005825072526932,
"reward_std": 0.7161072492599487,
"rewards/cosine_scaled_reward": 0.36001347936689854,
"rewards/format_reward": 0.18055556155741215,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 1947.2500305175781,
"epoch": 0.09427898007285195,
"grad_norm": 0.14026661217212677,
"kl": 0.00750732421875,
"learning_rate": 9.610954559391704e-07,
"loss": -0.02,
"reward": 0.6598109304904938,
"reward_std": 0.7547452449798584,
"rewards/cosine_scaled_reward": 0.2396276891231537,
"rewards/format_reward": 0.18055555876344442,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 1930.9027709960938,
"epoch": 0.09513606170987787,
"grad_norm": 0.21193362772464752,
"kl": 0.007018566131591797,
"learning_rate": 9.598076473627796e-07,
"loss": 0.1196,
"reward": 0.5302619338035583,
"reward_std": 0.6282015666365623,
"rewards/cosine_scaled_reward": 0.18179763481020927,
"rewards/format_reward": 0.16666666977107525,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 2134.0694885253906,
"epoch": 0.09599314334690379,
"grad_norm": 0.24564455449581146,
"kl": 0.00876617431640625,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0425,
"reward": 0.7763140201568604,
"reward_std": 0.937856912612915,
"rewards/cosine_scaled_reward": 0.2909347750246525,
"rewards/format_reward": 0.19444444868713617,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 2006.2083129882812,
"epoch": 0.09685022498392971,
"grad_norm": 0.2176864892244339,
"kl": 0.011257171630859375,
"learning_rate": 9.571721736097088e-07,
"loss": 0.2242,
"reward": 1.0729680806398392,
"reward_std": 0.8336671739816666,
"rewards/cosine_scaled_reward": 0.39065071195364,
"rewards/format_reward": 0.2916666753590107,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 1895.02783203125,
"epoch": 0.09770730662095564,
"grad_norm": 0.15649937093257904,
"kl": 0.00569915771484375,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0528,
"reward": 0.438002310693264,
"reward_std": 0.9526876509189606,
"rewards/cosine_scaled_reward": 0.09400115348398685,
"rewards/format_reward": 0.2500000074505806,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 1993.6528015136719,
"epoch": 0.09856438825798157,
"grad_norm": 0.4257446527481079,
"kl": 0.01016998291015625,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0147,
"reward": 0.6780023947358131,
"reward_std": 0.9333401471376419,
"rewards/cosine_scaled_reward": 0.2626123018562794,
"rewards/format_reward": 0.15277778077870607,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 1788.4444885253906,
"epoch": 0.0994214698950075,
"grad_norm": 0.15009744465351105,
"kl": 0.01854705810546875,
"learning_rate": 9.530702921077358e-07,
"loss": 0.1687,
"reward": 0.8848052807152271,
"reward_std": 0.9200158715248108,
"rewards/cosine_scaled_reward": 0.3590692952275276,
"rewards/format_reward": 0.16666666977107525,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 1550.0833129882812,
"epoch": 0.10027855153203342,
"grad_norm": 0.2336883395910263,
"kl": 0.0109710693359375,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0968,
"reward": 0.721510112285614,
"reward_std": 0.8123895823955536,
"rewards/cosine_scaled_reward": 0.2635328520555049,
"rewards/format_reward": 0.19444444961845875,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 2167.1250610351562,
"epoch": 0.10113563316905935,
"grad_norm": 0.3328269422054291,
"kl": 0.009395599365234375,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0206,
"reward": -0.14523404464125633,
"reward_std": 0.5727858245372772,
"rewards/cosine_scaled_reward": -0.14206147193908691,
"rewards/format_reward": 0.13888888992369175,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 1597.2361145019531,
"epoch": 0.10199271480608528,
"grad_norm": 1.107146143913269,
"kl": 0.0159759521484375,
"learning_rate": 9.487916106540465e-07,
"loss": 0.5165,
"reward": 0.715252235531807,
"reward_std": 0.834864467382431,
"rewards/cosine_scaled_reward": 0.25345943216234446,
"rewards/format_reward": 0.20833333488553762,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 2161.027801513672,
"epoch": 0.10284979644311121,
"grad_norm": 0.15888024866580963,
"kl": 0.007426261901855469,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0203,
"reward": 0.2189617045223713,
"reward_std": 0.7949748933315277,
"rewards/cosine_scaled_reward": 0.04003641102463007,
"rewards/format_reward": 0.13888889271765947,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 2409.2083129882812,
"epoch": 0.10370687808013714,
"grad_norm": 0.21175938844680786,
"kl": 0.008785247802734375,
"learning_rate": 9.458418577899774e-07,
"loss": -0.0082,
"reward": 0.8535979464650154,
"reward_std": 0.8966440111398697,
"rewards/cosine_scaled_reward": 0.3642989657819271,
"rewards/format_reward": 0.12500000186264515,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 1877.7222290039062,
"epoch": 0.10456395971716306,
"grad_norm": 0.350690633058548,
"kl": 0.01152801513671875,
"learning_rate": 9.443380060197385e-07,
"loss": -0.014,
"reward": 0.47487088665366173,
"reward_std": 0.7110822051763535,
"rewards/cosine_scaled_reward": 0.13326877844519913,
"rewards/format_reward": 0.20833333674818277,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 1858.8750610351562,
"epoch": 0.10542104135418899,
"grad_norm": 0.2693331837654114,
"kl": 0.016326904296875,
"learning_rate": 9.428149347714143e-07,
"loss": -0.021,
"reward": 0.7835421413183212,
"reward_std": 0.7691494226455688,
"rewards/cosine_scaled_reward": 0.2806599698960781,
"rewards/format_reward": 0.22222222946584225,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 1666.7222290039062,
"epoch": 0.10627812299121492,
"grad_norm": 0.21113620698451996,
"kl": 0.018585205078125,
"learning_rate": 9.412727182773486e-07,
"loss": 0.224,
"reward": 0.8200660422444344,
"reward_std": 0.7651881277561188,
"rewards/cosine_scaled_reward": 0.2780885882675648,
"rewards/format_reward": 0.2638888992369175,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 1892.8472290039062,
"epoch": 0.10713520462824085,
"grad_norm": 0.2539234161376953,
"kl": 0.015058517456054688,
"learning_rate": 9.397114317029974e-07,
"loss": 0.1511,
"reward": 0.6824524328112602,
"reward_std": 0.6703763008117676,
"rewards/cosine_scaled_reward": 0.2578928880393505,
"rewards/format_reward": 0.16666667256504297,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 2195.3472290039062,
"epoch": 0.10799228626526676,
"grad_norm": 0.1690240204334259,
"kl": 0.011653900146484375,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0703,
"reward": 0.618515363894403,
"reward_std": 0.9210871905088425,
"rewards/cosine_scaled_reward": 0.1842576777562499,
"rewards/format_reward": 0.25000000558793545,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 2141.375030517578,
"epoch": 0.10884936790229269,
"grad_norm": 0.19393949210643768,
"kl": 0.01447296142578125,
"learning_rate": 9.36531953618799e-07,
"loss": 0.1103,
"reward": 0.9065948352217674,
"reward_std": 0.6801795363426208,
"rewards/cosine_scaled_reward": 0.36301962845027447,
"rewards/format_reward": 0.18055555783212185,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 2105.375030517578,
"epoch": 0.10970644953931862,
"grad_norm": 0.21088111400604248,
"kl": 0.01592254638671875,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0865,
"reward": 0.822256007231772,
"reward_std": 0.953468844294548,
"rewards/cosine_scaled_reward": 0.2861280349898152,
"rewards/format_reward": 0.2500000046566129,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 2267.9583129882812,
"epoch": 0.11056353117634454,
"grad_norm": 0.2045803964138031,
"kl": 0.01177978515625,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0814,
"reward": 0.36659867502748966,
"reward_std": 0.6582172811031342,
"rewards/cosine_scaled_reward": 0.05135490372776985,
"rewards/format_reward": 0.2638888955116272,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 1816.2222290039062,
"epoch": 0.11142061281337047,
"grad_norm": 0.44615596532821655,
"kl": 0.0237884521484375,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0822,
"reward": 0.9266887735575438,
"reward_std": 0.7460800111293793,
"rewards/cosine_scaled_reward": 0.3313999269157648,
"rewards/format_reward": 0.2638888955116272,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 1904.1944885253906,
"epoch": 0.1122776944503964,
"grad_norm": 0.2686100900173187,
"kl": 0.0119476318359375,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0847,
"reward": 0.4160115160048008,
"reward_std": 0.7496158927679062,
"rewards/cosine_scaled_reward": 0.08995018899440765,
"rewards/format_reward": 0.236111119389534,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 1897.7083129882812,
"epoch": 0.11313477608742233,
"grad_norm": 0.2043069452047348,
"kl": 0.01409912109375,
"learning_rate": 9.282549715730579e-07,
"loss": 0.1342,
"reward": 0.4763021022081375,
"reward_std": 0.810718834400177,
"rewards/cosine_scaled_reward": 0.1339843887835741,
"rewards/format_reward": 0.2083333358168602,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 2475.1111450195312,
"epoch": 0.11399185772444825,
"grad_norm": 0.15878832340240479,
"kl": 0.00994110107421875,
"learning_rate": 9.265439410565328e-07,
"loss": -0.0354,
"reward": 0.2221047766506672,
"reward_std": 0.7571545913815498,
"rewards/cosine_scaled_reward": 0.041607944294810295,
"rewards/format_reward": 0.13888889364898205,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 2260.750030517578,
"epoch": 0.11484893936147418,
"grad_norm": 0.1739797592163086,
"kl": 0.00885772705078125,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0656,
"reward": 0.7371436022222042,
"reward_std": 0.7502488344907761,
"rewards/cosine_scaled_reward": 0.2713496144860983,
"rewards/format_reward": 0.19444444961845875,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 1935.4583740234375,
"epoch": 0.11570602099850011,
"grad_norm": 0.5385437607765198,
"kl": 0.0226593017578125,
"learning_rate": 9.230669076497687e-07,
"loss": 0.1112,
"reward": 0.5696738436818123,
"reward_std": 0.6989183947443962,
"rewards/cosine_scaled_reward": 0.19455914944410324,
"rewards/format_reward": 0.18055555876344442,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1787.6111145019531,
"epoch": 0.11656310263552604,
"grad_norm": 0.5149728059768677,
"kl": 0.012420654296875,
"learning_rate": 9.213010742252327e-07,
"loss": 0.1226,
"reward": 0.687653437256813,
"reward_std": 0.9176287800073624,
"rewards/cosine_scaled_reward": 0.1841045105829835,
"rewards/format_reward": 0.3194444552063942,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 1814.8194274902344,
"epoch": 0.11742018427255196,
"grad_norm": 0.2650432884693146,
"kl": 0.0258636474609375,
"learning_rate": 9.195171441101668e-07,
"loss": -0.0266,
"reward": 1.1596794873476028,
"reward_std": 0.9145576506853104,
"rewards/cosine_scaled_reward": 0.4201175607740879,
"rewards/format_reward": 0.3194444514811039,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 1682.513916015625,
"epoch": 0.11827726590957789,
"grad_norm": 0.2149789184331894,
"kl": 0.0146484375,
"learning_rate": 9.177152042508077e-07,
"loss": 0.1368,
"reward": 0.49744264781475067,
"reward_std": 0.819553479552269,
"rewards/cosine_scaled_reward": 0.12372134439647198,
"rewards/format_reward": 0.25000000558793545,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 1823.75,
"epoch": 0.11913434754660382,
"grad_norm": 0.4041607677936554,
"kl": 0.018646240234375,
"learning_rate": 9.158953424711624e-07,
"loss": 0.158,
"reward": 0.5236843451857567,
"reward_std": 0.6677617505192757,
"rewards/cosine_scaled_reward": 0.10906438087113202,
"rewards/format_reward": 0.305555559694767,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 1966.875,
"epoch": 0.11999142918362975,
"grad_norm": 0.19172143936157227,
"kl": 0.0146484375,
"learning_rate": 9.140576474687263e-07,
"loss": 0.12,
"reward": 0.788971059024334,
"reward_std": 0.7478453367948532,
"rewards/cosine_scaled_reward": 0.2625410854816437,
"rewards/format_reward": 0.2638888955116272,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 2236.7361450195312,
"epoch": 0.12084851082065566,
"grad_norm": 0.20563149452209473,
"kl": 0.011810302734375,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0257,
"reward": 0.4495049864053726,
"reward_std": 0.9425568133592606,
"rewards/cosine_scaled_reward": 0.1275302767753601,
"rewards/format_reward": 0.1944444514811039,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 1477.8611145019531,
"epoch": 0.12170559245768159,
"grad_norm": 0.5766943693161011,
"kl": 0.01958465576171875,
"learning_rate": 9.103291169269299e-07,
"loss": 0.2984,
"reward": 0.9424830563366413,
"reward_std": 0.763814777135849,
"rewards/cosine_scaled_reward": 0.34624152863398194,
"rewards/format_reward": 0.2500000037252903,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 1456.9722595214844,
"epoch": 0.12256267409470752,
"grad_norm": 0.33502310514450073,
"kl": 0.02838134765625,
"learning_rate": 9.084384631108882e-07,
"loss": 0.2811,
"reward": 1.1146164610981941,
"reward_std": 0.8421279340982437,
"rewards/cosine_scaled_reward": 0.3698082063347101,
"rewards/format_reward": 0.3750000074505806,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 1582.8611450195312,
"epoch": 0.12341975573173344,
"grad_norm": 0.5452846884727478,
"kl": 0.0584716796875,
"learning_rate": 9.065303395098358e-07,
"loss": 0.2054,
"reward": 0.49096263851970434,
"reward_std": 0.8086179941892624,
"rewards/cosine_scaled_reward": 0.15520351566374302,
"rewards/format_reward": 0.18055555876344442,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 2001.9305725097656,
"epoch": 0.12427683736875937,
"grad_norm": 0.29051852226257324,
"kl": 0.0219879150390625,
"learning_rate": 9.046048391230247e-07,
"loss": 0.1612,
"reward": 0.858160063624382,
"reward_std": 0.8109176307916641,
"rewards/cosine_scaled_reward": 0.2832467071712017,
"rewards/format_reward": 0.2916666753590107,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 1551.8611145019531,
"epoch": 0.1251339190057853,
"grad_norm": 0.3925701975822449,
"kl": 0.02143096923828125,
"learning_rate": 9.026620557966279e-07,
"loss": 0.301,
"reward": 0.8699105493724346,
"reward_std": 0.96321090310812,
"rewards/cosine_scaled_reward": 0.28217751905322075,
"rewards/format_reward": 0.3055555634200573,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 1913.1111450195312,
"epoch": 0.12599100064281124,
"grad_norm": 0.39382514357566833,
"kl": 0.0259246826171875,
"learning_rate": 9.007020842191634e-07,
"loss": 0.1395,
"reward": 0.39645494148135185,
"reward_std": 0.8219664841890335,
"rewards/cosine_scaled_reward": 0.08017192035913467,
"rewards/format_reward": 0.2361111156642437,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 1661.125,
"epoch": 0.12684808227983715,
"grad_norm": 0.2739737927913666,
"kl": 0.064727783203125,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0322,
"reward": 0.9951315224170685,
"reward_std": 0.8566233068704605,
"rewards/cosine_scaled_reward": 0.39339908584952354,
"rewards/format_reward": 0.20833334140479565,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 2030.9027709960938,
"epoch": 0.12770516391686307,
"grad_norm": 0.1996004581451416,
"kl": 0.01544189453125,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0669,
"reward": 0.31733213737607,
"reward_std": 0.5332914516329765,
"rewards/cosine_scaled_reward": 0.02672163024544716,
"rewards/format_reward": 0.2638888955116272,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 1978.4722595214844,
"epoch": 0.128562245553889,
"grad_norm": 0.4115337133407593,
"kl": 0.0281524658203125,
"learning_rate": 8.9471999940354e-07,
"loss": 0.012,
"reward": 0.8926911260932684,
"reward_std": 0.6474704742431641,
"rewards/cosine_scaled_reward": 0.2935677766799927,
"rewards/format_reward": 0.3055555634200573,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 1758.2083740234375,
"epoch": 0.12941932719091492,
"grad_norm": 0.3856815695762634,
"kl": 0.0369873046875,
"learning_rate": 8.926922383915315e-07,
"loss": 0.237,
"reward": 0.8293609768152237,
"reward_std": 0.9914700090885162,
"rewards/cosine_scaled_reward": 0.2827360359951854,
"rewards/format_reward": 0.26388889364898205,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 1611.4861450195312,
"epoch": 0.13027640882794086,
"grad_norm": 0.44330915808677673,
"kl": 0.0380401611328125,
"learning_rate": 8.906477750432903e-07,
"loss": -0.0427,
"reward": 0.6217841571196914,
"reward_std": 0.6576393991708755,
"rewards/cosine_scaled_reward": 0.19283651188015938,
"rewards/format_reward": 0.236111119389534,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 1863.4305725097656,
"epoch": 0.13113349046496678,
"grad_norm": 0.23155762255191803,
"kl": 0.029293060302734375,
"learning_rate": 8.88586709003076e-07,
"loss": -0.0151,
"reward": 0.6762807443737984,
"reward_std": 0.9942405819892883,
"rewards/cosine_scaled_reward": 0.19925148598849773,
"rewards/format_reward": 0.2777777835726738,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 2036.3055725097656,
"epoch": 0.13199057210199272,
"grad_norm": 6.47697114944458,
"kl": 0.0350189208984375,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0039,
"reward": 0.2652840279042721,
"reward_std": 0.5364516898989677,
"rewards/cosine_scaled_reward": -0.0131913423538208,
"rewards/format_reward": 0.2916666688397527,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 1694.7361450195312,
"epoch": 0.13284765373901863,
"grad_norm": 0.6100454330444336,
"kl": 0.057952880859375,
"learning_rate": 8.844151714648274e-07,
"loss": 0.2837,
"reward": 0.949450820684433,
"reward_std": 0.8621908873319626,
"rewards/cosine_scaled_reward": 0.3705587573349476,
"rewards/format_reward": 0.2083333395421505,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1715.5972442626953,
"epoch": 0.13370473537604458,
"grad_norm": 0.36461231112480164,
"kl": 0.034576416015625,
"learning_rate": 8.823049032816478e-07,
"loss": 0.1019,
"reward": 0.9654277712106705,
"reward_std": 0.8116246461868286,
"rewards/cosine_scaled_reward": 0.31604722142219543,
"rewards/format_reward": 0.33333333767950535,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 1362.9583435058594,
"epoch": 0.1345618170130705,
"grad_norm": 0.3907575309276581,
"kl": 0.0545654296875,
"learning_rate": 8.801784390262943e-07,
"loss": 0.141,
"reward": 0.8722702264785767,
"reward_std": 0.6232585608959198,
"rewards/cosine_scaled_reward": 0.29724621400237083,
"rewards/format_reward": 0.27777778543531895,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 1861.013916015625,
"epoch": 0.13541889865009643,
"grad_norm": 0.36566102504730225,
"kl": 0.050750732421875,
"learning_rate": 8.780358823396352e-07,
"loss": 0.2151,
"reward": 0.5994696915149689,
"reward_std": 0.9274942576885223,
"rewards/cosine_scaled_reward": 0.18167929956689477,
"rewards/format_reward": 0.2361111156642437,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 1819.916748046875,
"epoch": 0.13627598028712234,
"grad_norm": 0.31390905380249023,
"kl": 0.05914306640625,
"learning_rate": 8.758773376468604e-07,
"loss": 0.3592,
"reward": 0.8621488437056541,
"reward_std": 0.8510329574346542,
"rewards/cosine_scaled_reward": 0.29218554496765137,
"rewards/format_reward": 0.2777777835726738,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 1867.0277709960938,
"epoch": 0.1371330619241483,
"grad_norm": 0.23102979362010956,
"kl": 0.0302734375,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0781,
"reward": 0.5296455472707748,
"reward_std": 0.7065431177616119,
"rewards/cosine_scaled_reward": 0.11898945830762386,
"rewards/format_reward": 0.2916666716337204,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 1560.7916717529297,
"epoch": 0.1379901435611742,
"grad_norm": 0.35218510031700134,
"kl": 0.05161285400390625,
"learning_rate": 8.715127058347614e-07,
"loss": -0.046,
"reward": 0.9299365878105164,
"reward_std": 0.5807594284415245,
"rewards/cosine_scaled_reward": 0.3260794151574373,
"rewards/format_reward": 0.2777777835726738,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 1482.4861450195312,
"epoch": 0.13884722519820014,
"grad_norm": 0.31520912051200867,
"kl": 0.05499267578125,
"learning_rate": 8.693068314414344e-07,
"loss": 0.1936,
"reward": 1.1436526030302048,
"reward_std": 0.786539800465107,
"rewards/cosine_scaled_reward": 0.4398818574845791,
"rewards/format_reward": 0.2638888992369175,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 1211.0694427490234,
"epoch": 0.13970430683522606,
"grad_norm": 0.7765697240829468,
"kl": 0.0682373046875,
"learning_rate": 8.670853944836176e-07,
"loss": 0.403,
"reward": 1.4794435054063797,
"reward_std": 0.9502733200788498,
"rewards/cosine_scaled_reward": 0.5383328944444656,
"rewards/format_reward": 0.4027777798473835,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 1754.9583435058594,
"epoch": 0.14056138847225197,
"grad_norm": 0.484488308429718,
"kl": 0.046142578125,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0821,
"reward": 0.8094066381454468,
"reward_std": 0.9717631787061691,
"rewards/cosine_scaled_reward": 0.2241477482020855,
"rewards/format_reward": 0.3611111231148243,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 1594.3472290039062,
"epoch": 0.1414184701092779,
"grad_norm": 0.40359950065612793,
"kl": 0.063720703125,
"learning_rate": 8.625962667065487e-07,
"loss": 0.3409,
"reward": 0.8673667535185814,
"reward_std": 0.8468609303236008,
"rewards/cosine_scaled_reward": 0.30868337862193584,
"rewards/format_reward": 0.2500000074505806,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 1543.9027404785156,
"epoch": 0.14227555174630382,
"grad_norm": 0.4297288656234741,
"kl": 0.07159423828125,
"learning_rate": 8.603287946810513e-07,
"loss": 0.1784,
"reward": 1.0218196213245392,
"reward_std": 0.8119381964206696,
"rewards/cosine_scaled_reward": 0.3650764860212803,
"rewards/format_reward": 0.2916666716337204,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 1453.4722290039062,
"epoch": 0.14313263338332977,
"grad_norm": 0.6794213652610779,
"kl": 0.08380126953125,
"learning_rate": 8.580461976679099e-07,
"loss": 0.304,
"reward": 1.2251211404800415,
"reward_std": 0.7566522508859634,
"rewards/cosine_scaled_reward": 0.43200499936938286,
"rewards/format_reward": 0.3611111231148243,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 1612.1527862548828,
"epoch": 0.14398971502035568,
"grad_norm": 0.6980922222137451,
"kl": 0.05419921875,
"learning_rate": 8.557485869176825e-07,
"loss": 0.1675,
"reward": 0.8015574552118778,
"reward_std": 0.9579913914203644,
"rewards/cosine_scaled_reward": 0.24800091050565243,
"rewards/format_reward": 0.30555556528270245,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 1692.9028015136719,
"epoch": 0.14484679665738162,
"grad_norm": 0.8601597547531128,
"kl": 0.095458984375,
"learning_rate": 8.534360744126753e-07,
"loss": 0.1693,
"reward": 0.3856995478272438,
"reward_std": 0.7862947285175323,
"rewards/cosine_scaled_reward": 0.012294212356209755,
"rewards/format_reward": 0.361111119389534,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 1859.0416870117188,
"epoch": 0.14570387829440754,
"grad_norm": 0.4334951937198639,
"kl": 0.05572509765625,
"learning_rate": 8.511087728614862e-07,
"loss": 0.1955,
"reward": 0.8502315804362297,
"reward_std": 0.9405944645404816,
"rewards/cosine_scaled_reward": 0.25844913721084595,
"rewards/format_reward": 0.3333333432674408,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 1359.763916015625,
"epoch": 0.14656095993143348,
"grad_norm": 1.0471651554107666,
"kl": 0.1002197265625,
"learning_rate": 8.487667956935087e-07,
"loss": 0.4846,
"reward": 0.9384964210912585,
"reward_std": 0.8249912112951279,
"rewards/cosine_scaled_reward": 0.274803776293993,
"rewards/format_reward": 0.3888888955116272,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 1483.1250305175781,
"epoch": 0.1474180415684594,
"grad_norm": 0.46483904123306274,
"kl": 0.07452392578125,
"learning_rate": 8.464102570534061e-07,
"loss": 0.1276,
"reward": 1.1911370605230331,
"reward_std": 0.7849500328302383,
"rewards/cosine_scaled_reward": 0.4289018586277962,
"rewards/format_reward": 0.33333333767950535,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 1719.3750305175781,
"epoch": 0.14827512320548533,
"grad_norm": 0.6636127233505249,
"kl": 0.08563232421875,
"learning_rate": 8.440392717955475e-07,
"loss": 0.3273,
"reward": 0.640670370310545,
"reward_std": 0.7593775987625122,
"rewards/cosine_scaled_reward": 0.13977964222431183,
"rewards/format_reward": 0.3611111231148243,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 1732.0000305175781,
"epoch": 0.14913220484251125,
"grad_norm": 1.571647047996521,
"kl": 0.093109130859375,
"learning_rate": 8.416539554784089e-07,
"loss": 0.1424,
"reward": 0.7180789969861507,
"reward_std": 0.7864874973893166,
"rewards/cosine_scaled_reward": 0.18542836606502533,
"rewards/format_reward": 0.3472222276031971,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 1991.75,
"epoch": 0.1499892864795372,
"grad_norm": 0.5590563416481018,
"kl": 0.0755615234375,
"learning_rate": 8.392544243589427e-07,
"loss": 0.1139,
"reward": 0.528855599462986,
"reward_std": 0.9861510694026947,
"rewards/cosine_scaled_reward": 0.09081667335703969,
"rewards/format_reward": 0.3472222276031971,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 1684.8750305175781,
"epoch": 0.1508463681165631,
"grad_norm": 0.4577758014202118,
"kl": 0.0635986328125,
"learning_rate": 8.368407953869103e-07,
"loss": 0.2379,
"reward": 0.7251470182090998,
"reward_std": 0.8135327100753784,
"rewards/cosine_scaled_reward": 0.1959068402647972,
"rewards/format_reward": 0.3333333395421505,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 1634.75,
"epoch": 0.15170344975358904,
"grad_norm": 1.0375832319259644,
"kl": 0.09271240234375,
"learning_rate": 8.344131861991828e-07,
"loss": 0.24,
"reward": 0.805720079690218,
"reward_std": 0.8198679685592651,
"rewards/cosine_scaled_reward": 0.1806377861648798,
"rewards/format_reward": 0.4444444477558136,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 1334.4583740234375,
"epoch": 0.15256053139061496,
"grad_norm": 1.7115483283996582,
"kl": 0.14361572265625,
"learning_rate": 8.319717151140072e-07,
"loss": -0.0013,
"reward": 0.6213032007217407,
"reward_std": 0.733954668045044,
"rewards/cosine_scaled_reward": 0.19954046607017517,
"rewards/format_reward": 0.2222222276031971,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 1860.3056030273438,
"epoch": 0.15341761302764087,
"grad_norm": 0.5624024271965027,
"kl": 0.071868896484375,
"learning_rate": 8.295165011252396e-07,
"loss": 0.074,
"reward": 0.5974816232919693,
"reward_std": 0.7972533106803894,
"rewards/cosine_scaled_reward": 0.14596302818972617,
"rewards/format_reward": 0.305555559694767,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 1709.4583435058594,
"epoch": 0.1542746946646668,
"grad_norm": 1.1149603128433228,
"kl": 0.0904541015625,
"learning_rate": 8.270476638965461e-07,
"loss": 0.1984,
"reward": 0.7268609385937452,
"reward_std": 0.8206999450922012,
"rewards/cosine_scaled_reward": 0.21065270341932774,
"rewards/format_reward": 0.3055555634200573,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 1504.4861145019531,
"epoch": 0.15513177630169273,
"grad_norm": 0.8208626508712769,
"kl": 0.10107421875,
"learning_rate": 8.245653237555705e-07,
"loss": 0.2409,
"reward": 0.8135174959897995,
"reward_std": 0.8603949248790741,
"rewards/cosine_scaled_reward": 0.2192587312310934,
"rewards/format_reward": 0.3750000037252903,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 1832.2221984863281,
"epoch": 0.15598885793871867,
"grad_norm": 0.7162370681762695,
"kl": 0.138916015625,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0797,
"reward": 0.7859554402530193,
"reward_std": 0.8516587615013123,
"rewards/cosine_scaled_reward": 0.24019994214177132,
"rewards/format_reward": 0.3055555634200573,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 1286.0000305175781,
"epoch": 0.15684593957574458,
"grad_norm": 2.3148720264434814,
"kl": 0.1636962890625,
"learning_rate": 8.195606193320136e-07,
"loss": 0.2503,
"reward": 0.9098443686962128,
"reward_std": 0.9491814821958542,
"rewards/cosine_scaled_reward": 0.2604777254164219,
"rewards/format_reward": 0.3888888955116272,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 1429.7361450195312,
"epoch": 0.15770302121277052,
"grad_norm": 0.5743198990821838,
"kl": 0.084716796875,
"learning_rate": 8.170384989716657e-07,
"loss": -0.0243,
"reward": 0.716302827000618,
"reward_std": 0.9275215268135071,
"rewards/cosine_scaled_reward": 0.17759587243199348,
"rewards/format_reward": 0.361111119389534,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 1653.9444885253906,
"epoch": 0.15856010284979644,
"grad_norm": 1.1362574100494385,
"kl": 0.10540771484375,
"learning_rate": 8.145033635316128e-07,
"loss": 0.152,
"reward": 0.4382926889229566,
"reward_std": 0.8508107364177704,
"rewards/cosine_scaled_reward": 0.017757446970790625,
"rewards/format_reward": 0.4027777835726738,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 927.8750152587891,
"epoch": 0.15941718448682238,
"grad_norm": 2.538083553314209,
"kl": 0.1431884765625,
"learning_rate": 8.119553365707802e-07,
"loss": 0.1705,
"reward": 1.315717488527298,
"reward_std": 0.922233521938324,
"rewards/cosine_scaled_reward": 0.44258103519678116,
"rewards/format_reward": 0.4305555671453476,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 1182.6666564941406,
"epoch": 0.1602742661238483,
"grad_norm": 1.7073755264282227,
"kl": 0.180908203125,
"learning_rate": 8.093945422764069e-07,
"loss": 0.2743,
"reward": 1.043900977820158,
"reward_std": 0.8111362755298615,
"rewards/cosine_scaled_reward": 0.36917273700237274,
"rewards/format_reward": 0.305555559694767,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 922.5416641235352,
"epoch": 0.16113134776087423,
"grad_norm": 1.0958812236785889,
"kl": 0.1793212890625,
"learning_rate": 8.068211054579943e-07,
"loss": 0.4721,
"reward": 0.8457982540130615,
"reward_std": 1.025609239935875,
"rewards/cosine_scaled_reward": 0.23539912048727274,
"rewards/format_reward": 0.3750000074505806,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 1749.4583740234375,
"epoch": 0.16198842939790015,
"grad_norm": 0.6942291259765625,
"kl": 0.0941162109375,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0101,
"reward": 0.9162072837352753,
"reward_std": 0.9123063534498215,
"rewards/cosine_scaled_reward": 0.24282585456967354,
"rewards/format_reward": 0.430555559694767,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 1193.9861297607422,
"epoch": 0.1628455110349261,
"grad_norm": 1.193941354751587,
"kl": 0.1387939453125,
"learning_rate": 8.01636806561836e-07,
"loss": 0.5036,
"reward": 1.0086410311050713,
"reward_std": 0.8937728404998779,
"rewards/cosine_scaled_reward": 0.3237649239599705,
"rewards/format_reward": 0.361111119389534,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 1352.2638854980469,
"epoch": 0.163702592671952,
"grad_norm": 0.7374529242515564,
"kl": 0.163818359375,
"learning_rate": 7.990261971595048e-07,
"loss": 0.2226,
"reward": 0.4831864982843399,
"reward_std": 0.8398203700780869,
"rewards/cosine_scaled_reward": 0.08881546184420586,
"rewards/format_reward": 0.3055555634200573,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 1205.0278015136719,
"epoch": 0.16455967430897794,
"grad_norm": 0.6933061480522156,
"kl": 0.205322265625,
"learning_rate": 7.964034505716476e-07,
"loss": 0.1316,
"reward": 1.1680071130394936,
"reward_std": 0.7614214420318604,
"rewards/cosine_scaled_reward": 0.3687257831916213,
"rewards/format_reward": 0.430555559694767,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 979.9166564941406,
"epoch": 0.16541675594600386,
"grad_norm": 1.3442184925079346,
"kl": 0.16552734375,
"learning_rate": 7.93768694627233e-07,
"loss": 0.4192,
"reward": 1.7566750347614288,
"reward_std": 0.8575289100408554,
"rewards/cosine_scaled_reward": 0.6700042113661766,
"rewards/format_reward": 0.416666679084301,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 877.2916870117188,
"epoch": 0.16627383758302977,
"grad_norm": 1.4500232934951782,
"kl": 0.2979736328125,
"learning_rate": 7.911220577405484e-07,
"loss": 0.3993,
"reward": 0.9797601252794266,
"reward_std": 0.8100396543741226,
"rewards/cosine_scaled_reward": 0.2954356314148754,
"rewards/format_reward": 0.3888889029622078,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 1305.9028015136719,
"epoch": 0.1671309192200557,
"grad_norm": 2.036522150039673,
"kl": 0.154541015625,
"learning_rate": 7.884636689049422e-07,
"loss": 0.4356,
"reward": 0.4684947496280074,
"reward_std": 0.6030187755823135,
"rewards/cosine_scaled_reward": 0.025914038997143507,
"rewards/format_reward": 0.4166666716337204,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 1123.9027709960938,
"epoch": 0.16798800085708163,
"grad_norm": 2.62882137298584,
"kl": 0.313232421875,
"learning_rate": 7.857936576865356e-07,
"loss": 0.1435,
"reward": 0.6905184164643288,
"reward_std": 0.7359469905495644,
"rewards/cosine_scaled_reward": 0.1508147695567459,
"rewards/format_reward": 0.3888888992369175,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 882.1805572509766,
"epoch": 0.16884508249410757,
"grad_norm": 1.288465976715088,
"kl": 0.25537109375,
"learning_rate": 7.831121542179086e-07,
"loss": 0.4079,
"reward": 1.1543247550725937,
"reward_std": 0.9211147129535675,
"rewards/cosine_scaled_reward": 0.41744012013077736,
"rewards/format_reward": 0.31944445334374905,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 1125.5555877685547,
"epoch": 0.16970216413113348,
"grad_norm": 1.6123173236846924,
"kl": 0.3720703125,
"learning_rate": 7.804192891917571e-07,
"loss": 0.4226,
"reward": 0.7712806100025773,
"reward_std": 0.8221293687820435,
"rewards/cosine_scaled_reward": 0.17036251351237297,
"rewards/format_reward": 0.430555559694767,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 1190.1666717529297,
"epoch": 0.17055924576815942,
"grad_norm": 1.612281084060669,
"kl": 0.4462890625,
"learning_rate": 7.777151938545235e-07,
"loss": 0.2878,
"reward": 0.7899716692045331,
"reward_std": 0.8678261786699295,
"rewards/cosine_scaled_reward": 0.23526360094547272,
"rewards/format_reward": 0.3194444514811039,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 962.3888854980469,
"epoch": 0.17141632740518534,
"grad_norm": 1.62675940990448,
"kl": 0.43115234375,
"learning_rate": 7.75e-07,
"loss": 0.2341,
"reward": 0.8529508542269468,
"reward_std": 0.890992283821106,
"rewards/cosine_scaled_reward": 0.25286430679261684,
"rewards/format_reward": 0.3472222313284874,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 556.6250190734863,
"epoch": 0.17227340904221128,
"grad_norm": 1.815177083015442,
"kl": 0.45654296875,
"learning_rate": 7.72273839962904e-07,
"loss": 0.4604,
"reward": 1.0454039722681046,
"reward_std": 0.7526903375983238,
"rewards/cosine_scaled_reward": 0.3560353182256222,
"rewards/format_reward": 0.3333333395421505,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 1524.1944580078125,
"epoch": 0.1731304906792372,
"grad_norm": 1.6689302921295166,
"kl": 0.484375,
"learning_rate": 7.695368466124296e-07,
"loss": 0.1883,
"reward": 0.7762234956026077,
"reward_std": 0.7869797348976135,
"rewards/cosine_scaled_reward": 0.21450063399970531,
"rewards/format_reward": 0.3472222276031971,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 933.7361145019531,
"epoch": 0.17398757231626313,
"grad_norm": 1.4884265661239624,
"kl": 0.51171875,
"learning_rate": 7.667891533457718e-07,
"loss": 0.5092,
"reward": 1.1106317043304443,
"reward_std": 0.9267386496067047,
"rewards/cosine_scaled_reward": 0.34698252752423286,
"rewards/format_reward": 0.416666679084301,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 1146.9583282470703,
"epoch": 0.17484465395328905,
"grad_norm": 2.8435497283935547,
"kl": 0.5849609375,
"learning_rate": 7.640308940816239e-07,
"loss": 0.3038,
"reward": 0.4783020354807377,
"reward_std": 0.7846653908491135,
"rewards/cosine_scaled_reward": 0.12803990789689124,
"rewards/format_reward": 0.22222222946584225,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 911.7639007568359,
"epoch": 0.175701735590315,
"grad_norm": 2.8843140602111816,
"kl": 0.41357421875,
"learning_rate": 7.612622032536507e-07,
"loss": 0.2368,
"reward": 0.8713492751121521,
"reward_std": 0.9335136562585831,
"rewards/cosine_scaled_reward": 0.23428576067090034,
"rewards/format_reward": 0.4027777872979641,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 700.7361068725586,
"epoch": 0.1765588172273409,
"grad_norm": 2.3919317722320557,
"kl": 0.548828125,
"learning_rate": 7.584832158039378e-07,
"loss": 0.1555,
"reward": 0.5804239325225353,
"reward_std": 0.8797028362751007,
"rewards/cosine_scaled_reward": 0.13048974284902215,
"rewards/format_reward": 0.3194444514811039,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 1057.2916870117188,
"epoch": 0.17741589886436684,
"grad_norm": 2.6160635948181152,
"kl": 0.60546875,
"learning_rate": 7.556940671764124e-07,
"loss": 0.3724,
"reward": 0.591970931738615,
"reward_std": 0.928965374827385,
"rewards/cosine_scaled_reward": 0.13626324571669102,
"rewards/format_reward": 0.3194444552063942,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 1054.2222442626953,
"epoch": 0.17827298050139276,
"grad_norm": 4.055714130401611,
"kl": 0.6064453125,
"learning_rate": 7.528948933102438e-07,
"loss": 0.1023,
"reward": 0.5747384652495384,
"reward_std": 0.835850402712822,
"rewards/cosine_scaled_reward": 0.13459146209061146,
"rewards/format_reward": 0.3055555634200573,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 672.6527862548828,
"epoch": 0.17913006213841867,
"grad_norm": 2.89412260055542,
"kl": 0.77734375,
"learning_rate": 7.500858306332172e-07,
"loss": 0.2746,
"reward": 0.5216602731961757,
"reward_std": 0.8375790268182755,
"rewards/cosine_scaled_reward": 0.1080523431301117,
"rewards/format_reward": 0.3055555671453476,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 1029.6527709960938,
"epoch": 0.1799871437754446,
"grad_norm": 2.079103469848633,
"kl": 0.5517578125,
"learning_rate": 7.472670160550848e-07,
"loss": 0.3516,
"reward": 0.3450094065628946,
"reward_std": 0.6973802000284195,
"rewards/cosine_scaled_reward": 0.03361581452190876,
"rewards/format_reward": 0.2777777835726738,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 815.6805725097656,
"epoch": 0.18084422541247053,
"grad_norm": 2.8681375980377197,
"kl": 0.53173828125,
"learning_rate": 7.444385869608921e-07,
"loss": 0.4467,
"reward": 0.2037402605637908,
"reward_std": 0.7096255868673325,
"rewards/cosine_scaled_reward": -0.050907641649246216,
"rewards/format_reward": 0.30555556155741215,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 1077.4166793823242,
"epoch": 0.18170130704949647,
"grad_norm": 1.8894522190093994,
"kl": 0.513671875,
"learning_rate": 7.416006812042827e-07,
"loss": 0.1559,
"reward": 0.2127110045403242,
"reward_std": 0.7653373330831528,
"rewards/cosine_scaled_reward": -0.018644492141902447,
"rewards/format_reward": 0.2500000046566129,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 760.8888702392578,
"epoch": 0.18255838868652238,
"grad_norm": 2.848653793334961,
"kl": 0.6552734375,
"learning_rate": 7.387534371007797e-07,
"loss": 0.257,
"reward": 0.4657673854380846,
"reward_std": 0.7699891328811646,
"rewards/cosine_scaled_reward": 0.09399479907006025,
"rewards/format_reward": 0.2777777872979641,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 1346.2916870117188,
"epoch": 0.18341547032354832,
"grad_norm": 3.975214958190918,
"kl": 0.53173828125,
"learning_rate": 7.358969934210438e-07,
"loss": 0.2101,
"reward": 0.010750308400020003,
"reward_std": 0.7175936102867126,
"rewards/cosine_scaled_reward": -0.1265692890738137,
"rewards/format_reward": 0.2638888955116272,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 903.7500152587891,
"epoch": 0.18427255196057424,
"grad_norm": 1.7518130540847778,
"kl": 0.478515625,
"learning_rate": 7.330314893841101e-07,
"loss": 0.4712,
"reward": 0.6189166195690632,
"reward_std": 0.7413264513015747,
"rewards/cosine_scaled_reward": 0.18445828184485435,
"rewards/format_reward": 0.2500000074505806,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 569.4166717529297,
"epoch": 0.18512963359760018,
"grad_norm": 2.782475709915161,
"kl": 0.609375,
"learning_rate": 7.301570646506027e-07,
"loss": 0.1972,
"reward": 0.7244105041027069,
"reward_std": 0.7233957052230835,
"rewards/cosine_scaled_reward": 0.12609414962935261,
"rewards/format_reward": 0.4722222238779068,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 753.7361145019531,
"epoch": 0.1859867152346261,
"grad_norm": 3.62007212638855,
"kl": 0.5966796875,
"learning_rate": 7.27273859315928e-07,
"loss": 0.1095,
"reward": 0.7907614503055811,
"reward_std": 0.9498309046030045,
"rewards/cosine_scaled_reward": 0.2564918287098408,
"rewards/format_reward": 0.2777777872979641,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 846.0694427490234,
"epoch": 0.18684379687165203,
"grad_norm": 6.74003791809082,
"kl": 0.56884765625,
"learning_rate": 7.243820139034464e-07,
"loss": 0.369,
"reward": 0.3798181489109993,
"reward_std": 0.7759093195199966,
"rewards/cosine_scaled_reward": 0.002409084467217326,
"rewards/format_reward": 0.3750000037252903,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 657.1527709960938,
"epoch": 0.18770087850867795,
"grad_norm": 8.006071090698242,
"kl": 0.64794921875,
"learning_rate": 7.214816693576234e-07,
"loss": 0.39,
"reward": 0.6195143777877092,
"reward_std": 0.9361841827630997,
"rewards/cosine_scaled_reward": 0.13614607648923993,
"rewards/format_reward": 0.3472222313284874,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 488.31944274902344,
"epoch": 0.1885579601457039,
"grad_norm": 6.052534580230713,
"kl": 0.662109375,
"learning_rate": 7.185729670371604e-07,
"loss": 0.4044,
"reward": 0.591563917696476,
"reward_std": 0.9269620776176453,
"rewards/cosine_scaled_reward": 0.13605972938239574,
"rewards/format_reward": 0.3194444552063942,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 640.6805648803711,
"epoch": 0.1894150417827298,
"grad_norm": 3.168754816055298,
"kl": 0.8017578125,
"learning_rate": 7.156560487081051e-07,
"loss": 0.3799,
"reward": 0.4200323410332203,
"reward_std": 0.8377434760332108,
"rewards/cosine_scaled_reward": 0.09890506649389863,
"rewards/format_reward": 0.2222222276031971,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 715.4305572509766,
"epoch": 0.19027212341975575,
"grad_norm": 3.2562437057495117,
"kl": 0.841796875,
"learning_rate": 7.127310565369415e-07,
"loss": 0.1672,
"reward": 0.4836801737546921,
"reward_std": 0.7787017673254013,
"rewards/cosine_scaled_reward": 0.09600674454122782,
"rewards/format_reward": 0.2916666716337204,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 603.6527786254883,
"epoch": 0.19112920505678166,
"grad_norm": 4.294973373413086,
"kl": 0.8857421875,
"learning_rate": 7.097981330836616e-07,
"loss": 0.3874,
"reward": 0.41634054901078343,
"reward_std": 0.7685736864805222,
"rewards/cosine_scaled_reward": 0.06233693804824725,
"rewards/format_reward": 0.2916666753590107,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 615.4027786254883,
"epoch": 0.19198628669380757,
"grad_norm": 5.623297691345215,
"kl": 0.9609375,
"learning_rate": 7.068574212948169e-07,
"loss": 0.2779,
"reward": 0.8603571616113186,
"reward_std": 0.9389389455318451,
"rewards/cosine_scaled_reward": 0.2426785994321108,
"rewards/format_reward": 0.3750000111758709,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 590.1666870117188,
"epoch": 0.19284336833083351,
"grad_norm": 9.251201629638672,
"kl": 0.970703125,
"learning_rate": 7.039090644965509e-07,
"loss": 0.3814,
"reward": 0.4910267172381282,
"reward_std": 0.7892083153128624,
"rewards/cosine_scaled_reward": 0.09968002699315548,
"rewards/format_reward": 0.2916666716337204,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 364.7916679382324,
"epoch": 0.19370044996785943,
"grad_norm": 3.1022725105285645,
"kl": 0.87890625,
"learning_rate": 7.009532063876148e-07,
"loss": 0.2221,
"reward": 0.5126173943281174,
"reward_std": 0.6831100434064865,
"rewards/cosine_scaled_reward": 0.12436424475163221,
"rewards/format_reward": 0.2638888992369175,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 616.5555572509766,
"epoch": 0.19455753160488537,
"grad_norm": 2.3377246856689453,
"kl": 0.9384765625,
"learning_rate": 6.979899910323624e-07,
"loss": 0.3176,
"reward": 0.5720387771725655,
"reward_std": 0.6217157021164894,
"rewards/cosine_scaled_reward": 0.1054638409987092,
"rewards/format_reward": 0.361111119389534,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 540.0972061157227,
"epoch": 0.19541461324191128,
"grad_norm": 6.154067039489746,
"kl": 0.8564453125,
"learning_rate": 6.950195628537299e-07,
"loss": 0.4069,
"reward": 0.43204435613006353,
"reward_std": 0.7807599157094955,
"rewards/cosine_scaled_reward": 0.08407774195075035,
"rewards/format_reward": 0.2638888955116272,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 572.2916717529297,
"epoch": 0.19627169487893723,
"grad_norm": 2.6342246532440186,
"kl": 0.923828125,
"learning_rate": 6.920420666261961e-07,
"loss": 0.2841,
"reward": 0.6314438227564096,
"reward_std": 0.8878332078456879,
"rewards/cosine_scaled_reward": 0.13516635773703456,
"rewards/format_reward": 0.361111119389534,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 488.7638854980469,
"epoch": 0.19712877651596314,
"grad_norm": 2.928675651550293,
"kl": 1.046875,
"learning_rate": 6.890576474687263e-07,
"loss": 0.2236,
"reward": 0.36511653289198875,
"reward_std": 0.7729461342096329,
"rewards/cosine_scaled_reward": 0.029780485958326608,
"rewards/format_reward": 0.3055555634200573,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 419.2638931274414,
"epoch": 0.19798585815298908,
"grad_norm": 3.3741118907928467,
"kl": 0.9599609375,
"learning_rate": 6.860664508377001e-07,
"loss": 0.2546,
"reward": 0.7524889260530472,
"reward_std": 0.970270186662674,
"rewards/cosine_scaled_reward": 0.20957778953015804,
"rewards/format_reward": 0.3333333395421505,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 457.93055725097656,
"epoch": 0.198842939790015,
"grad_norm": 3.406334638595581,
"kl": 1.1494140625,
"learning_rate": 6.83068622519821e-07,
"loss": 0.3246,
"reward": 0.9599852412939072,
"reward_std": 0.8014604300260544,
"rewards/cosine_scaled_reward": 0.29943707399070263,
"rewards/format_reward": 0.3611111156642437,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 593.0833358764648,
"epoch": 0.19970002142704094,
"grad_norm": 2.4271864891052246,
"kl": 1.279296875,
"learning_rate": 6.800643086250121e-07,
"loss": 0.3557,
"reward": 0.4210415966808796,
"reward_std": 0.6882978901267052,
"rewards/cosine_scaled_reward": 0.10635412717238069,
"rewards/format_reward": 0.20833333861082792,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 488.9583435058594,
"epoch": 0.20055710306406685,
"grad_norm": 2.523871660232544,
"kl": 1.107421875,
"learning_rate": 6.770536555792944e-07,
"loss": 0.4558,
"reward": 0.8532587476074696,
"reward_std": 0.8488901779055595,
"rewards/cosine_scaled_reward": 0.22524047270417213,
"rewards/format_reward": 0.4027777872979641,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 436.6805725097656,
"epoch": 0.2014141847010928,
"grad_norm": 4.82633113861084,
"kl": 0.9912109375,
"learning_rate": 6.740368101176495e-07,
"loss": 0.2315,
"reward": 0.7889588698744774,
"reward_std": 0.942963719367981,
"rewards/cosine_scaled_reward": 0.2069794237613678,
"rewards/format_reward": 0.3750000074505806,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 454.0416793823242,
"epoch": 0.2022712663381187,
"grad_norm": 7.737008094787598,
"kl": 1.0390625,
"learning_rate": 6.710139192768694e-07,
"loss": 0.4408,
"reward": 0.2776918327435851,
"reward_std": 0.6808639168739319,
"rewards/cosine_scaled_reward": -4.296889528632164e-05,
"rewards/format_reward": 0.2777777835726738,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 353.5138931274414,
"epoch": 0.20312834797514465,
"grad_norm": 10.61968994140625,
"kl": 0.990234375,
"learning_rate": 6.679851303883891e-07,
"loss": 0.3745,
"reward": 0.9284175038337708,
"reward_std": 0.9358415603637695,
"rewards/cosine_scaled_reward": 0.3322643097490072,
"rewards/format_reward": 0.26388889364898205,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 489.4722213745117,
"epoch": 0.20398542961217056,
"grad_norm": 5.041754245758057,
"kl": 1.13671875,
"learning_rate": 6.649505910711058e-07,
"loss": 0.3936,
"reward": 0.8772722482681274,
"reward_std": 1.1123964041471481,
"rewards/cosine_scaled_reward": 0.2719694413244724,
"rewards/format_reward": 0.3333333395421505,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 384.00000762939453,
"epoch": 0.20484251124919647,
"grad_norm": 3.059415817260742,
"kl": 1.005859375,
"learning_rate": 6.619104492241847e-07,
"loss": 0.3146,
"reward": 0.7366920709609985,
"reward_std": 0.9259907156229019,
"rewards/cosine_scaled_reward": 0.22945713996887207,
"rewards/format_reward": 0.2777777872979641,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 419.1388931274414,
"epoch": 0.20569959288622242,
"grad_norm": 10.234456062316895,
"kl": 1.1953125,
"learning_rate": 6.588648530198504e-07,
"loss": 0.3922,
"reward": 0.6590520106256008,
"reward_std": 0.7455599829554558,
"rewards/cosine_scaled_reward": 0.14202599972486496,
"rewards/format_reward": 0.37500000931322575,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 401.2638854980469,
"epoch": 0.20655667452324833,
"grad_norm": 2.2838265895843506,
"kl": 1.1591796875,
"learning_rate": 6.558139508961654e-07,
"loss": 0.2743,
"reward": 0.7352767586708069,
"reward_std": 0.8139046281576157,
"rewards/cosine_scaled_reward": 0.13847169885411859,
"rewards/format_reward": 0.4583333432674408,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 407.9166793823242,
"epoch": 0.20741375616027427,
"grad_norm": 4.2408246994018555,
"kl": 0.982421875,
"learning_rate": 6.527578915497951e-07,
"loss": 0.2713,
"reward": 0.9523800164461136,
"reward_std": 1.001899242401123,
"rewards/cosine_scaled_reward": 0.28869001008570194,
"rewards/format_reward": 0.375,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 439.652774810791,
"epoch": 0.20827083779730018,
"grad_norm": 5.787041664123535,
"kl": 1.49609375,
"learning_rate": 6.496968239287603e-07,
"loss": 0.1675,
"reward": 0.5723136551678181,
"reward_std": 0.7788431346416473,
"rewards/cosine_scaled_reward": 0.15421238262206316,
"rewards/format_reward": 0.2638888955116272,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 398.08333587646484,
"epoch": 0.20912791943432613,
"grad_norm": 2.507175922393799,
"kl": 1.6328125,
"learning_rate": 6.466308972251785e-07,
"loss": 0.3818,
"reward": 1.0151595324277878,
"reward_std": 1.0486897379159927,
"rewards/cosine_scaled_reward": 0.285357553511858,
"rewards/format_reward": 0.4444444514811039,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 364.277774810791,
"epoch": 0.20998500107135204,
"grad_norm": 8.139939308166504,
"kl": 1.345703125,
"learning_rate": 6.435602608679916e-07,
"loss": 0.1818,
"reward": 0.7046700529754162,
"reward_std": 0.7369572669267654,
"rewards/cosine_scaled_reward": 0.16483502835035324,
"rewards/format_reward": 0.3750000074505806,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 460.98612213134766,
"epoch": 0.21084208270837798,
"grad_norm": 2.476030111312866,
"kl": 1.345703125,
"learning_rate": 6.404850645156841e-07,
"loss": 0.3358,
"reward": 0.342040394898504,
"reward_std": 0.691289097070694,
"rewards/cosine_scaled_reward": 0.07379795983433723,
"rewards/format_reward": 0.19444444961845875,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 246.83333587646484,
"epoch": 0.2116991643454039,
"grad_norm": 6.393570899963379,
"kl": 1.162109375,
"learning_rate": 6.374054580489873e-07,
"loss": 0.2145,
"reward": 1.1625754237174988,
"reward_std": 0.9642776250839233,
"rewards/cosine_scaled_reward": 0.3937877155840397,
"rewards/format_reward": 0.3750000074505806,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 441.25000762939453,
"epoch": 0.21255624598242984,
"grad_norm": 2.3917956352233887,
"kl": 1.556640625,
"learning_rate": 6.343215915635761e-07,
"loss": 0.3667,
"reward": 0.23520513158291578,
"reward_std": 0.5816171392798424,
"rewards/cosine_scaled_reward": 0.00649144034832716,
"rewards/format_reward": 0.2222222313284874,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 406.94444274902344,
"epoch": 0.21341332761945575,
"grad_norm": 2.021603584289551,
"kl": 1.32421875,
"learning_rate": 6.31233615362752e-07,
"loss": 0.3835,
"reward": 0.6145300641655922,
"reward_std": 0.8172438591718674,
"rewards/cosine_scaled_reward": 0.0919872522354126,
"rewards/format_reward": 0.430555559694767,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 545.9027786254883,
"epoch": 0.2142704092564817,
"grad_norm": 5.071481227874756,
"kl": 1.462890625,
"learning_rate": 6.281416799501187e-07,
"loss": 0.3667,
"reward": 0.5654324060305953,
"reward_std": 0.7506130635738373,
"rewards/cosine_scaled_reward": 0.12299397867172956,
"rewards/format_reward": 0.3194444514811039,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 295.27778244018555,
"epoch": 0.2151274908935076,
"grad_norm": 6.028750896453857,
"kl": 1.396484375,
"learning_rate": 6.25045936022246e-07,
"loss": 0.2881,
"reward": 0.6139978468418121,
"reward_std": 0.8909667134284973,
"rewards/cosine_scaled_reward": 0.09866558946669102,
"rewards/format_reward": 0.4166666679084301,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 455.0138854980469,
"epoch": 0.21598457253053352,
"grad_norm": 5.143624305725098,
"kl": 1.3984375,
"learning_rate": 6.219465344613258e-07,
"loss": 0.2564,
"reward": 0.8803071463480592,
"reward_std": 0.8127338886260986,
"rewards/cosine_scaled_reward": 0.23182022757828236,
"rewards/format_reward": 0.416666679084301,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 319.5833396911621,
"epoch": 0.21684165416755946,
"grad_norm": 2.218837261199951,
"kl": 1.318359375,
"learning_rate": 6.188436263278172e-07,
"loss": 0.2489,
"reward": 0.7824475020170212,
"reward_std": 0.8558803498744965,
"rewards/cosine_scaled_reward": 0.22455710358917713,
"rewards/format_reward": 0.3333333395421505,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 351.58333587646484,
"epoch": 0.21769873580458537,
"grad_norm": 2.8312885761260986,
"kl": 1.45703125,
"learning_rate": 6.157373628530852e-07,
"loss": 0.329,
"reward": 0.6108426973223686,
"reward_std": 0.7599765211343765,
"rewards/cosine_scaled_reward": 0.14569912757724524,
"rewards/format_reward": 0.3194444477558136,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 357.66666412353516,
"epoch": 0.21855581744161132,
"grad_norm": 3.159221649169922,
"kl": 1.310546875,
"learning_rate": 6.126278954320294e-07,
"loss": 0.3418,
"reward": 0.8543988540768623,
"reward_std": 0.8301258683204651,
"rewards/cosine_scaled_reward": 0.22581054456532001,
"rewards/format_reward": 0.4027777798473835,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 434.9166679382324,
"epoch": 0.21941289907863723,
"grad_norm": 2.6879823207855225,
"kl": 1.517578125,
"learning_rate": 6.095153756157051e-07,
"loss": 0.2982,
"reward": 0.6386940572410822,
"reward_std": 0.7044311463832855,
"rewards/cosine_scaled_reward": 0.13879146426916122,
"rewards/format_reward": 0.3611111231148243,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 361.11112213134766,
"epoch": 0.22026998071566317,
"grad_norm": 3.5288994312286377,
"kl": 1.27734375,
"learning_rate": 6.06399955103937e-07,
"loss": 0.2277,
"reward": 0.813978984951973,
"reward_std": 0.7525499165058136,
"rewards/cosine_scaled_reward": 0.26810058392584324,
"rewards/format_reward": 0.2777777835726738,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 364.84722900390625,
"epoch": 0.22112706235268909,
"grad_norm": 2.3899548053741455,
"kl": 1.326171875,
"learning_rate": 6.032817857379256e-07,
"loss": 0.2684,
"reward": 0.7509399205446243,
"reward_std": 0.7508059442043304,
"rewards/cosine_scaled_reward": 0.20880330353975296,
"rewards/format_reward": 0.3333333395421505,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 361.50000762939453,
"epoch": 0.22198414398971503,
"grad_norm": 5.087327480316162,
"kl": 1.62109375,
"learning_rate": 6.001610194928464e-07,
"loss": 0.3756,
"reward": 0.9518274813890457,
"reward_std": 0.8851035535335541,
"rewards/cosine_scaled_reward": 0.29535816609859467,
"rewards/format_reward": 0.36111112125217915,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 335.4027786254883,
"epoch": 0.22284122562674094,
"grad_norm": 2.6951775550842285,
"kl": 1.43359375,
"learning_rate": 5.97037808470444e-07,
"loss": 0.2947,
"reward": 0.5025412552058697,
"reward_std": 0.7434158027172089,
"rewards/cosine_scaled_reward": 0.09154839906841516,
"rewards/format_reward": 0.3194444552063942,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 342.4861068725586,
"epoch": 0.22369830726376688,
"grad_norm": 3.2796826362609863,
"kl": 1.1669921875,
"learning_rate": 5.939123048916173e-07,
"loss": 0.3505,
"reward": 0.5631570406258106,
"reward_std": 0.8579424917697906,
"rewards/cosine_scaled_reward": 0.09407851565629244,
"rewards/format_reward": 0.3750000037252903,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 319.04166412353516,
"epoch": 0.2245553889007928,
"grad_norm": 5.055459499359131,
"kl": 1.193359375,
"learning_rate": 5.907846610890011e-07,
"loss": 0.2438,
"reward": 0.5839212201535702,
"reward_std": 0.797115832567215,
"rewards/cosine_scaled_reward": 0.111405044561252,
"rewards/format_reward": 0.3611111156642437,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 287.4166679382324,
"epoch": 0.22541247053781874,
"grad_norm": 4.351484775543213,
"kl": 1.130859375,
"learning_rate": 5.87655029499542e-07,
"loss": 0.2364,
"reward": 0.8294338285923004,
"reward_std": 0.9452664703130722,
"rewards/cosine_scaled_reward": 0.29666137136518955,
"rewards/format_reward": 0.2361111156642437,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 344.6111030578613,
"epoch": 0.22626955217484465,
"grad_norm": 3.6436305046081543,
"kl": 1.2177734375,
"learning_rate": 5.845235626570683e-07,
"loss": 0.2119,
"reward": 0.9100049883127213,
"reward_std": 0.9092362821102142,
"rewards/cosine_scaled_reward": 0.2744469365570694,
"rewards/format_reward": 0.3611111156642437,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 397.94445037841797,
"epoch": 0.2271266338118706,
"grad_norm": 4.111161231994629,
"kl": 1.228515625,
"learning_rate": 5.813904131848564e-07,
"loss": 0.3083,
"reward": 0.4675387665629387,
"reward_std": 0.7649587690830231,
"rewards/cosine_scaled_reward": 0.025436056777834892,
"rewards/format_reward": 0.4166666716337204,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 321.83333587646484,
"epoch": 0.2279837154488965,
"grad_norm": 4.648217678070068,
"kl": 1.2587890625,
"learning_rate": 5.78255733788191e-07,
"loss": 0.2023,
"reward": 0.773270171135664,
"reward_std": 0.7938476204872131,
"rewards/cosine_scaled_reward": 0.18524618819355965,
"rewards/format_reward": 0.4027777798473835,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 437.41666412353516,
"epoch": 0.22884079708592242,
"grad_norm": 5.6152472496032715,
"kl": 1.37109375,
"learning_rate": 5.751196772469237e-07,
"loss": 0.2184,
"reward": 0.5861554071307182,
"reward_std": 0.7004242539405823,
"rewards/cosine_scaled_reward": 0.09168882109224796,
"rewards/format_reward": 0.4027777910232544,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 274.86111068725586,
"epoch": 0.22969787872294836,
"grad_norm": 3.5720877647399902,
"kl": 1.1787109375,
"learning_rate": 5.71982396408026e-07,
"loss": 0.1723,
"reward": 0.8904364705085754,
"reward_std": 0.8545982241630554,
"rewards/cosine_scaled_reward": 0.22299600392580032,
"rewards/format_reward": 0.4444444477558136,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 296.2638931274414,
"epoch": 0.23055496035997428,
"grad_norm": 6.410920143127441,
"kl": 1.08203125,
"learning_rate": 5.688440441781398e-07,
"loss": 0.1769,
"reward": 0.7688810527324677,
"reward_std": 1.0249820053577423,
"rewards/cosine_scaled_reward": 0.17610719101503491,
"rewards/format_reward": 0.4166666716337204,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 297.98611068725586,
"epoch": 0.23141204199700022,
"grad_norm": 2.672879695892334,
"kl": 1.0966796875,
"learning_rate": 5.657047735161255e-07,
"loss": 0.2069,
"reward": 1.3364269733428955,
"reward_std": 0.8959543257951736,
"rewards/cosine_scaled_reward": 0.43210237100720406,
"rewards/format_reward": 0.4722222313284874,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 306.81945037841797,
"epoch": 0.23226912363402613,
"grad_norm": 4.001352787017822,
"kl": 1.1513671875,
"learning_rate": 5.625647374256061e-07,
"loss": 0.1775,
"reward": 0.9076458215713501,
"reward_std": 0.803716853260994,
"rewards/cosine_scaled_reward": 0.25243401899933815,
"rewards/format_reward": 0.4027777835726738,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 319.7638931274414,
"epoch": 0.23312620527105207,
"grad_norm": 10.995331764221191,
"kl": 1.0615234375,
"learning_rate": 5.594240889475106e-07,
"loss": 0.3224,
"reward": 0.5522180162370205,
"reward_std": 0.7163975164294243,
"rewards/cosine_scaled_reward": 0.10944235138595104,
"rewards/format_reward": 0.3333333432674408,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 363.5277786254883,
"epoch": 0.233983286908078,
"grad_norm": 2.6429786682128906,
"kl": 1.224609375,
"learning_rate": 5.562829811526154e-07,
"loss": 0.2456,
"reward": 0.4188144411891699,
"reward_std": 0.6072976887226105,
"rewards/cosine_scaled_reward": 0.07746277935802937,
"rewards/format_reward": 0.2638888955116272,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 360.7361145019531,
"epoch": 0.23484036854510393,
"grad_norm": 2.7255380153656006,
"kl": 1.240234375,
"learning_rate": 5.531415671340826e-07,
"loss": 0.1997,
"reward": 1.1216635033488274,
"reward_std": 0.7341399192810059,
"rewards/cosine_scaled_reward": 0.3524984158575535,
"rewards/format_reward": 0.4166666716337204,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 351.9027862548828,
"epoch": 0.23569745018212984,
"grad_norm": 3.7585530281066895,
"kl": 1.185546875,
"learning_rate": 5.5e-07,
"loss": 0.2149,
"reward": 0.8286983985453844,
"reward_std": 0.8109488189220428,
"rewards/cosine_scaled_reward": 0.21990476548671722,
"rewards/format_reward": 0.3888888955116272,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 362.55555725097656,
"epoch": 0.23655453181915578,
"grad_norm": 5.316573143005371,
"kl": 1.419921875,
"learning_rate": 5.468584328659172e-07,
"loss": 0.2454,
"reward": 0.7834634706377983,
"reward_std": 0.8923482447862625,
"rewards/cosine_scaled_reward": 0.19728727941401303,
"rewards/format_reward": 0.3888888917863369,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 474.31945037841797,
"epoch": 0.2374116134561817,
"grad_norm": 7.202195167541504,
"kl": 1.765625,
"learning_rate": 5.437170188473847e-07,
"loss": 0.3221,
"reward": 0.7273948639631271,
"reward_std": 0.7511462718248367,
"rewards/cosine_scaled_reward": 0.11369742685928941,
"rewards/format_reward": 0.5000000074505806,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 371.7638931274414,
"epoch": 0.23826869509320764,
"grad_norm": 7.068774700164795,
"kl": 1.662109375,
"learning_rate": 5.405759110524894e-07,
"loss": 0.1665,
"reward": 1.2386417984962463,
"reward_std": 0.9174100756645203,
"rewards/cosine_scaled_reward": 0.3901542127132416,
"rewards/format_reward": 0.4583333432674408,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 349.6527862548828,
"epoch": 0.23912577673023355,
"grad_norm": 11.34634017944336,
"kl": 1.658203125,
"learning_rate": 5.37435262574394e-07,
"loss": 0.1887,
"reward": 0.492940915748477,
"reward_std": 0.721496045589447,
"rewards/cosine_scaled_reward": 0.10063712205737829,
"rewards/format_reward": 0.2916666716337204,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 480.2222366333008,
"epoch": 0.2399828583672595,
"grad_norm": 3.518911838531494,
"kl": 1.3828125,
"learning_rate": 5.342952264838747e-07,
"loss": 0.3564,
"reward": 0.39445267990231514,
"reward_std": 0.733232319355011,
"rewards/cosine_scaled_reward": 0.0027818959206342697,
"rewards/format_reward": 0.3888888955116272,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 372.7361068725586,
"epoch": 0.2408399400042854,
"grad_norm": 5.535303115844727,
"kl": 1.59765625,
"learning_rate": 5.311559558218603e-07,
"loss": 0.1378,
"reward": 0.5733058899641037,
"reward_std": 0.8472346812486649,
"rewards/cosine_scaled_reward": 0.140819625928998,
"rewards/format_reward": 0.2916666669771075,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 415.26390075683594,
"epoch": 0.24169702164131132,
"grad_norm": 5.782970428466797,
"kl": 1.326171875,
"learning_rate": 5.28017603591974e-07,
"loss": 0.1165,
"reward": 0.8404653370380402,
"reward_std": 0.8565979599952698,
"rewards/cosine_scaled_reward": 0.19801045581698418,
"rewards/format_reward": 0.4444444552063942,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 427.9861068725586,
"epoch": 0.24255410327833726,
"grad_norm": 6.241755962371826,
"kl": 1.23046875,
"learning_rate": 5.248803227530763e-07,
"loss": 0.1899,
"reward": 1.1734114736318588,
"reward_std": 1.0097443908452988,
"rewards/cosine_scaled_reward": 0.3575390987098217,
"rewards/format_reward": 0.4583333432674408,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 457.9027786254883,
"epoch": 0.24341118491536318,
"grad_norm": 12.400771141052246,
"kl": 1.1044921875,
"learning_rate": 5.21744266211809e-07,
"loss": 0.2043,
"reward": 0.6685996502637863,
"reward_std": 0.896918535232544,
"rewards/cosine_scaled_reward": 0.15374425239861012,
"rewards/format_reward": 0.361111119389534,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 403.19444274902344,
"epoch": 0.24426826655238912,
"grad_norm": 5.228241443634033,
"kl": 1.513671875,
"learning_rate": 5.186095868151436e-07,
"loss": 0.2219,
"reward": 0.6613360345363617,
"reward_std": 0.8527265787124634,
"rewards/cosine_scaled_reward": 0.14316802099347115,
"rewards/format_reward": 0.3750000074505806,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 465.8333435058594,
"epoch": 0.24512534818941503,
"grad_norm": 9.96542739868164,
"kl": 1.0654296875,
"learning_rate": 5.154764373429315e-07,
"loss": 0.1583,
"reward": 0.8348981812596321,
"reward_std": 0.9325527995824814,
"rewards/cosine_scaled_reward": 0.21606018114835024,
"rewards/format_reward": 0.4027777835726738,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 372.5972366333008,
"epoch": 0.24598242982644097,
"grad_norm": 12.640274047851562,
"kl": 1.1533203125,
"learning_rate": 5.123449705004581e-07,
"loss": 0.122,
"reward": 1.0775522887706757,
"reward_std": 0.8368659615516663,
"rewards/cosine_scaled_reward": 0.30266502872109413,
"rewards/format_reward": 0.4722222313284874,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 478.50000762939453,
"epoch": 0.2468395114634669,
"grad_norm": 101.3699951171875,
"kl": 1.58984375,
"learning_rate": 5.09215338910999e-07,
"loss": 0.1448,
"reward": 0.5251022726297379,
"reward_std": 0.7163691967725754,
"rewards/cosine_scaled_reward": 0.04032891429960728,
"rewards/format_reward": 0.4444444477558136,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 425.20833587646484,
"epoch": 0.24769659310049283,
"grad_norm": 14.3350830078125,
"kl": 1.1904296875,
"learning_rate": 5.060876951083828e-07,
"loss": 0.1997,
"reward": 1.171989917755127,
"reward_std": 0.9945340603590012,
"rewards/cosine_scaled_reward": 0.3637727275490761,
"rewards/format_reward": 0.4444444477558136,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 374.87500762939453,
"epoch": 0.24855367473751874,
"grad_norm": 7.5819315910339355,
"kl": 1.0908203125,
"learning_rate": 5.02962191529556e-07,
"loss": 0.1346,
"reward": 1.154215730726719,
"reward_std": 0.894601583480835,
"rewards/cosine_scaled_reward": 0.3965523011283949,
"rewards/format_reward": 0.3611111231148243,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 406.93055725097656,
"epoch": 0.24941075637454468,
"grad_norm": 14.51109504699707,
"kl": 1.333984375,
"learning_rate": 4.998389805071536e-07,
"loss": 0.3856,
"reward": 1.0358281284570694,
"reward_std": 0.996618315577507,
"rewards/cosine_scaled_reward": 0.25402514450252056,
"rewards/format_reward": 0.527777798473835,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 410.00000762939453,
"epoch": 0.2502678380115706,
"grad_norm": 12.098337173461914,
"kl": 1.232421875,
"learning_rate": 4.967182142620745e-07,
"loss": 0.1032,
"reward": 0.9677926301956177,
"reward_std": 0.873188391327858,
"rewards/cosine_scaled_reward": 0.26861853525042534,
"rewards/format_reward": 0.4305555671453476,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 440.4722213745117,
"epoch": 0.2511249196485965,
"grad_norm": 682.6365966796875,
"kl": 2.2578125,
"learning_rate": 4.93600044896063e-07,
"loss": 0.3027,
"reward": 0.7292786613106728,
"reward_std": 0.7885087877511978,
"rewards/cosine_scaled_reward": 0.23963932693004608,
"rewards/format_reward": 0.2500000037252903,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 483.08333587646484,
"epoch": 0.2519820012856225,
"grad_norm": 5.502435684204102,
"kl": 1.58203125,
"learning_rate": 4.904846243842949e-07,
"loss": 0.2492,
"reward": 0.9282772243022919,
"reward_std": 1.0097183585166931,
"rewards/cosine_scaled_reward": 0.24886082112789154,
"rewards/format_reward": 0.4305555634200573,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 363.9305648803711,
"epoch": 0.2528390829226484,
"grad_norm": 12.606801986694336,
"kl": 1.716796875,
"learning_rate": 4.873721045679706e-07,
"loss": 0.2035,
"reward": 0.6285464763641357,
"reward_std": 0.8090884387493134,
"rewards/cosine_scaled_reward": 0.1406621327623725,
"rewards/format_reward": 0.3472222276031971,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 444.0555648803711,
"epoch": 0.2536961645596743,
"grad_norm": 8.756891250610352,
"kl": 1.45703125,
"learning_rate": 4.842626371469149e-07,
"loss": 0.2407,
"reward": 0.6327312793582678,
"reward_std": 0.776206910610199,
"rewards/cosine_scaled_reward": 0.10803230293095112,
"rewards/format_reward": 0.416666679084301,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 396.19445037841797,
"epoch": 0.2545532461967002,
"grad_norm": 4.2702484130859375,
"kl": 1.763671875,
"learning_rate": 4.811563736721829e-07,
"loss": 0.2519,
"reward": 0.718172661960125,
"reward_std": 0.9500904381275177,
"rewards/cosine_scaled_reward": 0.16464189253747463,
"rewards/format_reward": 0.3888888917863369,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 400.20833587646484,
"epoch": 0.25541032783372614,
"grad_norm": 16.376901626586914,
"kl": 1.56640625,
"learning_rate": 4.780534655386743e-07,
"loss": 0.1925,
"reward": 0.9078701715916395,
"reward_std": 0.7346006631851196,
"rewards/cosine_scaled_reward": 0.26643507555127144,
"rewards/format_reward": 0.3750000149011612,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 424.01390075683594,
"epoch": 0.2562674094707521,
"grad_norm": 41.33639907836914,
"kl": 1.951171875,
"learning_rate": 4.749540639777539e-07,
"loss": 0.2305,
"reward": 0.9393416047096252,
"reward_std": 1.0011892914772034,
"rewards/cosine_scaled_reward": 0.24050412327051163,
"rewards/format_reward": 0.4583333432674408,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 414.84722900390625,
"epoch": 0.257124491107778,
"grad_norm": 6.579893589019775,
"kl": 1.59375,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.2066,
"reward": 0.9773926436901093,
"reward_std": 0.8585023283958435,
"rewards/cosine_scaled_reward": 0.3150852136313915,
"rewards/format_reward": 0.3472222350537777,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 328.0416717529297,
"epoch": 0.25798157274480393,
"grad_norm": 7.188145160675049,
"kl": 2.0,
"learning_rate": 4.68766384637248e-07,
"loss": 0.263,
"reward": 0.7739622257649899,
"reward_std": 0.8753332197666168,
"rewards/cosine_scaled_reward": 0.1855922369286418,
"rewards/format_reward": 0.4027777835726738,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 413.6527862548828,
"epoch": 0.25883865438182985,
"grad_norm": 20.148056030273438,
"kl": 1.46484375,
"learning_rate": 4.656784084364238e-07,
"loss": 0.2093,
"reward": 0.8456357046961784,
"reward_std": 0.8055497854948044,
"rewards/cosine_scaled_reward": 0.23531784676015377,
"rewards/format_reward": 0.3750000046566129,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 435.4027786254883,
"epoch": 0.2596957360188558,
"grad_norm": 6.98222541809082,
"kl": 1.439453125,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.1817,
"reward": 0.7380593828856945,
"reward_std": 0.6477810889482498,
"rewards/cosine_scaled_reward": 0.20236299559473991,
"rewards/format_reward": 0.3333333432674408,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 403.20833587646484,
"epoch": 0.26055281765588173,
"grad_norm": 10.506485939025879,
"kl": 1.654296875,
"learning_rate": 4.59514935484316e-07,
"loss": 0.2068,
"reward": 0.6883874237537384,
"reward_std": 0.8970037549734116,
"rewards/cosine_scaled_reward": 0.1566937081515789,
"rewards/format_reward": 0.37500000186264515,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 393.63890075683594,
"epoch": 0.26140989929290764,
"grad_norm": 31.882427215576172,
"kl": 1.412109375,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.1912,
"reward": 0.833147831261158,
"reward_std": 0.7371143400669098,
"rewards/cosine_scaled_reward": 0.19435168150812387,
"rewards/format_reward": 0.4444444477558136,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 413.1111145019531,
"epoch": 0.26226698092993356,
"grad_norm": 12.78693962097168,
"kl": 1.515625,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0471,
"reward": 0.6375277414917946,
"reward_std": 0.733474999666214,
"rewards/cosine_scaled_reward": 0.09654165129177272,
"rewards/format_reward": 0.4444444477558136,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 447.50001525878906,
"epoch": 0.2631240625669595,
"grad_norm": 117.86434936523438,
"kl": 2.7265625,
"learning_rate": 4.503031760712397e-07,
"loss": 0.2281,
"reward": 0.7174494117498398,
"reward_std": 0.8096088320016861,
"rewards/cosine_scaled_reward": 0.13650248385965824,
"rewards/format_reward": 0.4444444589316845,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 452.8333435058594,
"epoch": 0.26398114420398544,
"grad_norm": 10.973830223083496,
"kl": 1.271484375,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.2238,
"reward": 0.7689935564994812,
"reward_std": 0.8074923604726791,
"rewards/cosine_scaled_reward": 0.1761634573340416,
"rewards/format_reward": 0.4166666716337204,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 439.8472213745117,
"epoch": 0.26483822584101135,
"grad_norm": 6.3766188621521,
"kl": 1.54296875,
"learning_rate": 4.441860491038345e-07,
"loss": 0.209,
"reward": 0.5929550379514694,
"reward_std": 0.6037983000278473,
"rewards/cosine_scaled_reward": 0.07425528764724731,
"rewards/format_reward": 0.4444444440305233,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 418.65277099609375,
"epoch": 0.26569530747803727,
"grad_norm": 21.236345291137695,
"kl": 1.384765625,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.1979,
"reward": 0.9790968149900436,
"reward_std": 0.7783628851175308,
"rewards/cosine_scaled_reward": 0.2881595455110073,
"rewards/format_reward": 0.4027777872979641,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 450.2222213745117,
"epoch": 0.26655238911506324,
"grad_norm": 14.473074913024902,
"kl": 1.33203125,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.2879,
"reward": 1.0430985651910305,
"reward_std": 0.8672375828027725,
"rewards/cosine_scaled_reward": 0.31321592442691326,
"rewards/format_reward": 0.416666679084301,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 471.1111145019531,
"epoch": 0.26740947075208915,
"grad_norm": 9.259844779968262,
"kl": 1.51171875,
"learning_rate": 4.350494089288943e-07,
"loss": 0.2339,
"reward": 0.9496155381202698,
"reward_std": 0.910922110080719,
"rewards/cosine_scaled_reward": 0.2803633138537407,
"rewards/format_reward": 0.3888888917863369,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 445.06945037841797,
"epoch": 0.26826655238911506,
"grad_norm": 4.26533317565918,
"kl": 1.58203125,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.2456,
"reward": 1.2665941417217255,
"reward_std": 0.9569890201091766,
"rewards/cosine_scaled_reward": 0.43190818652510643,
"rewards/format_reward": 0.4027777835726738,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 392.7777862548828,
"epoch": 0.269123634026141,
"grad_norm": 7.616832256317139,
"kl": 1.375,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.1271,
"reward": 1.1350777596235275,
"reward_std": 0.9450895041227341,
"rewards/cosine_scaled_reward": 0.35920554026961327,
"rewards/format_reward": 0.4166666716337204,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 440.08333587646484,
"epoch": 0.2699807156631669,
"grad_norm": 12.737751007080078,
"kl": 1.46484375,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.2746,
"reward": 0.6033434271812439,
"reward_std": 0.7901871353387833,
"rewards/cosine_scaled_reward": 0.1072272639721632,
"rewards/format_reward": 0.3888889029622078,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 423.3611145019531,
"epoch": 0.27083779730019286,
"grad_norm": 34.6724853515625,
"kl": 1.4873046875,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.3012,
"reward": 0.8819021135568619,
"reward_std": 0.6502636596560478,
"rewards/cosine_scaled_reward": 0.30206217616796494,
"rewards/format_reward": 0.2777777872979641,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 409.56944274902344,
"epoch": 0.2716948789372188,
"grad_norm": 8.627520561218262,
"kl": 1.564453125,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.1536,
"reward": 0.9121913909912109,
"reward_std": 0.8483704626560211,
"rewards/cosine_scaled_reward": 0.2894290406256914,
"rewards/format_reward": 0.3333333358168602,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 413.6111145019531,
"epoch": 0.2725519605742447,
"grad_norm": 2.8996214866638184,
"kl": 1.509765625,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.1049,
"reward": 1.1796189993619919,
"reward_std": 0.8427684605121613,
"rewards/cosine_scaled_reward": 0.31897614523768425,
"rewards/format_reward": 0.5416666716337204,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 416.55555725097656,
"epoch": 0.2734090422112706,
"grad_norm": 10.151158332824707,
"kl": 1.458984375,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.2491,
"reward": 0.6920264512300491,
"reward_std": 0.6206417083740234,
"rewards/cosine_scaled_reward": 0.13767989072948694,
"rewards/format_reward": 0.4166666679084301,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 384.05555725097656,
"epoch": 0.2742661238482966,
"grad_norm": 5.853416442871094,
"kl": 1.333984375,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.1025,
"reward": 1.3231835961341858,
"reward_std": 0.8811145946383476,
"rewards/cosine_scaled_reward": 0.4463140070438385,
"rewards/format_reward": 0.4305555559694767,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 438.59722900390625,
"epoch": 0.2751232054853225,
"grad_norm": 24.13959503173828,
"kl": 1.798828125,
"learning_rate": 4.079579333738039e-07,
"loss": 0.1857,
"reward": 0.7677492424845695,
"reward_std": 0.8653182983398438,
"rewards/cosine_scaled_reward": 0.16859683208167553,
"rewards/format_reward": 0.430555559694767,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 419.22222900390625,
"epoch": 0.2759802871223484,
"grad_norm": 25.10494041442871,
"kl": 1.732421875,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.2985,
"reward": 0.5208378061652184,
"reward_std": 0.7815151214599609,
"rewards/cosine_scaled_reward": 0.08680777484551072,
"rewards/format_reward": 0.3472222350537777,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 458.25,
"epoch": 0.2768373687593743,
"grad_norm": 6.803804874420166,
"kl": 1.67578125,
"learning_rate": 4.020100089676376e-07,
"loss": 0.1707,
"reward": 0.8683362007141113,
"reward_std": 0.8737305179238319,
"rewards/cosine_scaled_reward": 0.13555700704455376,
"rewards/format_reward": 0.5972222238779068,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 390.7361145019531,
"epoch": 0.2776944503964003,
"grad_norm": 6.555212020874023,
"kl": 1.5546875,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0673,
"reward": 0.8696636259555817,
"reward_std": 0.8899157643318176,
"rewards/cosine_scaled_reward": 0.19872068613767624,
"rewards/format_reward": 0.4722222238779068,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 506.33333587646484,
"epoch": 0.2785515320334262,
"grad_norm": 10.4780912399292,
"kl": 1.701171875,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.1414,
"reward": 0.8947531227022409,
"reward_std": 1.0458511114120483,
"rewards/cosine_scaled_reward": 0.24598768074065447,
"rewards/format_reward": 0.4027777872979641,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 389.875,
"epoch": 0.2794086136704521,
"grad_norm": 8.659296989440918,
"kl": 1.732421875,
"learning_rate": 3.931425787051832e-07,
"loss": 0.2063,
"reward": 0.7398964213207364,
"reward_std": 0.8089132308959961,
"rewards/cosine_scaled_reward": 0.13383711129426956,
"rewards/format_reward": 0.4722222238779068,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 448.20833587646484,
"epoch": 0.280265695307478,
"grad_norm": 19.596874237060547,
"kl": 1.962890625,
"learning_rate": 3.902018669163384e-07,
"loss": 0.1572,
"reward": 0.8671465888619423,
"reward_std": 0.915204182267189,
"rewards/cosine_scaled_reward": 0.21135106589645147,
"rewards/format_reward": 0.4444444552063942,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 391.08333587646484,
"epoch": 0.28112277694450394,
"grad_norm": 32.9877815246582,
"kl": 1.431640625,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0696,
"reward": 1.0604142509400845,
"reward_std": 0.8675804287195206,
"rewards/cosine_scaled_reward": 0.30104043427854776,
"rewards/format_reward": 0.4583333432674408,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 465.2083435058594,
"epoch": 0.2819798585815299,
"grad_norm": 13.628067016601562,
"kl": 1.689453125,
"learning_rate": 3.843439512918949e-07,
"loss": 0.1891,
"reward": 0.7241683751344681,
"reward_std": 0.7928906679153442,
"rewards/cosine_scaled_reward": 0.13291750941425562,
"rewards/format_reward": 0.4583333358168602,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 462.87500762939453,
"epoch": 0.2828369402185558,
"grad_norm": 6.838570594787598,
"kl": 1.494140625,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.1692,
"reward": 0.9220460206270218,
"reward_std": 0.6238923817873001,
"rewards/cosine_scaled_reward": 0.25268966890871525,
"rewards/format_reward": 0.416666679084301,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 473.2777862548828,
"epoch": 0.28369402185558174,
"grad_norm": 8.527922630310059,
"kl": 1.451171875,
"learning_rate": 3.785183306423767e-07,
"loss": 0.2612,
"reward": 0.7746013253927231,
"reward_std": 0.6590248346328735,
"rewards/cosine_scaled_reward": 0.21368957962840796,
"rewards/format_reward": 0.3472222276031971,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 401.19444274902344,
"epoch": 0.28455110349260765,
"grad_norm": 18.457897186279297,
"kl": 1.564453125,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.2324,
"reward": 1.1048437356948853,
"reward_std": 0.8566301316022873,
"rewards/cosine_scaled_reward": 0.25381074473261833,
"rewards/format_reward": 0.5972222238779068,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 496.69446563720703,
"epoch": 0.2854081851296336,
"grad_norm": 10.751416206359863,
"kl": 1.69921875,
"learning_rate": 3.72726140684072e-07,
"loss": 0.1604,
"reward": 0.9123432487249374,
"reward_std": 0.9888466447591782,
"rewards/cosine_scaled_reward": 0.22700495785102248,
"rewards/format_reward": 0.4583333432674408,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 370.75000762939453,
"epoch": 0.28626526676665953,
"grad_norm": 4.947657108306885,
"kl": 1.51171875,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.1946,
"reward": 0.3965581804513931,
"reward_std": 0.7018196731805801,
"rewards/cosine_scaled_reward": 0.010779092612210661,
"rewards/format_reward": 0.3750000074505806,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 417.25,
"epoch": 0.28712234840368545,
"grad_norm": 11.954354286193848,
"kl": 1.294921875,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0114,
"reward": 1.05374076962471,
"reward_std": 0.8027269691228867,
"rewards/cosine_scaled_reward": 0.24909262219443917,
"rewards/format_reward": 0.5555555671453476,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 504.69445037841797,
"epoch": 0.28797943004071136,
"grad_norm": 7.7731170654296875,
"kl": 1.669921875,
"learning_rate": 3.641030065789562e-07,
"loss": 0.2419,
"reward": 0.8747316524386406,
"reward_std": 0.8644589632749557,
"rewards/cosine_scaled_reward": 0.1943102532532066,
"rewards/format_reward": 0.486111119389534,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 363.2777862548828,
"epoch": 0.28883651167773733,
"grad_norm": 21.512271881103516,
"kl": 1.5693359375,
"learning_rate": 3.612465628992203e-07,
"loss": 0.2896,
"reward": 1.566197782754898,
"reward_std": 0.9783513993024826,
"rewards/cosine_scaled_reward": 0.49837667867541313,
"rewards/format_reward": 0.5694444552063942,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 384.47222900390625,
"epoch": 0.28969359331476324,
"grad_norm": 10.973138809204102,
"kl": 1.48828125,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.1953,
"reward": 0.8962609972804785,
"reward_std": 0.9198465496301651,
"rewards/cosine_scaled_reward": 0.19118605181574821,
"rewards/format_reward": 0.5138888880610466,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 423.45833587646484,
"epoch": 0.29055067495178916,
"grad_norm": 22.09733772277832,
"kl": 1.876953125,
"learning_rate": 3.555614130391079e-07,
"loss": 0.347,
"reward": 0.811115313321352,
"reward_std": 0.8457043170928955,
"rewards/cosine_scaled_reward": 0.1625020916108042,
"rewards/format_reward": 0.4861111231148243,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 423.72222900390625,
"epoch": 0.29140775658881507,
"grad_norm": 22.564533233642578,
"kl": 1.546875,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.2976,
"reward": 0.8002185821533203,
"reward_std": 0.8067903742194176,
"rewards/cosine_scaled_reward": 0.15705375373363495,
"rewards/format_reward": 0.4861111119389534,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 365.91666412353516,
"epoch": 0.292264838225841,
"grad_norm": 8.0919828414917,
"kl": 1.380859375,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.2234,
"reward": 1.0582543164491653,
"reward_std": 0.700420930981636,
"rewards/cosine_scaled_reward": 0.2721826871857047,
"rewards/format_reward": 0.5138888955116272,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 396.5555648803711,
"epoch": 0.29312191986286695,
"grad_norm": 13.28870964050293,
"kl": 1.693359375,
"learning_rate": 3.471051066897562e-07,
"loss": 0.2345,
"reward": 0.9433440640568733,
"reward_std": 0.8704208433628082,
"rewards/cosine_scaled_reward": 0.2772275973111391,
"rewards/format_reward": 0.3888888955116272,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 425.625,
"epoch": 0.29397900149989287,
"grad_norm": 102.71472930908203,
"kl": 2.208984375,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.2262,
"reward": 0.8235329911112785,
"reward_std": 0.5788537338376045,
"rewards/cosine_scaled_reward": 0.09926649276167154,
"rewards/format_reward": 0.6250000149011612,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 365.58333587646484,
"epoch": 0.2948360831369188,
"grad_norm": 15.007022857666016,
"kl": 1.810546875,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.108,
"reward": 0.8345845490694046,
"reward_std": 0.8136800527572632,
"rewards/cosine_scaled_reward": 0.1950700655579567,
"rewards/format_reward": 0.4444444552063942,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 390.6527862548828,
"epoch": 0.2956931647739447,
"grad_norm": 17.396751403808594,
"kl": 1.453125,
"learning_rate": 3.387377967463493e-07,
"loss": 0.1139,
"reward": 1.2658544778823853,
"reward_std": 0.8507587239146233,
"rewards/cosine_scaled_reward": 0.36903833597898483,
"rewards/format_reward": 0.5277777910232544,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 380.2777786254883,
"epoch": 0.29655024641097066,
"grad_norm": 6.361865043640137,
"kl": 1.896484375,
"learning_rate": 3.359691059183761e-07,
"loss": 0.2842,
"reward": 0.7403211258351803,
"reward_std": 0.8092114925384521,
"rewards/cosine_scaled_reward": 0.2034938931465149,
"rewards/format_reward": 0.3333333395421505,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 401.58333587646484,
"epoch": 0.2974073280479966,
"grad_norm": 4.304242134094238,
"kl": 1.55078125,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.1731,
"reward": 1.124197095632553,
"reward_std": 0.6386073157191277,
"rewards/cosine_scaled_reward": 0.31209855526685715,
"rewards/format_reward": 0.5000000223517418,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 389.22222900390625,
"epoch": 0.2982644096850225,
"grad_norm": 5.652775764465332,
"kl": 2.107421875,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.3405,
"reward": 0.4380467850714922,
"reward_std": 0.7566511631011963,
"rewards/cosine_scaled_reward": -0.003198828548192978,
"rewards/format_reward": 0.4444444477558136,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 382.8611145019531,
"epoch": 0.2991214913220484,
"grad_norm": 139.029052734375,
"kl": 2.10546875,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.2329,
"reward": 0.886528730392456,
"reward_std": 0.9427484571933746,
"rewards/cosine_scaled_reward": 0.22798660211265087,
"rewards/format_reward": 0.430555559694767,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 378.08333587646484,
"epoch": 0.2999785729590744,
"grad_norm": 273.4222412109375,
"kl": 2.13671875,
"learning_rate": 3.250000000000001e-07,
"loss": 0.223,
"reward": 1.0541678816080093,
"reward_std": 0.9911145269870758,
"rewards/cosine_scaled_reward": 0.3118061521090567,
"rewards/format_reward": 0.430555559694767,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 435.625,
"epoch": 0.3008356545961003,
"grad_norm": 5.59974479675293,
"kl": 1.93359375,
"learning_rate": 3.222848061454764e-07,
"loss": 0.2827,
"reward": 0.46341075748205185,
"reward_std": 0.7887972742319107,
"rewards/cosine_scaled_reward": 0.030316500924527645,
"rewards/format_reward": 0.4027777835726738,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 499.12500762939453,
"epoch": 0.3016927362331262,
"grad_norm": 8.170894622802734,
"kl": 1.8984375,
"learning_rate": 3.195807108082429e-07,
"loss": 0.246,
"reward": 0.7659785971045494,
"reward_std": 0.865535780787468,
"rewards/cosine_scaled_reward": 0.1607670597732067,
"rewards/format_reward": 0.4444444626569748,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 412.75001525878906,
"epoch": 0.3025498178701521,
"grad_norm": 4.896013259887695,
"kl": 1.9609375,
"learning_rate": 3.168878457820915e-07,
"loss": 0.2717,
"reward": 0.4919071840122342,
"reward_std": 0.5877289474010468,
"rewards/cosine_scaled_reward": 0.10012026876211166,
"rewards/format_reward": 0.29166666977107525,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 444.3055648803711,
"epoch": 0.3034068995071781,
"grad_norm": 843.45703125,
"kl": 7.69921875,
"learning_rate": 3.142063423134644e-07,
"loss": 0.3878,
"reward": 0.47469986602663994,
"reward_std": 0.6482968628406525,
"rewards/cosine_scaled_reward": 0.06373882107436657,
"rewards/format_reward": 0.3472222313284874,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 508.8055648803711,
"epoch": 0.304263981144204,
"grad_norm": 610.3280639648438,
"kl": 2.517578125,
"learning_rate": 3.115363310950578e-07,
"loss": 0.3122,
"reward": 0.9253015741705894,
"reward_std": 0.9372780025005341,
"rewards/cosine_scaled_reward": 0.19876189157366753,
"rewards/format_reward": 0.5277777835726738,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 451.4861145019531,
"epoch": 0.3051210627812299,
"grad_norm": 6.685305595397949,
"kl": 1.939453125,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.2488,
"reward": 1.0591753125190735,
"reward_std": 0.9159562736749649,
"rewards/cosine_scaled_reward": 0.27958764508366585,
"rewards/format_reward": 0.5,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 386.027774810791,
"epoch": 0.3059781444182558,
"grad_norm": 75.1882095336914,
"kl": 3.021484375,
"learning_rate": 3.062313053727671e-07,
"loss": 0.158,
"reward": 0.9487558901309967,
"reward_std": 0.8189297467470169,
"rewards/cosine_scaled_reward": 0.2591001633554697,
"rewards/format_reward": 0.430555559694767,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 420.33333587646484,
"epoch": 0.30683522605528174,
"grad_norm": 588.209228515625,
"kl": 2.5234375,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.196,
"reward": 0.9485956132411957,
"reward_std": 0.8210533708333969,
"rewards/cosine_scaled_reward": 0.21735336817801,
"rewards/format_reward": 0.5138888955116272,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 446.76390075683594,
"epoch": 0.3076923076923077,
"grad_norm": 128.001220703125,
"kl": 2.671875,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.1816,
"reward": 1.0116847660392523,
"reward_std": 0.8659616261720657,
"rewards/cosine_scaled_reward": 0.24889790453016758,
"rewards/format_reward": 0.5138888992369175,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 423.2777862548828,
"epoch": 0.3085493893293336,
"grad_norm": 13.23314380645752,
"kl": 2.044921875,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.1871,
"reward": 1.3461374938488007,
"reward_std": 0.9391425997018814,
"rewards/cosine_scaled_reward": 0.4022354434709996,
"rewards/format_reward": 0.5416666641831398,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 433.63890075683594,
"epoch": 0.30940647096635954,
"grad_norm": 232.4365692138672,
"kl": 2.478515625,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.2607,
"reward": 1.047914907336235,
"reward_std": 0.9106916189193726,
"rewards/cosine_scaled_reward": 0.23229077784344554,
"rewards/format_reward": 0.5833333432674408,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 414.6527862548828,
"epoch": 0.31026355260338545,
"grad_norm": 125.23755645751953,
"kl": 2.080078125,
"learning_rate": 2.931788945420058e-07,
"loss": 0.149,
"reward": 0.834898516535759,
"reward_std": 0.7312210351228714,
"rewards/cosine_scaled_reward": 0.1605048067867756,
"rewards/format_reward": 0.5138888955116272,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 435.12499237060547,
"epoch": 0.3111206342404114,
"grad_norm": 6.104694366455078,
"kl": 1.779296875,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.2289,
"reward": 1.3437991440296173,
"reward_std": 0.8975027948617935,
"rewards/cosine_scaled_reward": 0.38023288547992706,
"rewards/format_reward": 0.5833333432674408,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 409.8472213745117,
"epoch": 0.31197771587743733,
"grad_norm": 24.235681533813477,
"kl": 2.080078125,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.1818,
"reward": 0.9619172001257539,
"reward_std": 0.8415810465812683,
"rewards/cosine_scaled_reward": 0.27956968918442726,
"rewards/format_reward": 0.4027777835726738,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 398.5138854980469,
"epoch": 0.31283479751446325,
"grad_norm": 11.025618553161621,
"kl": 2.248046875,
"learning_rate": 2.854966364683872e-07,
"loss": 0.1973,
"reward": 0.7603890486061573,
"reward_std": 0.8526364490389824,
"rewards/cosine_scaled_reward": 0.1857500895857811,
"rewards/format_reward": 0.3888888955116272,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 498.2777862548828,
"epoch": 0.31369187915148916,
"grad_norm": 65.14573669433594,
"kl": 2.064453125,
"learning_rate": 2.829615010283344e-07,
"loss": 0.1461,
"reward": 1.086868055164814,
"reward_std": 0.9657749831676483,
"rewards/cosine_scaled_reward": 0.32815628172829747,
"rewards/format_reward": 0.430555559694767,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 387.7777786254883,
"epoch": 0.31454896078851513,
"grad_norm": 50.92959976196289,
"kl": 2.1015625,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.1503,
"reward": 0.6913352087140083,
"reward_std": 0.7219003140926361,
"rewards/cosine_scaled_reward": 0.12344538388424553,
"rewards/format_reward": 0.4444444440305233,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 415.75000762939453,
"epoch": 0.31540604242554104,
"grad_norm": 23.864198684692383,
"kl": 1.697265625,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.2941,
"reward": 0.803162232041359,
"reward_std": 0.8274150788784027,
"rewards/cosine_scaled_reward": 0.1376922446070239,
"rewards/format_reward": 0.5277777910232544,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 404.37500762939453,
"epoch": 0.31626312406256696,
"grad_norm": 80.24060821533203,
"kl": 1.919921875,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.1397,
"reward": 0.9380350708961487,
"reward_std": 0.868830531835556,
"rewards/cosine_scaled_reward": 0.23290642350912094,
"rewards/format_reward": 0.4722222238779068,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 493.6666717529297,
"epoch": 0.31712020569959287,
"grad_norm": 16.302597045898438,
"kl": 1.53515625,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0377,
"reward": 1.3772657215595245,
"reward_std": 0.8640294969081879,
"rewards/cosine_scaled_reward": 0.40391062945127487,
"rewards/format_reward": 0.5694444552063942,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 444.6666717529297,
"epoch": 0.3179772873366188,
"grad_norm": 74.72409057617188,
"kl": 2.32421875,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.2057,
"reward": 0.564115053974092,
"reward_std": 0.7745189592242241,
"rewards/cosine_scaled_reward": 0.08066862914711237,
"rewards/format_reward": 0.4027777798473835,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 466.0277862548828,
"epoch": 0.31883436897364476,
"grad_norm": 23.615995407104492,
"kl": 2.126953125,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.1857,
"reward": 0.9031898975372314,
"reward_std": 0.8619142323732376,
"rewards/cosine_scaled_reward": 0.236317184753716,
"rewards/format_reward": 0.4305555671453476,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 438.3888854980469,
"epoch": 0.31969145061067067,
"grad_norm": 10.200531005859375,
"kl": 1.654296875,
"learning_rate": 2.655868138008171e-07,
"loss": 0.1653,
"reward": 0.8666345775127411,
"reward_std": 0.8320091515779495,
"rewards/cosine_scaled_reward": 0.19720618752762675,
"rewards/format_reward": 0.4722222238779068,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 409.47222900390625,
"epoch": 0.3205485322476966,
"grad_norm": 40.82743453979492,
"kl": 2.0703125,
"learning_rate": 2.631592046130896e-07,
"loss": 0.1313,
"reward": 1.2135090231895447,
"reward_std": 1.0808076113462448,
"rewards/cosine_scaled_reward": 0.36369897052645683,
"rewards/format_reward": 0.486111119389534,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 531.1389083862305,
"epoch": 0.3214056138847225,
"grad_norm": 20.503686904907227,
"kl": 2.048828125,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.1784,
"reward": 1.1541820913553238,
"reward_std": 1.0066500753164291,
"rewards/cosine_scaled_reward": 0.3826466426253319,
"rewards/format_reward": 0.3888888992369175,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 446.73612213134766,
"epoch": 0.32226269552174847,
"grad_norm": 36.28611373901367,
"kl": 1.435546875,
"learning_rate": 2.583460445215911e-07,
"loss": 0.141,
"reward": 1.155954971909523,
"reward_std": 0.9807834774255753,
"rewards/cosine_scaled_reward": 0.32103302888572216,
"rewards/format_reward": 0.5138889029622078,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 369.1388931274414,
"epoch": 0.3231197771587744,
"grad_norm": 21.888423919677734,
"kl": 2.341796875,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.158,
"reward": 0.8885826840996742,
"reward_std": 0.8598978072404861,
"rewards/cosine_scaled_reward": 0.19429135276004672,
"rewards/format_reward": 0.5,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 517.125,
"epoch": 0.3239768587958003,
"grad_norm": 19.566164016723633,
"kl": 1.60546875,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.1273,
"reward": 1.0485451221466064,
"reward_std": 1.0302338749170303,
"rewards/cosine_scaled_reward": 0.27427253872156143,
"rewards/format_reward": 0.5,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 408.81945037841797,
"epoch": 0.3248339404328262,
"grad_norm": 51.357086181640625,
"kl": 1.625,
"learning_rate": 2.512332043064913e-07,
"loss": 0.0661,
"reward": 1.2394737899303436,
"reward_std": 0.8347266316413879,
"rewards/cosine_scaled_reward": 0.3280702382326126,
"rewards/format_reward": 0.5833333358168602,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 449.50000762939453,
"epoch": 0.3256910220698522,
"grad_norm": 13.72800350189209,
"kl": 1.7578125,
"learning_rate": 2.488912271385139e-07,
"loss": 0.2142,
"reward": 1.0208731442689896,
"reward_std": 0.7508396059274673,
"rewards/cosine_scaled_reward": 0.2812698809430003,
"rewards/format_reward": 0.4583333358168602,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 406.4861145019531,
"epoch": 0.3265481037068781,
"grad_norm": 5.486995220184326,
"kl": 1.76953125,
"learning_rate": 2.465639255873246e-07,
"loss": 0.2384,
"reward": 0.863591693341732,
"reward_std": 0.80591781437397,
"rewards/cosine_scaled_reward": 0.13318472169339657,
"rewards/format_reward": 0.5972222313284874,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 413.5972213745117,
"epoch": 0.327405185343904,
"grad_norm": 12.793088912963867,
"kl": 1.849609375,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.0601,
"reward": 1.3274620473384857,
"reward_std": 0.9712315052747726,
"rewards/cosine_scaled_reward": 0.40678660944104195,
"rewards/format_reward": 0.5138888880610466,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 462.05555725097656,
"epoch": 0.3282622669809299,
"grad_norm": 25.417646408081055,
"kl": 1.63671875,
"learning_rate": 2.4195380233209006e-07,
"loss": -0.0034,
"reward": 0.7435576766729355,
"reward_std": 0.7562405988574028,
"rewards/cosine_scaled_reward": 0.17733439663425088,
"rewards/format_reward": 0.3888889029622078,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 393.62500762939453,
"epoch": 0.3291193486179559,
"grad_norm": 11.36495590209961,
"kl": 2.20703125,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.1751,
"reward": 0.8897156268358231,
"reward_std": 0.8956611603498459,
"rewards/cosine_scaled_reward": 0.20180223789066076,
"rewards/format_reward": 0.4861111119389534,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 415.2777862548828,
"epoch": 0.3299764302549818,
"grad_norm": 28.817380905151367,
"kl": 1.892578125,
"learning_rate": 2.374037332934512e-07,
"loss": 0.1319,
"reward": 1.1349451541900635,
"reward_std": 1.0257329195737839,
"rewards/cosine_scaled_reward": 0.3244170341640711,
"rewards/format_reward": 0.486111119389534,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 427.20833587646484,
"epoch": 0.3308335118920077,
"grad_norm": 62.827362060546875,
"kl": 1.46875,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0615,
"reward": 1.3169071674346924,
"reward_std": 0.8778738602995872,
"rewards/cosine_scaled_reward": 0.3667869158089161,
"rewards/format_reward": 0.5833333432674408,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 385.94444274902344,
"epoch": 0.33169059352903363,
"grad_norm": 29.661022186279297,
"kl": 1.708984375,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.0891,
"reward": 0.7852656096220016,
"reward_std": 0.7992514222860336,
"rewards/cosine_scaled_reward": 0.13568835996557027,
"rewards/format_reward": 0.5138888880610466,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 453.55555725097656,
"epoch": 0.33254767516605954,
"grad_norm": 5.4821882247924805,
"kl": 1.947265625,
"learning_rate": 2.306931685585657e-07,
"loss": 0.175,
"reward": 0.6292361579835415,
"reward_std": 0.8566596806049347,
"rewards/cosine_scaled_reward": 0.0993402823805809,
"rewards/format_reward": 0.430555559694767,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 467.68055725097656,
"epoch": 0.3334047568030855,
"grad_norm": 14.547080993652344,
"kl": 2.296875,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.2111,
"reward": 0.729412317276001,
"reward_std": 0.8567145764827728,
"rewards/cosine_scaled_reward": 0.1355394944548607,
"rewards/format_reward": 0.4583333432674408,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 424.0277786254883,
"epoch": 0.3342618384401114,
"grad_norm": 34.17932891845703,
"kl": 1.759765625,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.249,
"reward": 0.6682014372199774,
"reward_std": 0.7891437709331512,
"rewards/cosine_scaled_reward": 0.13271182030439377,
"rewards/format_reward": 0.4027777872979641,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 466.0416793823242,
"epoch": 0.33511892007713734,
"grad_norm": 79.28044891357422,
"kl": 2.046875,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.0428,
"reward": 0.4808058775961399,
"reward_std": 0.7752177119255066,
"rewards/cosine_scaled_reward": 0.04595848359167576,
"rewards/format_reward": 0.3888888955116272,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 482.4166793823242,
"epoch": 0.33597600171416325,
"grad_norm": 7.091101169586182,
"kl": 1.888671875,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.142,
"reward": 0.8153834193944931,
"reward_std": 0.9166710078716278,
"rewards/cosine_scaled_reward": 0.22019170981366187,
"rewards/format_reward": 0.3750000111758709,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 444.97222900390625,
"epoch": 0.3368330833511892,
"grad_norm": 5.4899396896362305,
"kl": 1.81640625,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.1499,
"reward": 0.9435096383094788,
"reward_std": 0.6671365574002266,
"rewards/cosine_scaled_reward": 0.1870325729250908,
"rewards/format_reward": 0.5694444477558136,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 462.69445037841797,
"epoch": 0.33769016498821514,
"grad_norm": 6.30225944519043,
"kl": 1.921875,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.2415,
"reward": 1.1110005229711533,
"reward_std": 0.8034192770719528,
"rewards/cosine_scaled_reward": 0.2777224676683545,
"rewards/format_reward": 0.555555559694767,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 449.52777099609375,
"epoch": 0.33854724662524105,
"grad_norm": 12.88353443145752,
"kl": 1.69921875,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.1616,
"reward": 1.2834501564502716,
"reward_std": 1.0129185914993286,
"rewards/cosine_scaled_reward": 0.3639473095536232,
"rewards/format_reward": 0.5555555522441864,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 424.6527786254883,
"epoch": 0.33940432826226696,
"grad_norm": 8.04071044921875,
"kl": 1.734375,
"learning_rate": 2.134908592756607e-07,
"loss": 0.0449,
"reward": 1.0254860520362854,
"reward_std": 0.8995675295591354,
"rewards/cosine_scaled_reward": 0.25579858385026455,
"rewards/format_reward": 0.5138888955116272,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 419.04166412353516,
"epoch": 0.34026140989929293,
"grad_norm": 13.297782897949219,
"kl": 1.74609375,
"learning_rate": 2.1141329099692406e-07,
"loss": 0.1566,
"reward": 1.252098884433508,
"reward_std": 0.8234367519617081,
"rewards/cosine_scaled_reward": 0.3482716903090477,
"rewards/format_reward": 0.5555555671453476,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 425.8472213745117,
"epoch": 0.34111849153631885,
"grad_norm": 57.158538818359375,
"kl": 1.884765625,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0898,
"reward": 1.0553182810544968,
"reward_std": 0.8401579111814499,
"rewards/cosine_scaled_reward": 0.22210356313735247,
"rewards/format_reward": 0.611111119389534,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 431.81944274902344,
"epoch": 0.34197557317334476,
"grad_norm": 10.068607330322266,
"kl": 2.22265625,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.1113,
"reward": 0.957691490650177,
"reward_std": 0.9538578987121582,
"rewards/cosine_scaled_reward": 0.2219013087451458,
"rewards/format_reward": 0.5138889029622078,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 472.25001525878906,
"epoch": 0.3428326548103707,
"grad_norm": 4.390395641326904,
"kl": 1.947265625,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.2373,
"reward": 0.8023122465237975,
"reward_std": 0.8230636864900589,
"rewards/cosine_scaled_reward": 0.19976721669081599,
"rewards/format_reward": 0.4027777872979641,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 455.05555725097656,
"epoch": 0.3436897364473966,
"grad_norm": 28.075910568237305,
"kl": 1.68359375,
"learning_rate": 2.032690407508949e-07,
"loss": 0.201,
"reward": 1.1237227618694305,
"reward_std": 0.9717634171247482,
"rewards/cosine_scaled_reward": 0.28408361971378326,
"rewards/format_reward": 0.5555555522441864,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 474.09722900390625,
"epoch": 0.34454681808442256,
"grad_norm": 34.623069763183594,
"kl": 1.5625,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.2037,
"reward": 0.760801451979205,
"reward_std": 0.9681618064641953,
"rewards/cosine_scaled_reward": 0.1859562654281035,
"rewards/format_reward": 0.3888888955116272,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 540.7361145019531,
"epoch": 0.34540389972144847,
"grad_norm": 7.207082271575928,
"kl": 1.66015625,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.1982,
"reward": 0.8205151949077845,
"reward_std": 0.904526948928833,
"rewards/cosine_scaled_reward": 0.2088687140494585,
"rewards/format_reward": 0.4027777835726738,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 435.2361145019531,
"epoch": 0.3462609813584744,
"grad_norm": 16.900236129760742,
"kl": 1.931640625,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.1385,
"reward": 0.6418208181858063,
"reward_std": 0.9299771934747696,
"rewards/cosine_scaled_reward": 0.07091040024533868,
"rewards/format_reward": 0.5000000074505806,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 429.41666412353516,
"epoch": 0.3471180629955003,
"grad_norm": 13.073430061340332,
"kl": 2.048828125,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.2188,
"reward": 0.8325824737548828,
"reward_std": 0.7748439311981201,
"rewards/cosine_scaled_reward": 0.1593467751517892,
"rewards/format_reward": 0.5138888955116272,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 524.9027862548828,
"epoch": 0.34797514463252627,
"grad_norm": 19.661832809448242,
"kl": 1.90625,
"learning_rate": 1.934696604901642e-07,
"loss": 0.2202,
"reward": 0.5801484230905771,
"reward_std": 0.7412025928497314,
"rewards/cosine_scaled_reward": 0.053963107988238335,
"rewards/format_reward": 0.4722222238779068,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 389.1111145019531,
"epoch": 0.3488322262695522,
"grad_norm": 14.274138450622559,
"kl": 1.583984375,
"learning_rate": 1.915615368891117e-07,
"loss": 0.124,
"reward": 0.9547043144702911,
"reward_std": 0.8225141167640686,
"rewards/cosine_scaled_reward": 0.18568546572350897,
"rewards/format_reward": 0.5833333432674408,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 445.6805648803711,
"epoch": 0.3496893079065781,
"grad_norm": 15.514725685119629,
"kl": 2.017578125,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0997,
"reward": 1.0453148484230042,
"reward_std": 1.1434958428144455,
"rewards/cosine_scaled_reward": 0.32126854080706835,
"rewards/format_reward": 0.4027777835726738,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 479.95833587646484,
"epoch": 0.350546389543604,
"grad_norm": 10.456230163574219,
"kl": 1.640625,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.1688,
"reward": 0.7981878519058228,
"reward_std": 0.8995783925056458,
"rewards/cosine_scaled_reward": 0.14214947074651718,
"rewards/format_reward": 0.5138888880610466,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 439.2361145019531,
"epoch": 0.35140347118063,
"grad_norm": 9.658367156982422,
"kl": 1.91796875,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.1396,
"reward": 0.7300833091139793,
"reward_std": 0.7368223965167999,
"rewards/cosine_scaled_reward": 0.12198610045015812,
"rewards/format_reward": 0.486111119389534,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 420.94445037841797,
"epoch": 0.3522605528176559,
"grad_norm": 27.74929428100586,
"kl": 1.73046875,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.2249,
"reward": 0.6258293315768242,
"reward_std": 0.8346365168690681,
"rewards/cosine_scaled_reward": 0.11152577586472034,
"rewards/format_reward": 0.4027777835726738,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 447.70833587646484,
"epoch": 0.3531176344546818,
"grad_norm": 8.646500587463379,
"kl": 1.943359375,
"learning_rate": 1.822847957491922e-07,
"loss": 0.1584,
"reward": 0.9124602228403091,
"reward_std": 0.9870292246341705,
"rewards/cosine_scaled_reward": 0.22011900879442692,
"rewards/format_reward": 0.4722222313284874,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 416.6527862548828,
"epoch": 0.3539747160917077,
"grad_norm": 21.185155868530273,
"kl": 1.671875,
"learning_rate": 1.804828558898332e-07,
"loss": 0.1789,
"reward": 1.2152755111455917,
"reward_std": 0.8829309791326523,
"rewards/cosine_scaled_reward": 0.357637744396925,
"rewards/format_reward": 0.5000000074505806,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 387.56945037841797,
"epoch": 0.3548317977287337,
"grad_norm": 7.426985740661621,
"kl": 1.640625,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.2077,
"reward": 0.7159564755856991,
"reward_std": 0.6246453821659088,
"rewards/cosine_scaled_reward": 0.15658933855593204,
"rewards/format_reward": 0.4027777872979641,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 422.47222900390625,
"epoch": 0.3556888793657596,
"grad_norm": 9.64609146118164,
"kl": 1.505859375,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0921,
"reward": 0.9304980635643005,
"reward_std": 0.908847376704216,
"rewards/cosine_scaled_reward": 0.2291379189118743,
"rewards/format_reward": 0.4722222238779068,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 430.3055648803711,
"epoch": 0.3565459610027855,
"grad_norm": 46.99845886230469,
"kl": 1.486328125,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.1551,
"reward": 0.926967169623822,
"reward_std": 0.8230779618024826,
"rewards/cosine_scaled_reward": 0.23431690875440836,
"rewards/format_reward": 0.45833333022892475,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 455.1527786254883,
"epoch": 0.35740304263981143,
"grad_norm": 11.308908462524414,
"kl": 1.482421875,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.2065,
"reward": 1.206569030880928,
"reward_std": 0.852574422955513,
"rewards/cosine_scaled_reward": 0.27689564414322376,
"rewards/format_reward": 0.6527777910232544,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 449.0,
"epoch": 0.35826012427683734,
"grad_norm": 94.13972473144531,
"kl": 1.634765625,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.1649,
"reward": 1.0392098128795624,
"reward_std": 0.7440510094165802,
"rewards/cosine_scaled_reward": 0.2418271228671074,
"rewards/format_reward": 0.5555555671453476,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 364.3472213745117,
"epoch": 0.3591172059138633,
"grad_norm": 13.36479663848877,
"kl": 1.822265625,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0298,
"reward": 1.1378347873687744,
"reward_std": 0.9185468256473541,
"rewards/cosine_scaled_reward": 0.26336181070655584,
"rewards/format_reward": 0.6111111268401146,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 406.4027786254883,
"epoch": 0.3599742875508892,
"grad_norm": 8.439767837524414,
"kl": 1.845703125,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.1749,
"reward": 1.267815724015236,
"reward_std": 0.7989730685949326,
"rewards/cosine_scaled_reward": 0.38390786573290825,
"rewards/format_reward": 0.5,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 461.3472213745117,
"epoch": 0.36083136918791514,
"grad_norm": 22.24334716796875,
"kl": 1.705078125,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.137,
"reward": 0.7026118133217096,
"reward_std": 0.6709327548742294,
"rewards/cosine_scaled_reward": 0.10825034603476524,
"rewards/format_reward": 0.486111119389534,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 429.2916717529297,
"epoch": 0.36168845082494105,
"grad_norm": 17.320945739746094,
"kl": 1.8046875,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.3511,
"reward": 1.3339325338602066,
"reward_std": 0.9927608072757721,
"rewards/cosine_scaled_reward": 0.3891884870827198,
"rewards/format_reward": 0.555555559694767,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 391.02779388427734,
"epoch": 0.362545532461967,
"grad_norm": 13.228276252746582,
"kl": 1.7890625,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.1998,
"reward": 0.7798476368188858,
"reward_std": 0.8442924916744232,
"rewards/cosine_scaled_reward": 0.1815904900431633,
"rewards/format_reward": 0.416666679084301,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 419.55555725097656,
"epoch": 0.36340261409899294,
"grad_norm": 4.867981910705566,
"kl": 1.88671875,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.141,
"reward": 0.38255439326167107,
"reward_std": 0.6649321764707565,
"rewards/cosine_scaled_reward": 0.003777193371206522,
"rewards/format_reward": 0.3750000074505806,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 444.80554962158203,
"epoch": 0.36425969573601885,
"grad_norm": 19.474985122680664,
"kl": 1.388671875,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.2453,
"reward": 0.8091739304363728,
"reward_std": 0.6519715338945389,
"rewards/cosine_scaled_reward": 0.1476425053551793,
"rewards/format_reward": 0.5138888955116272,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 364.4861068725586,
"epoch": 0.36511677737304477,
"grad_norm": 5.00600004196167,
"kl": 1.5703125,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.0966,
"reward": 1.0419821739196777,
"reward_std": 0.8779765665531158,
"rewards/cosine_scaled_reward": 0.24321329407393932,
"rewards/format_reward": 0.5555555745959282,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 403.34722900390625,
"epoch": 0.36597385901007073,
"grad_norm": 17.935285568237305,
"kl": 2.138671875,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.1828,
"reward": 0.46462448686361313,
"reward_std": 0.658824697136879,
"rewards/cosine_scaled_reward": 0.01703446265310049,
"rewards/format_reward": 0.430555559694767,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 457.01390075683594,
"epoch": 0.36683094064709665,
"grad_norm": 24.413970947265625,
"kl": 1.9921875,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.2397,
"reward": 0.9884657636284828,
"reward_std": 0.9193740636110306,
"rewards/cosine_scaled_reward": 0.2928439930547029,
"rewards/format_reward": 0.4027777835726738,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 411.9027786254883,
"epoch": 0.36768802228412256,
"grad_norm": 6.04814338684082,
"kl": 2.146484375,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.2913,
"reward": 0.7584970518946648,
"reward_std": 0.8831272125244141,
"rewards/cosine_scaled_reward": 0.10841521085239947,
"rewards/format_reward": 0.5416666716337204,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 378.62500762939453,
"epoch": 0.3685451039211485,
"grad_norm": 16.954452514648438,
"kl": 1.873046875,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.2931,
"reward": 0.9981023781001568,
"reward_std": 0.7928592413663864,
"rewards/cosine_scaled_reward": 0.26988449646160007,
"rewards/format_reward": 0.4583333395421505,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 461.6527862548828,
"epoch": 0.3694021855581744,
"grad_norm": 16.10793113708496,
"kl": 1.787109375,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.2419,
"reward": 0.9780825227499008,
"reward_std": 0.897609755396843,
"rewards/cosine_scaled_reward": 0.22515234909951687,
"rewards/format_reward": 0.5277777910232544,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 428.72222900390625,
"epoch": 0.37025926719520036,
"grad_norm": 6.1311869621276855,
"kl": 1.884765625,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.2316,
"reward": 1.065040536224842,
"reward_std": 0.827082633972168,
"rewards/cosine_scaled_reward": 0.24779804050922394,
"rewards/format_reward": 0.5694444626569748,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 391.5,
"epoch": 0.3711163488322263,
"grad_norm": 19.106285095214844,
"kl": 1.501953125,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0608,
"reward": 1.2501190304756165,
"reward_std": 0.6760459691286087,
"rewards/cosine_scaled_reward": 0.36811505258083344,
"rewards/format_reward": 0.5138888955116272,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 399.8472213745117,
"epoch": 0.3719734304692522,
"grad_norm": 17.120777130126953,
"kl": 1.87890625,
"learning_rate": 1.469297078922642e-07,
"loss": 0.2051,
"reward": 0.7955693230032921,
"reward_std": 0.6731881201267242,
"rewards/cosine_scaled_reward": 0.11306244693696499,
"rewards/format_reward": 0.5694444626569748,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 464.7083435058594,
"epoch": 0.3728305121062781,
"grad_norm": 17.019750595092773,
"kl": 1.630859375,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.2255,
"reward": 0.9147238731384277,
"reward_std": 0.8846637308597565,
"rewards/cosine_scaled_reward": 0.16569526493549347,
"rewards/format_reward": 0.5833333507180214,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 374.4861145019531,
"epoch": 0.37368759374330407,
"grad_norm": 13.612043380737305,
"kl": 2.029296875,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.2119,
"reward": 0.624715980142355,
"reward_std": 0.8500286191701889,
"rewards/cosine_scaled_reward": 0.06930242432281375,
"rewards/format_reward": 0.4861111119389534,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 380.12500381469727,
"epoch": 0.37454467538033,
"grad_norm": 19.181344985961914,
"kl": 1.3828125,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0972,
"reward": 1.1469180285930634,
"reward_std": 0.8286690264940262,
"rewards/cosine_scaled_reward": 0.2609590096399188,
"rewards/format_reward": 0.625,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 437.75000762939453,
"epoch": 0.3754017570173559,
"grad_norm": 32.06253433227539,
"kl": 1.892578125,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.1621,
"reward": 0.9302653223276138,
"reward_std": 0.9272859841585159,
"rewards/cosine_scaled_reward": 0.22902152687311172,
"rewards/format_reward": 0.4722222313284874,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 427.75,
"epoch": 0.3762588386543818,
"grad_norm": 12.294611930847168,
"kl": 2.0078125,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.2887,
"reward": 0.9489690512418747,
"reward_std": 0.91136734187603,
"rewards/cosine_scaled_reward": 0.16892896872013807,
"rewards/format_reward": 0.6111111119389534,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 392.9027862548828,
"epoch": 0.3771159202914078,
"grad_norm": 11.586893081665039,
"kl": 1.779296875,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.1511,
"reward": 1.3165553212165833,
"reward_std": 1.0282287746667862,
"rewards/cosine_scaled_reward": 0.40827762335538864,
"rewards/format_reward": 0.5,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 378.2777786254883,
"epoch": 0.3779730019284337,
"grad_norm": 7.445328235626221,
"kl": 1.876953125,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.2252,
"reward": 0.4708889238536358,
"reward_std": 0.5369373112916946,
"rewards/cosine_scaled_reward": 0.01322223711758852,
"rewards/format_reward": 0.4444444552063942,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 366.52777099609375,
"epoch": 0.3788300835654596,
"grad_norm": 37.63726043701172,
"kl": 1.6171875,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.2202,
"reward": 0.9833096265792847,
"reward_std": 0.6800422966480255,
"rewards/cosine_scaled_reward": 0.19998812582343817,
"rewards/format_reward": 0.5833333507180214,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 433.93055725097656,
"epoch": 0.3796871652024855,
"grad_norm": 10.03159236907959,
"kl": 1.92578125,
"learning_rate": 1.351615817851748e-07,
"loss": 0.1216,
"reward": 0.8934760093688965,
"reward_std": 0.8315132707357407,
"rewards/cosine_scaled_reward": 0.23840466793626547,
"rewards/format_reward": 0.4166666716337204,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 408.19444274902344,
"epoch": 0.3805442468395115,
"grad_norm": 10.382827758789062,
"kl": 1.80859375,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.1845,
"reward": 0.8439731672406197,
"reward_std": 0.8578460216522217,
"rewards/cosine_scaled_reward": 0.19976436160504818,
"rewards/format_reward": 0.4444444552063942,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 397.7777786254883,
"epoch": 0.3814013284765374,
"grad_norm": 25.172704696655273,
"kl": 1.984375,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.2339,
"reward": 0.9671718925237656,
"reward_std": 0.8240036368370056,
"rewards/cosine_scaled_reward": 0.2613637112081051,
"rewards/format_reward": 0.4444444552063942,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 406.43055725097656,
"epoch": 0.3822584101135633,
"grad_norm": 5.355282783508301,
"kl": 1.91015625,
"learning_rate": 1.316005813502869e-07,
"loss": 0.1568,
"reward": 0.802594855427742,
"reward_std": 0.7209294140338898,
"rewards/cosine_scaled_reward": 0.13740853779017925,
"rewards/format_reward": 0.5277777835726738,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 387.69444274902344,
"epoch": 0.38311549175058923,
"grad_norm": 7.822175979614258,
"kl": 1.595703125,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.1097,
"reward": 0.7201132848858833,
"reward_std": 0.9133000522851944,
"rewards/cosine_scaled_reward": 0.1447788504883647,
"rewards/format_reward": 0.4305555671453476,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 412.4722213745117,
"epoch": 0.38397257338761515,
"grad_norm": 7.698829650878906,
"kl": 1.88671875,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.1968,
"reward": 0.8293040692806244,
"reward_std": 0.9215101897716522,
"rewards/cosine_scaled_reward": 0.1715964898467064,
"rewards/format_reward": 0.486111119389534,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 393.54166412353516,
"epoch": 0.3848296550246411,
"grad_norm": 9.48220157623291,
"kl": 1.6025390625,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.149,
"reward": 0.9041518270969391,
"reward_std": 0.6833358332514763,
"rewards/cosine_scaled_reward": 0.18124257400631905,
"rewards/format_reward": 0.5416666641831398,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 418.66666412353516,
"epoch": 0.38568673666166703,
"grad_norm": 23.154109954833984,
"kl": 1.86328125,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.1058,
"reward": 0.7730568274855614,
"reward_std": 0.8151258826255798,
"rewards/cosine_scaled_reward": 0.17819508351385593,
"rewards/format_reward": 0.4166666716337204,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 412.0,
"epoch": 0.38654381829869294,
"grad_norm": 11.1907377243042,
"kl": 1.623046875,
"learning_rate": 1.260741462457165e-07,
"loss": 0.1147,
"reward": 0.7459932379424572,
"reward_std": 0.6991625279188156,
"rewards/cosine_scaled_reward": 0.09521883772686124,
"rewards/format_reward": 0.5555555522441864,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 379.6388931274414,
"epoch": 0.38740089993571886,
"grad_norm": 38.63614273071289,
"kl": 1.52734375,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0696,
"reward": 0.8107917159795761,
"reward_std": 0.8233655989170074,
"rewards/cosine_scaled_reward": 0.11372919054701924,
"rewards/format_reward": 0.5833333432674408,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 400.44444274902344,
"epoch": 0.3882579815727448,
"grad_norm": 7.2680439949035645,
"kl": 1.779296875,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.2399,
"reward": 0.8938721343874931,
"reward_std": 0.8410957902669907,
"rewards/cosine_scaled_reward": 0.16915827617049217,
"rewards/format_reward": 0.5555555671453476,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 418.44445037841797,
"epoch": 0.38911506320977074,
"grad_norm": 8.975519180297852,
"kl": 1.748046875,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.1852,
"reward": 1.3426352962851524,
"reward_std": 0.8928624093532562,
"rewards/cosine_scaled_reward": 0.35881765000522137,
"rewards/format_reward": 0.625,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 399.25000762939453,
"epoch": 0.38997214484679665,
"grad_norm": 8.324163436889648,
"kl": 1.79296875,
"learning_rate": 1.220245676671809e-07,
"loss": 0.305,
"reward": 0.5967597924172878,
"reward_std": 0.6817344427108765,
"rewards/cosine_scaled_reward": 0.06921320641413331,
"rewards/format_reward": 0.4583333507180214,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 366.3888931274414,
"epoch": 0.39082922648382257,
"grad_norm": 6.22599458694458,
"kl": 1.716796875,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.195,
"reward": 1.2024425864219666,
"reward_std": 0.9370259791612625,
"rewards/cosine_scaled_reward": 0.30955461598932743,
"rewards/format_reward": 0.5833333358168602,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 379.9861145019531,
"epoch": 0.39168630812084854,
"grad_norm": 10.595834732055664,
"kl": 2.078125,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.1005,
"reward": 1.4162960648536682,
"reward_std": 0.9950994998216629,
"rewards/cosine_scaled_reward": 0.3887035697698593,
"rewards/format_reward": 0.6388889104127884,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 398.97222900390625,
"epoch": 0.39254338975787445,
"grad_norm": 9.582780838012695,
"kl": 1.748046875,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.1435,
"reward": 0.8297743499279022,
"reward_std": 0.6032012775540352,
"rewards/cosine_scaled_reward": 0.1301649445667863,
"rewards/format_reward": 0.569444440305233,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 416.4166793823242,
"epoch": 0.39340047139490036,
"grad_norm": 13.351261138916016,
"kl": 1.697265625,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.1826,
"reward": 1.3233585357666016,
"reward_std": 0.9728284627199173,
"rewards/cosine_scaled_reward": 0.3561236932873726,
"rewards/format_reward": 0.6111111268401146,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 389.50000762939453,
"epoch": 0.3942575530319263,
"grad_norm": 7.549479007720947,
"kl": 1.9921875,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.2677,
"reward": 1.1245174407958984,
"reward_std": 0.7800677567720413,
"rewards/cosine_scaled_reward": 0.2705920338630676,
"rewards/format_reward": 0.5833333432674408,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 398.0277862548828,
"epoch": 0.3951146346689522,
"grad_norm": 9.143904685974121,
"kl": 1.7109375,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.2179,
"reward": 0.8851038962602615,
"reward_std": 0.7892615795135498,
"rewards/cosine_scaled_reward": 0.15782971866428852,
"rewards/format_reward": 0.5694444552063942,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 373.0138931274414,
"epoch": 0.39597171630597816,
"grad_norm": 6.645680904388428,
"kl": 1.9296875,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.2493,
"reward": 1.0129882618784904,
"reward_std": 0.790315642952919,
"rewards/cosine_scaled_reward": 0.22177189541980624,
"rewards/format_reward": 0.5694444626569748,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 394.30555725097656,
"epoch": 0.3968287979430041,
"grad_norm": 8.618224143981934,
"kl": 1.84765625,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.1754,
"reward": 1.06332229077816,
"reward_std": 0.9512833207845688,
"rewards/cosine_scaled_reward": 0.2677722591906786,
"rewards/format_reward": 0.5277777910232544,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 327.58333587646484,
"epoch": 0.39768587958003,
"grad_norm": 6.552804470062256,
"kl": 2.2421875,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.1814,
"reward": 1.1924152821302414,
"reward_std": 0.8387909829616547,
"rewards/cosine_scaled_reward": 0.2975965216755867,
"rewards/format_reward": 0.5972222313284874,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 381.45833587646484,
"epoch": 0.3985429612170559,
"grad_norm": 23.327707290649414,
"kl": 1.900390625,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.2125,
"reward": 0.9507962316274643,
"reward_std": 0.8488497734069824,
"rewards/cosine_scaled_reward": 0.21845365059562027,
"rewards/format_reward": 0.5138888955116272,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 388.25000762939453,
"epoch": 0.39940004285408187,
"grad_norm": 16.77749252319336,
"kl": 2.42578125,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.1887,
"reward": 1.0224937349557877,
"reward_std": 0.7094171047210693,
"rewards/cosine_scaled_reward": 0.2543024020269513,
"rewards/format_reward": 0.5138888955116272,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 395.4027786254883,
"epoch": 0.4002571244911078,
"grad_norm": 22.206117630004883,
"kl": 1.734375,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.1153,
"reward": 1.1953821629285812,
"reward_std": 0.9889565110206604,
"rewards/cosine_scaled_reward": 0.3338021747767925,
"rewards/format_reward": 0.5277777835726738,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 443.1805648803711,
"epoch": 0.4011142061281337,
"grad_norm": 21.06090545654297,
"kl": 2.35546875,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.2039,
"reward": 0.6936846375465393,
"reward_std": 0.7559118419885635,
"rewards/cosine_scaled_reward": 0.15934233367443085,
"rewards/format_reward": 0.3750000074505806,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 375.4027786254883,
"epoch": 0.4019712877651596,
"grad_norm": 11.80557632446289,
"kl": 1.93359375,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.31,
"reward": 1.1770465597510338,
"reward_std": 0.9983221143484116,
"rewards/cosine_scaled_reward": 0.3385232575237751,
"rewards/format_reward": 0.5000000037252903,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 398.9722213745117,
"epoch": 0.4028283694021856,
"grad_norm": 3.4314489364624023,
"kl": 2.005859375,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.3436,
"reward": 0.823687631636858,
"reward_std": 1.019385039806366,
"rewards/cosine_scaled_reward": 0.21045495197176933,
"rewards/format_reward": 0.4027777872979641,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 405.87500762939453,
"epoch": 0.4036854510392115,
"grad_norm": 28.387819290161133,
"kl": 2.25390625,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.2678,
"reward": 0.5487676113843918,
"reward_std": 0.7938825041055679,
"rewards/cosine_scaled_reward": 0.059106036089360714,
"rewards/format_reward": 0.43055555410683155,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 405.44444274902344,
"epoch": 0.4045425326762374,
"grad_norm": 14.908821105957031,
"kl": 1.94140625,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0599,
"reward": 0.9214093834161758,
"reward_std": 0.8387825936079025,
"rewards/cosine_scaled_reward": 0.22459355555474758,
"rewards/format_reward": 0.472222238779068,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 431.7777786254883,
"epoch": 0.4053996143132633,
"grad_norm": 31.563308715820312,
"kl": 2.39453125,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.1599,
"reward": 0.9639946967363358,
"reward_std": 0.9418660998344421,
"rewards/cosine_scaled_reward": 0.23199738003313541,
"rewards/format_reward": 0.5000000149011612,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 345.43055725097656,
"epoch": 0.4062566959502893,
"grad_norm": 23.537080764770508,
"kl": 2.203125,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.1802,
"reward": 0.9301662147045135,
"reward_std": 0.9208797365427017,
"rewards/cosine_scaled_reward": 0.24286089045926929,
"rewards/format_reward": 0.4444444552063942,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 407.0416717529297,
"epoch": 0.4071137775873152,
"grad_norm": 113.17594909667969,
"kl": 2.009765625,
"learning_rate": 1.068365111445064e-07,
"loss": 0.1716,
"reward": 0.6663865000009537,
"reward_std": 0.6882055103778839,
"rewards/cosine_scaled_reward": 0.12485991045832634,
"rewards/format_reward": 0.4166666753590107,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 392.68055725097656,
"epoch": 0.4079708592243411,
"grad_norm": 7.956409454345703,
"kl": 2.01171875,
"learning_rate": 1.063017833182728e-07,
"loss": 0.1263,
"reward": 1.0667885690927505,
"reward_std": 0.7681434005498886,
"rewards/cosine_scaled_reward": 0.2347831572405994,
"rewards/format_reward": 0.597222238779068,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 415.0416564941406,
"epoch": 0.40882794086136703,
"grad_norm": 9.29298210144043,
"kl": 2.6171875,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.2517,
"reward": 0.8619468212127686,
"reward_std": 0.951014369726181,
"rewards/cosine_scaled_reward": 0.17402894236147404,
"rewards/format_reward": 0.5138888955116272,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 420.3611145019531,
"epoch": 0.40968502249839295,
"grad_norm": 11.378466606140137,
"kl": 1.888671875,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.1412,
"reward": 0.7714032009243965,
"reward_std": 0.8700239658355713,
"rewards/cosine_scaled_reward": 0.15653494279831648,
"rewards/format_reward": 0.4583333358168602,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 368.3333435058594,
"epoch": 0.4105421041354189,
"grad_norm": 14.92309284210205,
"kl": 2.076171875,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.1466,
"reward": 1.3654054403305054,
"reward_std": 0.9904145002365112,
"rewards/cosine_scaled_reward": 0.4327027127146721,
"rewards/format_reward": 0.5000000074505806,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 401.55555725097656,
"epoch": 0.41139918577244483,
"grad_norm": 18.368589401245117,
"kl": 2.294921875,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.1576,
"reward": 1.0128132551908493,
"reward_std": 0.808688297867775,
"rewards/cosine_scaled_reward": 0.24251772370189428,
"rewards/format_reward": 0.527777798473835,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 394.93054962158203,
"epoch": 0.41225626740947074,
"grad_norm": 15.127246856689453,
"kl": 2.04296875,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.1467,
"reward": 0.712806798517704,
"reward_std": 1.0079237669706345,
"rewards/cosine_scaled_reward": 0.15501452051103115,
"rewards/format_reward": 0.4027777910232544,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 508.3333282470703,
"epoch": 0.41311334904649666,
"grad_norm": 14.581649780273438,
"kl": 1.87890625,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0734,
"reward": 1.3022873476147652,
"reward_std": 0.9221579432487488,
"rewards/cosine_scaled_reward": 0.36642143689095974,
"rewards/format_reward": 0.5694444477558136,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 448.06945037841797,
"epoch": 0.4139704306835226,
"grad_norm": 16.139543533325195,
"kl": 1.8984375,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.1623,
"reward": 0.8377330377697945,
"reward_std": 0.822199173271656,
"rewards/cosine_scaled_reward": 0.17581098433583975,
"rewards/format_reward": 0.4861111119389534,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 371.7222213745117,
"epoch": 0.41482751232054854,
"grad_norm": 39.747249603271484,
"kl": 2.271484375,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.0188,
"reward": 0.9065616875886917,
"reward_std": 0.8971037119626999,
"rewards/cosine_scaled_reward": 0.2032808493822813,
"rewards/format_reward": 0.5000000074505806,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 411.3472213745117,
"epoch": 0.41568459395757446,
"grad_norm": 12.812687873840332,
"kl": 2.158203125,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.1918,
"reward": 0.7322559207677841,
"reward_std": 0.8131649047136307,
"rewards/cosine_scaled_reward": 0.08835018612444401,
"rewards/format_reward": 0.555555559694767,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 362.08333587646484,
"epoch": 0.41654167559460037,
"grad_norm": 35.39934539794922,
"kl": 1.7734375,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.1173,
"reward": 0.9961818382143974,
"reward_std": 0.6925189048051834,
"rewards/cosine_scaled_reward": 0.22725759260356426,
"rewards/format_reward": 0.5416666716337204,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 349.7777786254883,
"epoch": 0.41739875723162634,
"grad_norm": 35.59743881225586,
"kl": 1.703125,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.1241,
"reward": 0.8367552310228348,
"reward_std": 0.638402059674263,
"rewards/cosine_scaled_reward": 0.14059981796890497,
"rewards/format_reward": 0.555555559694767,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 446.94444274902344,
"epoch": 0.41825583886865225,
"grad_norm": 9.523124694824219,
"kl": 2.037109375,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.1536,
"reward": 0.9173677563667297,
"reward_std": 0.8871555328369141,
"rewards/cosine_scaled_reward": 0.15312829986214638,
"rewards/format_reward": 0.611111119389534,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 406.5416717529297,
"epoch": 0.41911292050567817,
"grad_norm": 18.345104217529297,
"kl": 1.677734375,
"learning_rate": 1.013262614978859e-07,
"loss": 0.2052,
"reward": 1.6547060012817383,
"reward_std": 0.9650345891714096,
"rewards/cosine_scaled_reward": 0.5217974632978439,
"rewards/format_reward": 0.611111119389534,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 414.1527786254883,
"epoch": 0.4199700021427041,
"grad_norm": 31.926456451416016,
"kl": 1.89453125,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.1587,
"reward": 1.0247105360031128,
"reward_std": 0.9507659077644348,
"rewards/cosine_scaled_reward": 0.2692996822297573,
"rewards/format_reward": 0.4861111268401146,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 367.19445037841797,
"epoch": 0.42082708377973,
"grad_norm": 128.7314453125,
"kl": 1.83203125,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0835,
"reward": 0.910501167178154,
"reward_std": 0.8024759143590927,
"rewards/cosine_scaled_reward": 0.17747278325259686,
"rewards/format_reward": 0.5555555671453476,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 438.0416717529297,
"epoch": 0.42168416541675596,
"grad_norm": 6.026768684387207,
"kl": 1.734375,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.1438,
"reward": 1.2874046564102173,
"reward_std": 0.817937821149826,
"rewards/cosine_scaled_reward": 0.40064676851034164,
"rewards/format_reward": 0.4861111268401146,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 419.15277099609375,
"epoch": 0.4225412470537819,
"grad_norm": 23.038436889648438,
"kl": 1.763671875,
"learning_rate": 1.005372381963547e-07,
"loss": 0.2819,
"reward": 1.25694739818573,
"reward_std": 0.8772137686610222,
"rewards/cosine_scaled_reward": 0.32291813008487225,
"rewards/format_reward": 0.6111111119389534,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 414.87500762939453,
"epoch": 0.4233983286908078,
"grad_norm": 41.3348274230957,
"kl": 1.759765625,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0636,
"reward": 1.1973340883851051,
"reward_std": 0.8486432880163193,
"rewards/cosine_scaled_reward": 0.32088930322788656,
"rewards/format_reward": 0.5555555745959282,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 472.7361068725586,
"epoch": 0.4242554103278337,
"grad_norm": 19.50507354736328,
"kl": 1.740234375,
"learning_rate": 1.002741278414069e-07,
"loss": 0.1005,
"reward": 1.1010611280798912,
"reward_std": 0.7267665565013885,
"rewards/cosine_scaled_reward": 0.238030556589365,
"rewards/format_reward": 0.625,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 442.76390075683594,
"epoch": 0.4251124919648597,
"grad_norm": 17.064443588256836,
"kl": 1.947265625,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.1467,
"reward": 0.777456559240818,
"reward_std": 0.8870294690132141,
"rewards/cosine_scaled_reward": 0.12483939621597528,
"rewards/format_reward": 0.5277777910232544,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 459.98609924316406,
"epoch": 0.4259695736018856,
"grad_norm": 26.50463104248047,
"kl": 1.8984375,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.1413,
"reward": 0.7809455767273903,
"reward_std": 0.739835649728775,
"rewards/cosine_scaled_reward": 0.11963944719173014,
"rewards/format_reward": 0.5416666716337204,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 429.84722900390625,
"epoch": 0.4268266552389115,
"grad_norm": 7.440047740936279,
"kl": 2.583984375,
"learning_rate": 1.000438641958131e-07,
"loss": 0.2265,
"reward": 0.9033599346876144,
"reward_std": 1.034580335021019,
"rewards/cosine_scaled_reward": 0.20167998038232327,
"rewards/format_reward": 0.5000000149011612,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 430.4166717529297,
"epoch": 0.4276837368759374,
"grad_norm": 9.689515113830566,
"kl": 1.7890625,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.1179,
"reward": 0.6143735200166702,
"reward_std": 0.6945628225803375,
"rewards/cosine_scaled_reward": 0.02246453333646059,
"rewards/format_reward": 0.5694444477558136,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 378.05555725097656,
"epoch": 0.4285408185129634,
"grad_norm": 44.88062286376953,
"kl": 1.921875,
"learning_rate": 1e-07,
"loss": 0.127,
"reward": 1.3835089206695557,
"reward_std": 1.1280009299516678,
"rewards/cosine_scaled_reward": 0.4000878185033798,
"rewards/format_reward": 0.5833333432674408,
"step": 500
},
{
"epoch": 0.4285408185129634,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.15917640645144274,
"train_runtime": 31189.1196,
"train_samples_per_second": 1.154,
"train_steps_per_second": 0.016
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}