Seg-R1-SOD-7B / trainer_state.json
geshang's picture
Upload folder using huggingface_hub
35fa320 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9982944855031268,
"eval_steps": 500,
"global_step": 439,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 147.92708587646484,
"epoch": 0.0022740193291642978,
"grad_norm": 8.737942695617676,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 1.6511709690093994,
"reward_std": 0.33823655918240547,
"rewards/format_reward": 0.9166666865348816,
"rewards/segmentation_reward": 0.7345042824745178,
"step": 1
},
{
"completion_length": 144.85417556762695,
"epoch": 0.0045480386583285955,
"grad_norm": 8.010404586791992,
"kl": 0.0012912750244140625,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 1.6437489092350006,
"reward_std": 0.2732698582112789,
"rewards/format_reward": 0.9166667014360428,
"rewards/segmentation_reward": 0.7270822674036026,
"step": 2
},
{
"completion_length": 150.1041717529297,
"epoch": 0.006822057987492893,
"grad_norm": 12.285407066345215,
"kl": 0.0012798309326171875,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 1.53373184800148,
"reward_std": 0.43030911684036255,
"rewards/format_reward": 0.8437500149011612,
"rewards/segmentation_reward": 0.6899818480014801,
"step": 3
},
{
"completion_length": 147.12500381469727,
"epoch": 0.009096077316657191,
"grad_norm": 10.881176948547363,
"kl": 0.0028820037841796875,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 1.6048874258995056,
"reward_std": 0.3273423947393894,
"rewards/format_reward": 0.8958333432674408,
"rewards/segmentation_reward": 0.7090541273355484,
"step": 4
},
{
"completion_length": 144.3958396911621,
"epoch": 0.01137009664582149,
"grad_norm": 12.837152481079102,
"kl": 0.002300262451171875,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 1.611760675907135,
"reward_std": 0.37994210980832577,
"rewards/format_reward": 0.9062500149011612,
"rewards/segmentation_reward": 0.7055106610059738,
"step": 5
},
{
"completion_length": 139.15625381469727,
"epoch": 0.013644115974985787,
"grad_norm": 6.645235061645508,
"kl": 0.0039215087890625,
"learning_rate": 1e-06,
"loss": 0.0002,
"reward": 1.726276457309723,
"reward_std": 0.24428023397922516,
"rewards/format_reward": 0.9479166865348816,
"rewards/segmentation_reward": 0.7783599197864532,
"step": 6
},
{
"completion_length": 150.1145896911621,
"epoch": 0.015918135304150087,
"grad_norm": 12.681654930114746,
"kl": 0.0052032470703125,
"learning_rate": 1e-06,
"loss": 0.0002,
"reward": 1.7287286818027496,
"reward_std": 0.23665708303451538,
"rewards/format_reward": 0.9479166865348816,
"rewards/segmentation_reward": 0.7808119505643845,
"step": 7
},
{
"completion_length": 136.68750381469727,
"epoch": 0.018192154633314382,
"grad_norm": 8.222872734069824,
"kl": 0.00634002685546875,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 1.8023037910461426,
"reward_std": 0.09579922584816813,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8127204030752182,
"step": 8
},
{
"completion_length": 145.3645896911621,
"epoch": 0.02046617396247868,
"grad_norm": 12.586159706115723,
"kl": 0.0068511962890625,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 1.6984942257404327,
"reward_std": 0.24017422273755074,
"rewards/format_reward": 0.9375000298023224,
"rewards/segmentation_reward": 0.7609941959381104,
"step": 9
},
{
"completion_length": 136.4895896911621,
"epoch": 0.02274019329164298,
"grad_norm": 19.224149703979492,
"kl": 0.008087158203125,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 1.817267656326294,
"reward_std": 0.10884078592061996,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8276843428611755,
"step": 10
},
{
"completion_length": 150.14583587646484,
"epoch": 0.025014212620807278,
"grad_norm": 17.966585159301758,
"kl": 0.0106658935546875,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 1.6834727227687836,
"reward_std": 0.3056778460741043,
"rewards/format_reward": 0.927083358168602,
"rewards/segmentation_reward": 0.7563893795013428,
"step": 11
},
{
"completion_length": 139.58333587646484,
"epoch": 0.027288231949971573,
"grad_norm": 12.995298385620117,
"kl": 0.0103912353515625,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 1.7785775661468506,
"reward_std": 0.1881927289068699,
"rewards/format_reward": 0.958333358168602,
"rewards/segmentation_reward": 0.8202441930770874,
"step": 12
},
{
"completion_length": 134.56250762939453,
"epoch": 0.029562251279135872,
"grad_norm": 5.952467918395996,
"kl": 0.0095977783203125,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 1.8030498623847961,
"reward_std": 0.15592540614306927,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.8342998623847961,
"step": 13
},
{
"completion_length": 132.7291717529297,
"epoch": 0.031836270608300174,
"grad_norm": 14.455424308776855,
"kl": 0.01409912109375,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 1.8657137751579285,
"reward_std": 0.04672300070524216,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8657138496637344,
"step": 14
},
{
"completion_length": 134.8229217529297,
"epoch": 0.03411028993746447,
"grad_norm": 16.328163146972656,
"kl": 0.015960693359375,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.7946673929691315,
"reward_std": 0.1416209153831005,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.8259173631668091,
"step": 15
},
{
"completion_length": 129.48958778381348,
"epoch": 0.036384309266628764,
"grad_norm": 10.300512313842773,
"kl": 0.016510009765625,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.8089908957481384,
"reward_std": 0.11957723228260875,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8298242688179016,
"step": 16
},
{
"completion_length": 130.70833587646484,
"epoch": 0.038658328595793066,
"grad_norm": 7.11127233505249,
"kl": 0.01666259765625,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.8404812514781952,
"reward_std": 0.09966395457740873,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.86131452023983,
"step": 17
},
{
"completion_length": 131.60417366027832,
"epoch": 0.04093234792495736,
"grad_norm": 34.26894760131836,
"kl": 0.015533447265625,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 1.8397277891635895,
"reward_std": 0.0865055019967258,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8605611473321915,
"step": 18
},
{
"completion_length": 128.85416984558105,
"epoch": 0.04320636725412166,
"grad_norm": 8.23103141784668,
"kl": 0.018310546875,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.8531556725502014,
"reward_std": 0.02550937162595801,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8531556576490402,
"step": 19
},
{
"completion_length": 130.83333587646484,
"epoch": 0.04548038658328596,
"grad_norm": 11.679638862609863,
"kl": 0.0172882080078125,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.8632822334766388,
"reward_std": 0.04053949285298586,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8632822781801224,
"step": 20
},
{
"completion_length": 130.02083587646484,
"epoch": 0.047754405912450254,
"grad_norm": 57.03630447387695,
"kl": 0.0178070068359375,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.8496468663215637,
"reward_std": 0.03478804882615805,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8496468216180801,
"step": 21
},
{
"completion_length": 130.65625190734863,
"epoch": 0.050028425241614556,
"grad_norm": 28.802846908569336,
"kl": 0.0260009765625,
"learning_rate": 1e-06,
"loss": 0.001,
"reward": 1.8245560228824615,
"reward_std": 0.1071182056912221,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.84538933634758,
"step": 22
},
{
"completion_length": 128.0937557220459,
"epoch": 0.05230244457077885,
"grad_norm": 10.76288890838623,
"kl": 0.0197296142578125,
"learning_rate": 1e-06,
"loss": 0.0008,
"reward": 1.7964556813240051,
"reward_std": 0.129461950622499,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8172890096902847,
"step": 23
},
{
"completion_length": 130.0520896911621,
"epoch": 0.054576463899943146,
"grad_norm": 6.2558064460754395,
"kl": 0.0218505859375,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 1.8779499530792236,
"reward_std": 0.023469227773603052,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.877949982881546,
"step": 24
},
{
"completion_length": 126.92708587646484,
"epoch": 0.05685048322910745,
"grad_norm": 10.53512954711914,
"kl": 0.017486572265625,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.8650535941123962,
"reward_std": 0.03197958506643772,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8650535494089127,
"step": 25
},
{
"completion_length": 128.20833778381348,
"epoch": 0.059124502558271744,
"grad_norm": 37.11606216430664,
"kl": 0.0267791748046875,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 1.832018405199051,
"reward_std": 0.07763887383043766,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8424350172281265,
"step": 26
},
{
"completion_length": 132.5625057220459,
"epoch": 0.061398521887436046,
"grad_norm": 26.71733856201172,
"kl": 0.019134521484375,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.8609023690223694,
"reward_std": 0.05331907293293625,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8713190257549286,
"step": 27
},
{
"completion_length": 129.0000057220459,
"epoch": 0.06367254121660035,
"grad_norm": 7.346284866333008,
"kl": 0.017791748046875,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.806743562221527,
"reward_std": 0.08053719438612461,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8171601891517639,
"step": 28
},
{
"completion_length": 128.45833778381348,
"epoch": 0.06594656054576464,
"grad_norm": 8.270977020263672,
"kl": 0.0204315185546875,
"learning_rate": 1e-06,
"loss": 0.0008,
"reward": 1.840296596288681,
"reward_std": 0.0486476831138134,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8402965515851974,
"step": 29
},
{
"completion_length": 125.03125190734863,
"epoch": 0.06822057987492894,
"grad_norm": 8.62176513671875,
"kl": 0.01678466796875,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.8195985853672028,
"reward_std": 0.07711292989552021,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8300152719020844,
"step": 30
},
{
"completion_length": 133.76041793823242,
"epoch": 0.07049459920409323,
"grad_norm": 25.146360397338867,
"kl": 0.017852783203125,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.8119685649871826,
"reward_std": 0.15049411728978157,
"rewards/format_reward": 0.958333358168602,
"rewards/segmentation_reward": 0.853635236620903,
"step": 31
},
{
"completion_length": 127.13541984558105,
"epoch": 0.07276861853325753,
"grad_norm": 11.285983085632324,
"kl": 0.020233154296875,
"learning_rate": 1e-06,
"loss": 0.0008,
"reward": 1.8676734268665314,
"reward_std": 0.04771583795081824,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8780900835990906,
"step": 32
},
{
"completion_length": 130.3541717529297,
"epoch": 0.07504263786242182,
"grad_norm": 58.88550567626953,
"kl": 0.0172576904296875,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.797868400812149,
"reward_std": 0.0878910388564691,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8082851022481918,
"step": 33
},
{
"completion_length": 128.39583587646484,
"epoch": 0.07731665719158613,
"grad_norm": 13.709405899047852,
"kl": 0.022186279296875,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 1.8660337030887604,
"reward_std": 0.04777739220298827,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8660337030887604,
"step": 34
},
{
"completion_length": 127.22916984558105,
"epoch": 0.07959067652075043,
"grad_norm": 7.4637908935546875,
"kl": 0.018890380859375,
"learning_rate": 1e-06,
"loss": 0.0008,
"reward": 1.7917229533195496,
"reward_std": 0.0735958176665008,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8021395653486252,
"step": 35
},
{
"completion_length": 126.66666984558105,
"epoch": 0.08186469584991472,
"grad_norm": 8.999519348144531,
"kl": 0.02191162109375,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 1.7737447619438171,
"reward_std": 0.10960677545517683,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7945781201124191,
"step": 36
},
{
"completion_length": 125.78125190734863,
"epoch": 0.08413871517907902,
"grad_norm": 10.515003204345703,
"kl": 0.016815185546875,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.8419047594070435,
"reward_std": 0.03469831729307771,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8419047296047211,
"step": 37
},
{
"completion_length": 127.05208396911621,
"epoch": 0.08641273450824331,
"grad_norm": 25.71062660217285,
"kl": 0.020843505859375,
"learning_rate": 1e-06,
"loss": 0.0008,
"reward": 1.8053939044475555,
"reward_std": 0.0809064069762826,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8158105462789536,
"step": 38
},
{
"completion_length": 124.10416793823242,
"epoch": 0.08868675383740762,
"grad_norm": 11.600313186645508,
"kl": 0.0166168212890625,
"learning_rate": 1e-06,
"loss": 0.0007,
"reward": 1.856435239315033,
"reward_std": 0.07491008564829826,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8668518215417862,
"step": 39
},
{
"completion_length": 123.9687557220459,
"epoch": 0.09096077316657192,
"grad_norm": 10.360515594482422,
"kl": 0.022216796875,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 1.8669567108154297,
"reward_std": 0.01574411618639715,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8669566959142685,
"step": 40
},
{
"completion_length": 125.32292366027832,
"epoch": 0.09323479249573621,
"grad_norm": 9.638788223266602,
"kl": 0.022430419921875,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 1.8379901051521301,
"reward_std": 0.0877154991030693,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.8692402094602585,
"step": 41
},
{
"completion_length": 123.43750190734863,
"epoch": 0.09550881182490051,
"grad_norm": 11.703614234924316,
"kl": 0.0201416015625,
"learning_rate": 1e-06,
"loss": 0.0008,
"reward": 1.896949291229248,
"reward_std": 0.022165193455293775,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8969493061304092,
"step": 42
},
{
"completion_length": 121.48958969116211,
"epoch": 0.0977828311540648,
"grad_norm": 41.82573318481445,
"kl": 0.0218505859375,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 1.8304626643657684,
"reward_std": 0.08185502019478008,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8512959778308868,
"step": 43
},
{
"completion_length": 126.96875381469727,
"epoch": 0.10005685048322911,
"grad_norm": 9.426851272583008,
"kl": 0.02276611328125,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 1.8303613364696503,
"reward_std": 0.10597201343625784,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8511946946382523,
"step": 44
},
{
"completion_length": 119.44791984558105,
"epoch": 0.10233086981239341,
"grad_norm": 7.771895408630371,
"kl": 0.025543212890625,
"learning_rate": 1e-06,
"loss": 0.001,
"reward": 1.840701162815094,
"reward_std": 0.07642269739881158,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8511178195476532,
"step": 45
},
{
"completion_length": 125.02083587646484,
"epoch": 0.1046048891415577,
"grad_norm": 8.715611457824707,
"kl": 0.022735595703125,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 1.8369653820991516,
"reward_std": 0.03333436418324709,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.836965337395668,
"step": 46
},
{
"completion_length": 123.79166984558105,
"epoch": 0.106878908470722,
"grad_norm": 15.2009859085083,
"kl": 0.026092529296875,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 1.8661520779132843,
"reward_std": 0.019087713153567165,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8661520928144455,
"step": 47
},
{
"completion_length": 128.84375381469727,
"epoch": 0.10915292779988629,
"grad_norm": 14.767459869384766,
"kl": 0.026611328125,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 1.820468544960022,
"reward_std": 0.10572186904028058,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8413018435239792,
"step": 48
},
{
"completion_length": 122.78125190734863,
"epoch": 0.1114269471290506,
"grad_norm": 34.67581558227539,
"kl": 0.0291748046875,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.8499588370323181,
"reward_std": 0.06515861582010984,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8603754639625549,
"step": 49
},
{
"completion_length": 118.95833587646484,
"epoch": 0.1137009664582149,
"grad_norm": 23.052759170532227,
"kl": 0.0341796875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.851874828338623,
"reward_std": 0.058454849757254124,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8622915297746658,
"step": 50
},
{
"completion_length": 118.75000190734863,
"epoch": 0.11597498578737919,
"grad_norm": 523.3176879882812,
"kl": 0.0345458984375,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.835026115179062,
"reward_std": 0.0287266579689458,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8350260555744171,
"step": 51
},
{
"completion_length": 126.53125190734863,
"epoch": 0.11824900511654349,
"grad_norm": 10.816275596618652,
"kl": 0.040313720703125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.755786508321762,
"reward_std": 0.13628106890246272,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7870365083217621,
"step": 52
},
{
"completion_length": 125.81250190734863,
"epoch": 0.12052302444570778,
"grad_norm": 9.201781272888184,
"kl": 0.03826904296875,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.8181837797164917,
"reward_std": 0.06433252803981304,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8181838095188141,
"step": 53
},
{
"completion_length": 120.90625381469727,
"epoch": 0.12279704377487209,
"grad_norm": 15.950888633728027,
"kl": 0.033416748046875,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.8185763359069824,
"reward_std": 0.10011043888516724,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.839409664273262,
"step": 54
},
{
"completion_length": 125.96875381469727,
"epoch": 0.12507106310403637,
"grad_norm": 3.631739854812622,
"kl": 0.03045654296875,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.85866180062294,
"reward_std": 0.028809872455894947,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8586617559194565,
"step": 55
},
{
"completion_length": 120.3437557220459,
"epoch": 0.1273450824332007,
"grad_norm": 10.557564735412598,
"kl": 0.034942626953125,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.8380783200263977,
"reward_std": 0.07766020158305764,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8484949469566345,
"step": 56
},
{
"completion_length": 122.03125190734863,
"epoch": 0.129619101762365,
"grad_norm": 13.181836128234863,
"kl": 0.032989501953125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.8480607271194458,
"reward_std": 0.06112843842129223,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.858477458357811,
"step": 57
},
{
"completion_length": 128.13541984558105,
"epoch": 0.1318931210915293,
"grad_norm": 9.518472671508789,
"kl": 0.03125,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.8364209532737732,
"reward_std": 0.08701860439032316,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8468376249074936,
"step": 58
},
{
"completion_length": 121.95833778381348,
"epoch": 0.13416714042069358,
"grad_norm": 11.378217697143555,
"kl": 0.03680419921875,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.818740427494049,
"reward_std": 0.0671944273635745,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8395737260580063,
"step": 59
},
{
"completion_length": 121.41666984558105,
"epoch": 0.13644115974985788,
"grad_norm": 8.235391616821289,
"kl": 0.035858154296875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.8629357516765594,
"reward_std": 0.04245928302407265,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8629357665777206,
"step": 60
},
{
"completion_length": 121.90625190734863,
"epoch": 0.13871517907902217,
"grad_norm": 9.360280990600586,
"kl": 0.03076171875,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.8437756896018982,
"reward_std": 0.03801816503982991,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8437757194042206,
"step": 61
},
{
"completion_length": 122.83333587646484,
"epoch": 0.14098919840818647,
"grad_norm": 11.74299144744873,
"kl": 0.03961181640625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8107888400554657,
"reward_std": 0.09169099852442741,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8316220790147781,
"step": 62
},
{
"completion_length": 123.4062557220459,
"epoch": 0.14326321773735076,
"grad_norm": 10.003548622131348,
"kl": 0.0406494140625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8223926723003387,
"reward_std": 0.07860782567877322,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.832809329032898,
"step": 63
},
{
"completion_length": 124.19792175292969,
"epoch": 0.14553723706651506,
"grad_norm": 8.069796562194824,
"kl": 0.03680419921875,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.852884978055954,
"reward_std": 0.06025872565805912,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8528849929571152,
"step": 64
},
{
"completion_length": 121.30208587646484,
"epoch": 0.14781125639567935,
"grad_norm": 7.896605014801025,
"kl": 0.0350341796875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.840564340353012,
"reward_std": 0.06667589582502842,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8405643403530121,
"step": 65
},
{
"completion_length": 122.70833778381348,
"epoch": 0.15008527572484365,
"grad_norm": 6.732133865356445,
"kl": 0.034759521484375,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.8494722843170166,
"reward_std": 0.02315727563109249,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8494722992181778,
"step": 66
},
{
"completion_length": 125.42708587646484,
"epoch": 0.15235929505400797,
"grad_norm": 16.19176483154297,
"kl": 0.027984619140625,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 1.8468118906021118,
"reward_std": 0.03803225792944431,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.846811830997467,
"step": 67
},
{
"completion_length": 121.63541793823242,
"epoch": 0.15463331438317227,
"grad_norm": 8.936660766601562,
"kl": 0.030914306640625,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.8819967806339264,
"reward_std": 0.01786850136704743,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.881996750831604,
"step": 68
},
{
"completion_length": 128.8750057220459,
"epoch": 0.15690733371233656,
"grad_norm": 8.807208061218262,
"kl": 0.04010009765625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8393568098545074,
"reward_std": 0.02671552257379517,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8393567949533463,
"step": 69
},
{
"completion_length": 127.10417175292969,
"epoch": 0.15918135304150086,
"grad_norm": 9.790465354919434,
"kl": 0.032501220703125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.8589565753936768,
"reward_std": 0.07576595386490226,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8693732172250748,
"step": 70
},
{
"completion_length": 128.9895839691162,
"epoch": 0.16145537237066515,
"grad_norm": 132.23687744140625,
"kl": 0.03436279296875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.782070904970169,
"reward_std": 0.050592198269441724,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7820708751678467,
"step": 71
},
{
"completion_length": 135.29167556762695,
"epoch": 0.16372939169982945,
"grad_norm": 9.341756820678711,
"kl": 0.02923583984375,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.8484710454940796,
"reward_std": 0.03445305596687831,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.848471000790596,
"step": 72
},
{
"completion_length": 126.71875381469727,
"epoch": 0.16600341102899374,
"grad_norm": 9.518696784973145,
"kl": 0.029144287109375,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.8627153933048248,
"reward_std": 0.046468528802506626,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8731320649385452,
"step": 73
},
{
"completion_length": 127.64583969116211,
"epoch": 0.16827743035815804,
"grad_norm": 17.517793655395508,
"kl": 0.032470703125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.8466759324073792,
"reward_std": 0.027572712278924882,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8466758877038956,
"step": 74
},
{
"completion_length": 128.7291717529297,
"epoch": 0.17055144968732233,
"grad_norm": 8.034920692443848,
"kl": 0.03302001953125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.8586640655994415,
"reward_std": 0.028676262591034174,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.858664020895958,
"step": 75
},
{
"completion_length": 128.2187557220459,
"epoch": 0.17282546901648663,
"grad_norm": 9.858979225158691,
"kl": 0.030853271484375,
"learning_rate": 1e-06,
"loss": 0.0012,
"reward": 1.8392693102359772,
"reward_std": 0.050932126585394144,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8392692804336548,
"step": 76
},
{
"completion_length": 135.33333587646484,
"epoch": 0.17509948834565095,
"grad_norm": 10.723231315612793,
"kl": 0.03717041015625,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.864184021949768,
"reward_std": 0.021687635337002575,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8641840219497681,
"step": 77
},
{
"completion_length": 125.73958587646484,
"epoch": 0.17737350767481525,
"grad_norm": 7.419450283050537,
"kl": 0.04266357421875,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.8617435693740845,
"reward_std": 0.032619446399621665,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8617434948682785,
"step": 78
},
{
"completion_length": 135.34375381469727,
"epoch": 0.17964752700397954,
"grad_norm": 6.787795543670654,
"kl": 0.0318603515625,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.8563005030155182,
"reward_std": 0.01326985149353277,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.856300488114357,
"step": 79
},
{
"completion_length": 134.9791717529297,
"epoch": 0.18192154633314384,
"grad_norm": 12.446442604064941,
"kl": 0.03302001953125,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.8413802683353424,
"reward_std": 0.02990832203067839,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8413802236318588,
"step": 80
},
{
"completion_length": 131.05208587646484,
"epoch": 0.18419556566230813,
"grad_norm": 7.766218185424805,
"kl": 0.031982421875,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.8655352890491486,
"reward_std": 0.03306874120607972,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8655352592468262,
"step": 81
},
{
"completion_length": 130.08333778381348,
"epoch": 0.18646958499147243,
"grad_norm": 7.591801166534424,
"kl": 0.035125732421875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.8489001095294952,
"reward_std": 0.038488025951664895,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8593167364597321,
"step": 82
},
{
"completion_length": 127.83333587646484,
"epoch": 0.18874360432063672,
"grad_norm": 8.222685813903809,
"kl": 0.0341796875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.8844203352928162,
"reward_std": 0.03254084661602974,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.884420245885849,
"step": 83
},
{
"completion_length": 128.39583778381348,
"epoch": 0.19101762364980102,
"grad_norm": 16.960952758789062,
"kl": 0.0391845703125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8208307921886444,
"reward_std": 0.055154044879600406,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8312473893165588,
"step": 84
},
{
"completion_length": 132.9583396911621,
"epoch": 0.1932916429789653,
"grad_norm": 5.855716705322266,
"kl": 0.037109375,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.86465585231781,
"reward_std": 0.028004995780065656,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8646559119224548,
"step": 85
},
{
"completion_length": 130.6666717529297,
"epoch": 0.1955656623081296,
"grad_norm": 9.59054946899414,
"kl": 0.03985595703125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.827312707901001,
"reward_std": 0.08010158874094486,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8377293199300766,
"step": 86
},
{
"completion_length": 134.71875,
"epoch": 0.19783968163729393,
"grad_norm": 18.06949234008789,
"kl": 0.0369873046875,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.8673449456691742,
"reward_std": 0.04852295899763703,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8777615576982498,
"step": 87
},
{
"completion_length": 136.03125381469727,
"epoch": 0.20011370096645822,
"grad_norm": 7.5427937507629395,
"kl": 0.0423583984375,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.8519779443740845,
"reward_std": 0.06849909643642604,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8623945862054825,
"step": 88
},
{
"completion_length": 135.9479217529297,
"epoch": 0.20238772029562252,
"grad_norm": 12.732763290405273,
"kl": 0.03277587890625,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.8459124863147736,
"reward_std": 0.07695996854454279,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8563291132450104,
"step": 89
},
{
"completion_length": 134.12500381469727,
"epoch": 0.20466173962478681,
"grad_norm": 7.050863265991211,
"kl": 0.0347900390625,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.837898164987564,
"reward_std": 0.015308346832171082,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8378981947898865,
"step": 90
},
{
"completion_length": 133.28125381469727,
"epoch": 0.2069357589539511,
"grad_norm": 38.80855178833008,
"kl": 0.038330078125,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.863549381494522,
"reward_std": 0.03241122025065124,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8635492920875549,
"step": 91
},
{
"completion_length": 134.31250381469727,
"epoch": 0.2092097782831154,
"grad_norm": 12.962533950805664,
"kl": 0.03558349609375,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.8355351388454437,
"reward_std": 0.07070542359724641,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8459518104791641,
"step": 92
},
{
"completion_length": 132.1770896911621,
"epoch": 0.2114837976122797,
"grad_norm": 8.060502052307129,
"kl": 0.0416259765625,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.847087800502777,
"reward_std": 0.031698971055448055,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8470877408981323,
"step": 93
},
{
"completion_length": 135.55208587646484,
"epoch": 0.213757816941444,
"grad_norm": 14.366216659545898,
"kl": 0.04425048828125,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8385899066925049,
"reward_std": 0.06835441221483052,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8490065485239029,
"step": 94
},
{
"completion_length": 127.10416793823242,
"epoch": 0.2160318362706083,
"grad_norm": 26.700092315673828,
"kl": 0.0390625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8634757697582245,
"reward_std": 0.008791875996394083,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8634757250547409,
"step": 95
},
{
"completion_length": 129.25000381469727,
"epoch": 0.21830585559977259,
"grad_norm": 4.697238445281982,
"kl": 0.0408935546875,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8333858251571655,
"reward_std": 0.08207701286301017,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8438025414943695,
"step": 96
},
{
"completion_length": 139.67709350585938,
"epoch": 0.2205798749289369,
"grad_norm": 18.640583038330078,
"kl": 0.04376220703125,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.8212727904319763,
"reward_std": 0.10704211867414415,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8421061336994171,
"step": 97
},
{
"completion_length": 132.62500381469727,
"epoch": 0.2228538942581012,
"grad_norm": 10.532288551330566,
"kl": 0.0390625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.7843947410583496,
"reward_std": 0.07167254062369466,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.79481141269207,
"step": 98
},
{
"completion_length": 133.50000381469727,
"epoch": 0.2251279135872655,
"grad_norm": 10.3671875,
"kl": 0.037353515625,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.82876256108284,
"reward_std": 0.10020078788511455,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.849595919251442,
"step": 99
},
{
"completion_length": 131.75000381469727,
"epoch": 0.2274019329164298,
"grad_norm": 13.147160530090332,
"kl": 0.0452880859375,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8422828912734985,
"reward_std": 0.08142339263577014,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8631161749362946,
"step": 100
},
{
"completion_length": 129.8229217529297,
"epoch": 0.2296759522455941,
"grad_norm": 16.519821166992188,
"kl": 0.034637451171875,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.806572288274765,
"reward_std": 0.04188450565561652,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8065722435712814,
"step": 101
},
{
"completion_length": 129.39583778381348,
"epoch": 0.23194997157475838,
"grad_norm": 12.152297019958496,
"kl": 0.041015625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8586591482162476,
"reward_std": 0.06601439183577895,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8794925063848495,
"step": 102
},
{
"completion_length": 128.5000057220459,
"epoch": 0.23422399090392268,
"grad_norm": 8.767850875854492,
"kl": 0.03546142578125,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.7989584803581238,
"reward_std": 0.11034804070368409,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8197918385267258,
"step": 103
},
{
"completion_length": 129.6145839691162,
"epoch": 0.23649801023308697,
"grad_norm": 8.40770435333252,
"kl": 0.03826904296875,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.8321227729320526,
"reward_std": 0.06688260892406106,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8425393849611282,
"step": 104
},
{
"completion_length": 133.00000762939453,
"epoch": 0.23877202956225127,
"grad_norm": 66.30607604980469,
"kl": 0.05133056640625,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.765193372964859,
"reward_std": 0.14037537574768066,
"rewards/format_reward": 0.9583333432674408,
"rewards/segmentation_reward": 0.806860014796257,
"step": 105
},
{
"completion_length": 126.62500381469727,
"epoch": 0.24104604889141557,
"grad_norm": 10.172347068786621,
"kl": 0.0382080078125,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.8895207047462463,
"reward_std": 0.02245330944424495,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8895206451416016,
"step": 106
},
{
"completion_length": 135.06250762939453,
"epoch": 0.2433200682205799,
"grad_norm": 8.362109184265137,
"kl": 0.041168212890625,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.806322306394577,
"reward_std": 0.12358620949089527,
"rewards/format_reward": 0.96875,
"rewards/segmentation_reward": 0.8375722914934158,
"step": 107
},
{
"completion_length": 132.0104217529297,
"epoch": 0.24559408754974418,
"grad_norm": 8.960740089416504,
"kl": 0.040069580078125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8282636106014252,
"reward_std": 0.08110124431550503,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8386802226305008,
"step": 108
},
{
"completion_length": 134.71875381469727,
"epoch": 0.24786810687890848,
"grad_norm": 20.807939529418945,
"kl": 0.0418701171875,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8370684087276459,
"reward_std": 0.03879292996134609,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8474850803613663,
"step": 109
},
{
"completion_length": 132.73958587646484,
"epoch": 0.25014212620807275,
"grad_norm": 19.63970184326172,
"kl": 0.04046630859375,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8565462529659271,
"reward_std": 0.05808442225679755,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8669629096984863,
"step": 110
},
{
"completion_length": 131.5104217529297,
"epoch": 0.25241614553723707,
"grad_norm": 13.530229568481445,
"kl": 0.03961181640625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.869715929031372,
"reward_std": 0.028540480285300873,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8697158843278885,
"step": 111
},
{
"completion_length": 136.64583778381348,
"epoch": 0.2546901648664014,
"grad_norm": 27.007661819458008,
"kl": 0.039306640625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.7972800433635712,
"reward_std": 0.09525090921670198,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8181134164333344,
"step": 112
},
{
"completion_length": 135.8645896911621,
"epoch": 0.25696418419556566,
"grad_norm": 10.786030769348145,
"kl": 0.03765869140625,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.8435862362384796,
"reward_std": 0.07883737958036363,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8540029078722,
"step": 113
},
{
"completion_length": 126.04166984558105,
"epoch": 0.25923820352473,
"grad_norm": 9.88869857788086,
"kl": 0.0394287109375,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8222769498825073,
"reward_std": 0.07929788623005152,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8222769200801849,
"step": 114
},
{
"completion_length": 131.81250381469727,
"epoch": 0.26151222285389425,
"grad_norm": 45.116451263427734,
"kl": 0.04217529296875,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.8684912621974945,
"reward_std": 0.04528397601097822,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8684912025928497,
"step": 115
},
{
"completion_length": 131.7291717529297,
"epoch": 0.2637862421830586,
"grad_norm": 6.04058313369751,
"kl": 0.047119140625,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.7711567878723145,
"reward_std": 0.1284920796751976,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.7919900417327881,
"step": 116
},
{
"completion_length": 127.82292175292969,
"epoch": 0.26606026151222284,
"grad_norm": 9.686407089233398,
"kl": 0.0443115234375,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8393816649913788,
"reward_std": 0.07739171921275556,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8497983366250992,
"step": 117
},
{
"completion_length": 126.60416984558105,
"epoch": 0.26833428084138716,
"grad_norm": 13.846648216247559,
"kl": 0.069580078125,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.7880859076976776,
"reward_std": 0.1653798259794712,
"rewards/format_reward": 0.958333358168602,
"rewards/segmentation_reward": 0.8297525644302368,
"step": 118
},
{
"completion_length": 125.22916793823242,
"epoch": 0.27060830017055143,
"grad_norm": 14.052881240844727,
"kl": 0.0400390625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8796150386333466,
"reward_std": 0.028668402694165707,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.879614993929863,
"step": 119
},
{
"completion_length": 120.08333778381348,
"epoch": 0.27288231949971575,
"grad_norm": 6.430720806121826,
"kl": 0.04559326171875,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8220676481723785,
"reward_std": 0.08830677217338234,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8429009765386581,
"step": 120
},
{
"completion_length": 124.82291984558105,
"epoch": 0.27515633882888,
"grad_norm": 19.418067932128906,
"kl": 0.04193115234375,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.8649601340293884,
"reward_std": 0.02245999814476818,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8649601340293884,
"step": 121
},
{
"completion_length": 130.38541984558105,
"epoch": 0.27743035815804434,
"grad_norm": 32.728607177734375,
"kl": 0.0396728515625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8011479675769806,
"reward_std": 0.10339335759636015,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.821981281042099,
"step": 122
},
{
"completion_length": 131.51041984558105,
"epoch": 0.27970437748720867,
"grad_norm": 10.91059398651123,
"kl": 0.0413818359375,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.838850736618042,
"reward_std": 0.02782218554057181,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.838850736618042,
"step": 123
},
{
"completion_length": 125.80208778381348,
"epoch": 0.28197839681637293,
"grad_norm": 8.754847526550293,
"kl": 0.0377197265625,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.8477693796157837,
"reward_std": 0.04511198558611795,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8581860810518265,
"step": 124
},
{
"completion_length": 131.50000190734863,
"epoch": 0.28425241614553726,
"grad_norm": 41.41383743286133,
"kl": 0.03546142578125,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.8574239313602448,
"reward_std": 0.02917448477819562,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8574239611625671,
"step": 125
},
{
"completion_length": 124.41666984558105,
"epoch": 0.2865264354747015,
"grad_norm": 8.908427238464355,
"kl": 0.041259765625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8553960621356964,
"reward_std": 0.06732171808835119,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8658127784729004,
"step": 126
},
{
"completion_length": 138.18750381469727,
"epoch": 0.28880045480386585,
"grad_norm": 15.392871856689453,
"kl": 0.033538818359375,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.8320258855819702,
"reward_std": 0.06953636615071446,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8424425423145294,
"step": 127
},
{
"completion_length": 128.70833778381348,
"epoch": 0.2910744741330301,
"grad_norm": 5.235471248626709,
"kl": 0.03948974609375,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.823939561843872,
"reward_std": 0.10366934072226286,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8447728455066681,
"step": 128
},
{
"completion_length": 129.0625057220459,
"epoch": 0.29334849346219444,
"grad_norm": 18.025480270385742,
"kl": 0.0374755859375,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.8658169209957123,
"reward_std": 0.017597037134692073,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8658169209957123,
"step": 129
},
{
"completion_length": 132.44792366027832,
"epoch": 0.2956225127913587,
"grad_norm": 10.100214004516602,
"kl": 0.03912353515625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.819077491760254,
"reward_std": 0.07401184504851699,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8294941633939743,
"step": 130
},
{
"completion_length": 131.10417556762695,
"epoch": 0.297896532120523,
"grad_norm": 10.517773628234863,
"kl": 0.03955078125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8419454395771027,
"reward_std": 0.06657789507880807,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8523620814085007,
"step": 131
},
{
"completion_length": 131.1250057220459,
"epoch": 0.3001705514496873,
"grad_norm": 15.198212623596191,
"kl": 0.033355712890625,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 1.861166775226593,
"reward_std": 0.02897452423349023,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8611667454242706,
"step": 132
},
{
"completion_length": 129.7708396911621,
"epoch": 0.3024445707788516,
"grad_norm": 10.940967559814453,
"kl": 0.03472900390625,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 1.8530578315258026,
"reward_std": 0.0734487110748887,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.873891144990921,
"step": 133
},
{
"completion_length": 127.71875381469727,
"epoch": 0.30471859010801594,
"grad_norm": 7.945354461669922,
"kl": 0.0478515625,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.7999140620231628,
"reward_std": 0.06981736817397177,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8207473605871201,
"step": 134
},
{
"completion_length": 133.4791717529297,
"epoch": 0.3069926094371802,
"grad_norm": 8.708114624023438,
"kl": 0.041259765625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8323033154010773,
"reward_std": 0.07405243627727032,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8427200168371201,
"step": 135
},
{
"completion_length": 129.3541717529297,
"epoch": 0.30926662876634453,
"grad_norm": 39.540550231933594,
"kl": 0.0394287109375,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8501177728176117,
"reward_std": 0.06709112878888845,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8605344444513321,
"step": 136
},
{
"completion_length": 124.79166793823242,
"epoch": 0.3115406480955088,
"grad_norm": 6.668416976928711,
"kl": 0.0390625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8371129930019379,
"reward_std": 0.05754397192504257,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8475296497344971,
"step": 137
},
{
"completion_length": 132.17708587646484,
"epoch": 0.3138146674246731,
"grad_norm": 18.357772827148438,
"kl": 0.03692626953125,
"learning_rate": 1e-06,
"loss": 0.0015,
"reward": 1.8030109107494354,
"reward_std": 0.04487704858183861,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.803010955452919,
"step": 138
},
{
"completion_length": 123.05208969116211,
"epoch": 0.3160886867538374,
"grad_norm": 12.917668342590332,
"kl": 0.04083251953125,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.819540798664093,
"reward_std": 0.04616073609213345,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8299574255943298,
"step": 139
},
{
"completion_length": 128.93750381469727,
"epoch": 0.3183627060830017,
"grad_norm": 8.16073226928711,
"kl": 0.03961181640625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.869756668806076,
"reward_std": 0.023664554115384817,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8697566390037537,
"step": 140
},
{
"completion_length": 130.42708587646484,
"epoch": 0.320636725412166,
"grad_norm": 19.300662994384766,
"kl": 0.04595947265625,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8823592960834503,
"reward_std": 0.027183939702808857,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8823592811822891,
"step": 141
},
{
"completion_length": 121.57292175292969,
"epoch": 0.3229107447413303,
"grad_norm": 8.414497375488281,
"kl": 0.050537109375,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8237576484680176,
"reward_std": 0.06241214391775429,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8341742604970932,
"step": 142
},
{
"completion_length": 127.01041984558105,
"epoch": 0.3251847640704946,
"grad_norm": 7.633127689361572,
"kl": 0.041259765625,
"learning_rate": 1e-06,
"loss": 0.0016,
"reward": 1.8560876250267029,
"reward_std": 0.04744653377565555,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8665042370557785,
"step": 143
},
{
"completion_length": 125.28125190734863,
"epoch": 0.3274587833996589,
"grad_norm": 13.39786148071289,
"kl": 0.0440673828125,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8448957204818726,
"reward_std": 0.017382028454449028,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8448957204818726,
"step": 144
},
{
"completion_length": 120.63541984558105,
"epoch": 0.3297328027288232,
"grad_norm": 10.50205135345459,
"kl": 0.0458984375,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.838788241147995,
"reward_std": 0.08608005382120609,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8492048978805542,
"step": 145
},
{
"completion_length": 119.38541793823242,
"epoch": 0.3320068220579875,
"grad_norm": 5.443851470947266,
"kl": 0.04302978515625,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.8452240228652954,
"reward_std": 0.05149654616252519,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.855640709400177,
"step": 146
},
{
"completion_length": 122.40625190734863,
"epoch": 0.3342808413871518,
"grad_norm": 6.090450286865234,
"kl": 0.0438232421875,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.856435388326645,
"reward_std": 0.04737340519204736,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8564353585243225,
"step": 147
},
{
"completion_length": 119.72916984558105,
"epoch": 0.3365548607163161,
"grad_norm": 6.615400314331055,
"kl": 0.0478515625,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.826998233795166,
"reward_std": 0.06745404587127268,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8374148905277252,
"step": 148
},
{
"completion_length": 126.30208778381348,
"epoch": 0.3388288800454804,
"grad_norm": 12.282633781433105,
"kl": 0.057373046875,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8556068539619446,
"reward_std": 0.02449450278072618,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8556068539619446,
"step": 149
},
{
"completion_length": 125.31250381469727,
"epoch": 0.34110289937464466,
"grad_norm": 6.970208644866943,
"kl": 0.0443115234375,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8693890571594238,
"reward_std": 0.01952762738801539,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.869389072060585,
"step": 150
},
{
"completion_length": 125.59375,
"epoch": 0.343376918703809,
"grad_norm": 45.453697204589844,
"kl": 0.0531005859375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8583467900753021,
"reward_std": 0.051462399773299694,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8687634319067001,
"step": 151
},
{
"completion_length": 124.11458587646484,
"epoch": 0.34565093803297325,
"grad_norm": 20.52977752685547,
"kl": 0.0479736328125,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.8123543560504913,
"reward_std": 0.0828116275370121,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8227709978818893,
"step": 152
},
{
"completion_length": 118.43750190734863,
"epoch": 0.3479249573621376,
"grad_norm": 11.727397918701172,
"kl": 0.04229736328125,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.867412656545639,
"reward_std": 0.025605608825571835,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8674126118421555,
"step": 153
},
{
"completion_length": 124.67708587646484,
"epoch": 0.3501989766913019,
"grad_norm": 11.057373046875,
"kl": 0.0531005859375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.869041621685028,
"reward_std": 0.026430562138557434,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8690416067838669,
"step": 154
},
{
"completion_length": 129.88541984558105,
"epoch": 0.35247299602046617,
"grad_norm": 9.447416305541992,
"kl": 0.04443359375,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.7960728704929352,
"reward_std": 0.06973322853446007,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8064895123243332,
"step": 155
},
{
"completion_length": 126.4687557220459,
"epoch": 0.3547470153496305,
"grad_norm": 25.426660537719727,
"kl": 0.19091796875,
"learning_rate": 1e-06,
"loss": 0.0076,
"reward": 1.8375684916973114,
"reward_std": 0.05996000056620687,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8479850590229034,
"step": 156
},
{
"completion_length": 123.61458587646484,
"epoch": 0.35702103467879476,
"grad_norm": 7.227417469024658,
"kl": 0.04864501953125,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8216514885425568,
"reward_std": 0.07541643898002803,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8320681154727936,
"step": 157
},
{
"completion_length": 118.41666984558105,
"epoch": 0.3592950540079591,
"grad_norm": 7.97116756439209,
"kl": 0.07366943359375,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.86759752035141,
"reward_std": 0.014220859797205776,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8675975650548935,
"step": 158
},
{
"completion_length": 127.57291984558105,
"epoch": 0.36156907333712335,
"grad_norm": 9.09946346282959,
"kl": 0.05206298828125,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8287563920021057,
"reward_std": 0.02918955238419585,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8287564218044281,
"step": 159
},
{
"completion_length": 127.68750381469727,
"epoch": 0.36384309266628767,
"grad_norm": 37.89695358276367,
"kl": 0.0693359375,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.8446174263954163,
"reward_std": 0.03227622219128534,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8446174561977386,
"step": 160
},
{
"completion_length": 123.51041984558105,
"epoch": 0.36611711199545194,
"grad_norm": 10.888017654418945,
"kl": 0.0460205078125,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8628927171230316,
"reward_std": 0.022265097475610673,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8628927171230316,
"step": 161
},
{
"completion_length": 123.52083587646484,
"epoch": 0.36839113132461626,
"grad_norm": 31.744691848754883,
"kl": 0.0421142578125,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 1.8360488712787628,
"reward_std": 0.04995149944443256,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8464655578136444,
"step": 162
},
{
"completion_length": 124.39583778381348,
"epoch": 0.3706651506537806,
"grad_norm": 119.80357360839844,
"kl": 0.0450439453125,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.870296448469162,
"reward_std": 0.02627503650728613,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8702964633703232,
"step": 163
},
{
"completion_length": 126.23958587646484,
"epoch": 0.37293916998294485,
"grad_norm": 13.347270011901855,
"kl": 0.0462646484375,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8231273889541626,
"reward_std": 0.027737511321902275,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.823127418756485,
"step": 164
},
{
"completion_length": 121.46875381469727,
"epoch": 0.3752131893121092,
"grad_norm": 6.548112869262695,
"kl": 0.046142578125,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8643255233764648,
"reward_std": 0.020169232942862436,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8643255233764648,
"step": 165
},
{
"completion_length": 122.41666984558105,
"epoch": 0.37748720864127344,
"grad_norm": 6.858768463134766,
"kl": 0.04974365234375,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8725146353244781,
"reward_std": 0.01943917891185265,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.872514620423317,
"step": 166
},
{
"completion_length": 131.04166984558105,
"epoch": 0.37976122797043776,
"grad_norm": 8.755220413208008,
"kl": 0.044677734375,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8255141377449036,
"reward_std": 0.06648333976045251,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.835930809378624,
"step": 167
},
{
"completion_length": 126.77083778381348,
"epoch": 0.38203524729960203,
"grad_norm": 13.69814682006836,
"kl": 0.04913330078125,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8552350401878357,
"reward_std": 0.055759434937499464,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8656516671180725,
"step": 168
},
{
"completion_length": 123.25,
"epoch": 0.38430926662876636,
"grad_norm": 9.107439994812012,
"kl": 0.04638671875,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.869709074497223,
"reward_std": 0.008492362350807525,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8697090148925781,
"step": 169
},
{
"completion_length": 128.6145896911621,
"epoch": 0.3865832859579306,
"grad_norm": 11.448233604431152,
"kl": 0.04742431640625,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.8550761342048645,
"reward_std": 0.023954114876687527,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8550761044025421,
"step": 170
},
{
"completion_length": 126.64583396911621,
"epoch": 0.38885730528709495,
"grad_norm": 10.13017463684082,
"kl": 0.04632568359375,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.8708954453468323,
"reward_std": 0.03041057422524318,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8708954006433487,
"step": 171
},
{
"completion_length": 127.83333396911621,
"epoch": 0.3911313246162592,
"grad_norm": 22.73763084411621,
"kl": 0.04736328125,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.8603352308273315,
"reward_std": 0.02450424269773066,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8603352308273315,
"step": 172
},
{
"completion_length": 124.68750381469727,
"epoch": 0.39340534394542354,
"grad_norm": 9.25720500946045,
"kl": 0.04620361328125,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.812343418598175,
"reward_std": 0.02242008870234713,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8123434334993362,
"step": 173
},
{
"completion_length": 124.85417175292969,
"epoch": 0.39567936327458786,
"grad_norm": 15.994894981384277,
"kl": 0.0506591796875,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8353284001350403,
"reward_std": 0.0779900832567364,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8457450717687607,
"step": 174
},
{
"completion_length": 130.52083778381348,
"epoch": 0.3979533826037521,
"grad_norm": 21.059858322143555,
"kl": 0.0555419921875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.7730466425418854,
"reward_std": 0.08199177589267492,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.783463254570961,
"step": 175
},
{
"completion_length": 130.9062557220459,
"epoch": 0.40022740193291645,
"grad_norm": 28.807546615600586,
"kl": 0.04840087890625,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.8387254476547241,
"reward_std": 0.05101281497627497,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8491421639919281,
"step": 176
},
{
"completion_length": 132.1770896911621,
"epoch": 0.4025014212620807,
"grad_norm": 8.303629875183105,
"kl": 0.0477294921875,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.8762327134609222,
"reward_std": 0.021599826373858377,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8762326389551163,
"step": 177
},
{
"completion_length": 130.45834159851074,
"epoch": 0.40477544059124504,
"grad_norm": 5.3660359382629395,
"kl": 0.04571533203125,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 1.8499539494514465,
"reward_std": 0.0204410245642066,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.849953904747963,
"step": 178
},
{
"completion_length": 130.28125190734863,
"epoch": 0.4070494599204093,
"grad_norm": 9.386882781982422,
"kl": 0.0966796875,
"learning_rate": 1e-06,
"loss": 0.0039,
"reward": 1.824854463338852,
"reward_std": 0.029640484135597944,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8248543739318848,
"step": 179
},
{
"completion_length": 125.65625,
"epoch": 0.40932347924957363,
"grad_norm": 10.826035499572754,
"kl": 0.05224609375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8699792921543121,
"reward_std": 0.05197575513739139,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8803958892822266,
"step": 180
},
{
"completion_length": 134.8750057220459,
"epoch": 0.4115974985787379,
"grad_norm": 9.238359451293945,
"kl": 0.05242919921875,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.881340116262436,
"reward_std": 0.020922310650348663,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8813401609659195,
"step": 181
},
{
"completion_length": 127.14583778381348,
"epoch": 0.4138715179079022,
"grad_norm": 5.321325302124023,
"kl": 0.04754638671875,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.850901871919632,
"reward_std": 0.0263338660588488,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.850901871919632,
"step": 182
},
{
"completion_length": 127.5312557220459,
"epoch": 0.4161455372370665,
"grad_norm": 27.181861877441406,
"kl": 0.0631103515625,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8269298076629639,
"reward_std": 0.08564014174044132,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8373464494943619,
"step": 183
},
{
"completion_length": 124.05208587646484,
"epoch": 0.4184195565662308,
"grad_norm": 7.739187717437744,
"kl": 0.05078125,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8764528036117554,
"reward_std": 0.0346121295588091,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8764527887105942,
"step": 184
},
{
"completion_length": 127.40625190734863,
"epoch": 0.42069357589539513,
"grad_norm": 5.463039398193359,
"kl": 0.0523681640625,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8427515029907227,
"reward_std": 0.03878836310468614,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8427514582872391,
"step": 185
},
{
"completion_length": 127.60416984558105,
"epoch": 0.4229675952245594,
"grad_norm": 6.220778465270996,
"kl": 0.04833984375,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.839966058731079,
"reward_std": 0.0670549722854048,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8503826707601547,
"step": 186
},
{
"completion_length": 128.42708587646484,
"epoch": 0.4252416145537237,
"grad_norm": 7.694529056549072,
"kl": 0.0487060546875,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.80168816447258,
"reward_std": 0.09221077559050173,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8225214183330536,
"step": 187
},
{
"completion_length": 123.45833587646484,
"epoch": 0.427515633882888,
"grad_norm": 8.434104919433594,
"kl": 0.0533447265625,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8583765625953674,
"reward_std": 0.027193676389288157,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.858376607298851,
"step": 188
},
{
"completion_length": 121.63541793823242,
"epoch": 0.4297896532120523,
"grad_norm": 12.074097633361816,
"kl": 0.06329345703125,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8214238584041595,
"reward_std": 0.06923552230000496,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8214238435029984,
"step": 189
},
{
"completion_length": 124.48958587646484,
"epoch": 0.4320636725412166,
"grad_norm": 12.668107986450195,
"kl": 0.0615234375,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8321338295936584,
"reward_std": 0.042182555072940886,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.84255051612854,
"step": 190
},
{
"completion_length": 128.0208339691162,
"epoch": 0.4343376918703809,
"grad_norm": 9.82055950164795,
"kl": 0.06005859375,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.81814506649971,
"reward_std": 0.06698361551389098,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8181450814008713,
"step": 191
},
{
"completion_length": 120.97916793823242,
"epoch": 0.43661171119954517,
"grad_norm": 16.151336669921875,
"kl": 0.072265625,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.8266111612319946,
"reward_std": 0.09967668447643518,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8370277732610703,
"step": 192
},
{
"completion_length": 122.00000381469727,
"epoch": 0.4388857305287095,
"grad_norm": 7.378772735595703,
"kl": 0.06793212890625,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.807835042476654,
"reward_std": 0.12562582828104496,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8286683708429337,
"step": 193
},
{
"completion_length": 123.76042175292969,
"epoch": 0.4411597498578738,
"grad_norm": 26.526634216308594,
"kl": 0.0714111328125,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.854819893836975,
"reward_std": 0.039443244226276875,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8548198789358139,
"step": 194
},
{
"completion_length": 121.32291984558105,
"epoch": 0.4434337691870381,
"grad_norm": 5.977930068969727,
"kl": 0.070068359375,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.7950571477413177,
"reward_std": 0.11713728122413158,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.815890446305275,
"step": 195
},
{
"completion_length": 129.34375190734863,
"epoch": 0.4457077885162024,
"grad_norm": 70.88825225830078,
"kl": 0.06866455078125,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8642869293689728,
"reward_std": 0.026344751473516226,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8642869293689728,
"step": 196
},
{
"completion_length": 128.20833778381348,
"epoch": 0.4479818078453667,
"grad_norm": 5.539989948272705,
"kl": 0.0606689453125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.789596974849701,
"reward_std": 0.11505167232826352,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8104302436113358,
"step": 197
},
{
"completion_length": 120.04166984558105,
"epoch": 0.450255827174531,
"grad_norm": 16.326969146728516,
"kl": 0.071533203125,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.8599906861782074,
"reward_std": 0.05113175604492426,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8704073280096054,
"step": 198
},
{
"completion_length": 121.82291793823242,
"epoch": 0.45252984650369527,
"grad_norm": 9.708488464355469,
"kl": 0.0889892578125,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.8156112134456635,
"reward_std": 0.06598617471172474,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8260278552770615,
"step": 199
},
{
"completion_length": 123.58333396911621,
"epoch": 0.4548038658328596,
"grad_norm": 5.935286998748779,
"kl": 0.078857421875,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.8384647369384766,
"reward_std": 0.06912496162112802,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.848881334066391,
"step": 200
},
{
"completion_length": 119.64583778381348,
"epoch": 0.45707788516202386,
"grad_norm": 8.705190658569336,
"kl": 0.06341552734375,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.848078191280365,
"reward_std": 0.03278558413148858,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8480781614780426,
"step": 201
},
{
"completion_length": 122.05208396911621,
"epoch": 0.4593519044911882,
"grad_norm": 4.539862632751465,
"kl": 0.06060791015625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8337964713573456,
"reward_std": 0.09542246071214322,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8546297699213028,
"step": 202
},
{
"completion_length": 119.18750381469727,
"epoch": 0.46162592382035245,
"grad_norm": 20.850038528442383,
"kl": 0.060546875,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8311468660831451,
"reward_std": 0.0869890945032239,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8415635526180267,
"step": 203
},
{
"completion_length": 118.44791793823242,
"epoch": 0.46389994314951677,
"grad_norm": 9.465960502624512,
"kl": 0.05963134765625,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8948685824871063,
"reward_std": 0.006126068299636245,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8948685228824615,
"step": 204
},
{
"completion_length": 124.72916793823242,
"epoch": 0.4661739624786811,
"grad_norm": 19.61353302001953,
"kl": 0.05328369140625,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8210335075855255,
"reward_std": 0.03609966021031141,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8210335075855255,
"step": 205
},
{
"completion_length": 123.12500381469727,
"epoch": 0.46844798180784536,
"grad_norm": 6.793501377105713,
"kl": 0.0574951171875,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.826656460762024,
"reward_std": 0.05289078433997929,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8370731472969055,
"step": 206
},
{
"completion_length": 122.26041984558105,
"epoch": 0.4707220011370097,
"grad_norm": 5.112701892852783,
"kl": 0.052734375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.807763934135437,
"reward_std": 0.06750181829556823,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8181806355714798,
"step": 207
},
{
"completion_length": 121.22916984558105,
"epoch": 0.47299602046617395,
"grad_norm": 6.380675315856934,
"kl": 0.0511474609375,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8463412821292877,
"reward_std": 0.03807840694207698,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8463412821292877,
"step": 208
},
{
"completion_length": 121.06250190734863,
"epoch": 0.4752700397953383,
"grad_norm": 24.312280654907227,
"kl": 0.05584716796875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8374760746955872,
"reward_std": 0.05966222519055009,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8478926569223404,
"step": 209
},
{
"completion_length": 121.94792175292969,
"epoch": 0.47754405912450254,
"grad_norm": 7.5535478591918945,
"kl": 0.05242919921875,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8417306244373322,
"reward_std": 0.07789468741975725,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8521473109722137,
"step": 210
},
{
"completion_length": 120.58333587646484,
"epoch": 0.47981807845366686,
"grad_norm": 24.363685607910156,
"kl": 0.05023193359375,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8678061068058014,
"reward_std": 0.02573518455028534,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8678060472011566,
"step": 211
},
{
"completion_length": 125.44792175292969,
"epoch": 0.48209209778283113,
"grad_norm": 8.597858428955078,
"kl": 0.0648193359375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8275066912174225,
"reward_std": 0.09166095149703324,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8483400493860245,
"step": 212
},
{
"completion_length": 128.0312557220459,
"epoch": 0.48436611711199545,
"grad_norm": 11.972718238830566,
"kl": 0.057373046875,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8702231347560883,
"reward_std": 0.019674736773595214,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8702231794595718,
"step": 213
},
{
"completion_length": 124.14583587646484,
"epoch": 0.4866401364411598,
"grad_norm": 8.804828643798828,
"kl": 0.0577392578125,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8843638896942139,
"reward_std": 0.012492099194787443,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8843639045953751,
"step": 214
},
{
"completion_length": 123.95833587646484,
"epoch": 0.48891415577032404,
"grad_norm": 5.491071701049805,
"kl": 0.05499267578125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8558919131755829,
"reward_std": 0.0653155903564766,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8663085699081421,
"step": 215
},
{
"completion_length": 128.31250381469727,
"epoch": 0.49118817509948837,
"grad_norm": 6.622015476226807,
"kl": 0.055419921875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8662042915821075,
"reward_std": 0.010809883824549615,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8662042170763016,
"step": 216
},
{
"completion_length": 122.77083778381348,
"epoch": 0.49346219442865263,
"grad_norm": 6.790284156799316,
"kl": 0.05340576171875,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8543438911437988,
"reward_std": 0.05247529596090317,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8647605180740356,
"step": 217
},
{
"completion_length": 123.45833587646484,
"epoch": 0.49573621375781696,
"grad_norm": 13.34268569946289,
"kl": 0.05609130859375,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8597874641418457,
"reward_std": 0.03391414089128375,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8597874045372009,
"step": 218
},
{
"completion_length": 124.67708778381348,
"epoch": 0.4980102330869812,
"grad_norm": 8.240407943725586,
"kl": 0.058349609375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8817353248596191,
"reward_std": 0.012413767748512328,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.881735309958458,
"step": 219
},
{
"completion_length": 124.67708587646484,
"epoch": 0.5002842524161455,
"grad_norm": 17.483158111572266,
"kl": 0.06695556640625,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8380079865455627,
"reward_std": 0.07233457683469169,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.848424643278122,
"step": 220
},
{
"completion_length": 122.06250381469727,
"epoch": 0.5025582717453099,
"grad_norm": 7.1976494789123535,
"kl": 0.0635986328125,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8233891129493713,
"reward_std": 0.08511380999698304,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8442224413156509,
"step": 221
},
{
"completion_length": 124.62500190734863,
"epoch": 0.5048322910744741,
"grad_norm": 11.14084243774414,
"kl": 0.05706787109375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8576789200305939,
"reward_std": 0.04037183988839388,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8576788157224655,
"step": 222
},
{
"completion_length": 127.32292366027832,
"epoch": 0.5071063104036384,
"grad_norm": 5.903656005859375,
"kl": 0.0548095703125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8232994377613068,
"reward_std": 0.04908563965000212,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8232994079589844,
"step": 223
},
{
"completion_length": 126.42708587646484,
"epoch": 0.5093803297328028,
"grad_norm": 15.478821754455566,
"kl": 0.05560302734375,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8385266661643982,
"reward_std": 0.030962634249590337,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.838526651263237,
"step": 224
},
{
"completion_length": 133.68750381469727,
"epoch": 0.511654349061967,
"grad_norm": 115.98380279541016,
"kl": 0.05615234375,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.7851110100746155,
"reward_std": 0.10919084469787776,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.8163610249757767,
"step": 225
},
{
"completion_length": 135.8541717529297,
"epoch": 0.5139283683911313,
"grad_norm": 11.387409210205078,
"kl": 0.05108642578125,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8544709980487823,
"reward_std": 0.05253174444078468,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8648876249790192,
"step": 226
},
{
"completion_length": 135.78125381469727,
"epoch": 0.5162023877202956,
"grad_norm": 5.589237689971924,
"kl": 0.05682373046875,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8480293452739716,
"reward_std": 0.04187517584068701,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8584460020065308,
"step": 227
},
{
"completion_length": 129.57291793823242,
"epoch": 0.51847640704946,
"grad_norm": 6.878526210784912,
"kl": 0.06463623046875,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.797906219959259,
"reward_std": 0.09693466546013951,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8187395334243774,
"step": 228
},
{
"completion_length": 133.1145896911621,
"epoch": 0.5207504263786242,
"grad_norm": 11.986654281616211,
"kl": 0.05206298828125,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8702963292598724,
"reward_std": 0.046940833679400384,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8807128965854645,
"step": 229
},
{
"completion_length": 137.68750762939453,
"epoch": 0.5230244457077885,
"grad_norm": 4.170595169067383,
"kl": 0.05438232421875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8651223182678223,
"reward_std": 0.01505957031622529,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8651222884654999,
"step": 230
},
{
"completion_length": 134.43750381469727,
"epoch": 0.5252984650369528,
"grad_norm": 8.454867362976074,
"kl": 0.060546875,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8617721498012543,
"reward_std": 0.004831298429053277,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8617721498012543,
"step": 231
},
{
"completion_length": 130.88541793823242,
"epoch": 0.5275724843661171,
"grad_norm": 7.359889507293701,
"kl": 0.05206298828125,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.875521332025528,
"reward_std": 0.03221741976449266,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8755213916301727,
"step": 232
},
{
"completion_length": 137.62500381469727,
"epoch": 0.5298465036952814,
"grad_norm": 4.686023712158203,
"kl": 0.0633544921875,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.836742788553238,
"reward_std": 0.05397877559880726,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8471594154834747,
"step": 233
},
{
"completion_length": 133.71875762939453,
"epoch": 0.5321205230244457,
"grad_norm": 8.684386253356934,
"kl": 0.05889892578125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8528975546360016,
"reward_std": 0.013704222801607102,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8528975248336792,
"step": 234
},
{
"completion_length": 138.2395896911621,
"epoch": 0.5343945423536101,
"grad_norm": 27.199432373046875,
"kl": 0.060302734375,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.819983571767807,
"reward_std": 0.07903883041581139,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.830400213599205,
"step": 235
},
{
"completion_length": 137.5729217529297,
"epoch": 0.5366685616827743,
"grad_norm": 7.134744644165039,
"kl": 0.060791015625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.848281979560852,
"reward_std": 0.015959581825882196,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8482819348573685,
"step": 236
},
{
"completion_length": 132.91667366027832,
"epoch": 0.5389425810119386,
"grad_norm": 10.887066841125488,
"kl": 0.060791015625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8350262939929962,
"reward_std": 0.06616777507588267,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8454429060220718,
"step": 237
},
{
"completion_length": 137.92708587646484,
"epoch": 0.5412166003411029,
"grad_norm": 30.873247146606445,
"kl": 0.06396484375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8556478321552277,
"reward_std": 0.05000708991428837,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8660645484924316,
"step": 238
},
{
"completion_length": 142.15625762939453,
"epoch": 0.5434906196702672,
"grad_norm": 5.460704326629639,
"kl": 0.0665283203125,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8754114508628845,
"reward_std": 0.027424399624578655,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8754114210605621,
"step": 239
},
{
"completion_length": 138.1875057220459,
"epoch": 0.5457646389994315,
"grad_norm": 4.744943618774414,
"kl": 0.071044921875,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.874392807483673,
"reward_std": 0.012488734326325357,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8743928372859955,
"step": 240
},
{
"completion_length": 140.36458587646484,
"epoch": 0.5480386583285958,
"grad_norm": 39.30780792236328,
"kl": 0.066650390625,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8584297001361847,
"reward_std": 0.05610931571573019,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8688463568687439,
"step": 241
},
{
"completion_length": 139.48958778381348,
"epoch": 0.55031267765776,
"grad_norm": 11.226171493530273,
"kl": 0.0614013671875,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.860059529542923,
"reward_std": 0.02563871028542053,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8600595146417618,
"step": 242
},
{
"completion_length": 141.8854217529297,
"epoch": 0.5525866969869244,
"grad_norm": 16.11907196044922,
"kl": 0.06512451171875,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8452790975570679,
"reward_std": 0.08041338669136167,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8661123663187027,
"step": 243
},
{
"completion_length": 136.5104217529297,
"epoch": 0.5548607163160887,
"grad_norm": 66.74971771240234,
"kl": 0.0682373046875,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8137792348861694,
"reward_std": 0.11896559037268162,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8346125185489655,
"step": 244
},
{
"completion_length": 141.0833396911621,
"epoch": 0.557134735645253,
"grad_norm": 8.579216957092285,
"kl": 0.1153564453125,
"learning_rate": 1e-06,
"loss": 0.0046,
"reward": 1.8266068398952484,
"reward_std": 0.12713953852653503,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.8578568696975708,
"step": 245
},
{
"completion_length": 153.37500762939453,
"epoch": 0.5594087549744173,
"grad_norm": 4.466084003448486,
"kl": 0.0592041015625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.782545268535614,
"reward_std": 0.11807792168110609,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.813795268535614,
"step": 246
},
{
"completion_length": 149.6041717529297,
"epoch": 0.5616827743035816,
"grad_norm": 9.517090797424316,
"kl": 0.0572509765625,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.7555188834667206,
"reward_std": 0.17915836814790964,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.786768838763237,
"step": 247
},
{
"completion_length": 148.96875381469727,
"epoch": 0.5639567936327459,
"grad_norm": 10.732756614685059,
"kl": 0.06365966796875,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8198718428611755,
"reward_std": 0.11693388223648071,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8302884250879288,
"step": 248
},
{
"completion_length": 152.31250381469727,
"epoch": 0.5662308129619101,
"grad_norm": 15.704789161682129,
"kl": 0.06011962890625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.7587114572525024,
"reward_std": 0.13302453747019172,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.7899614423513412,
"step": 249
},
{
"completion_length": 151.11458587646484,
"epoch": 0.5685048322910745,
"grad_norm": 10.12311840057373,
"kl": 0.06243896484375,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.7591370046138763,
"reward_std": 0.19031737372279167,
"rewards/format_reward": 0.96875,
"rewards/segmentation_reward": 0.7903869301080704,
"step": 250
},
{
"completion_length": 143.3020896911621,
"epoch": 0.5707788516202388,
"grad_norm": 9.594857215881348,
"kl": 0.06365966796875,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8209541141986847,
"reward_std": 0.11682178732007742,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8417873978614807,
"step": 251
},
{
"completion_length": 148.5,
"epoch": 0.573052870949403,
"grad_norm": 6.971501350402832,
"kl": 0.05908203125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.7789467871189117,
"reward_std": 0.16825164668262005,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.8101967573165894,
"step": 252
},
{
"completion_length": 152.1041717529297,
"epoch": 0.5753268902785673,
"grad_norm": 23.23783302307129,
"kl": 0.059814453125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.7420052289962769,
"reward_std": 0.18916566669940948,
"rewards/format_reward": 0.958333358168602,
"rewards/segmentation_reward": 0.7836718857288361,
"step": 253
},
{
"completion_length": 145.42708587646484,
"epoch": 0.5776009096077317,
"grad_norm": 30.076963424682617,
"kl": 0.05194091796875,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.7838060855865479,
"reward_std": 0.08882320672273636,
"rewards/format_reward": 0.96875,
"rewards/segmentation_reward": 0.8150560706853867,
"step": 254
},
{
"completion_length": 152.81250762939453,
"epoch": 0.579874928936896,
"grad_norm": 8.252555847167969,
"kl": 0.05322265625,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.7935425341129303,
"reward_std": 0.13829213567078114,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8143758326768875,
"step": 255
},
{
"completion_length": 151.12500381469727,
"epoch": 0.5821489482660602,
"grad_norm": 7.083897113800049,
"kl": 0.05743408203125,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8427066802978516,
"reward_std": 0.103471142007038,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8635399788618088,
"step": 256
},
{
"completion_length": 150.7291717529297,
"epoch": 0.5844229675952246,
"grad_norm": 20.999393463134766,
"kl": 0.05621337890625,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8271246254444122,
"reward_std": 0.07995186559855938,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8375412821769714,
"step": 257
},
{
"completion_length": 146.43750381469727,
"epoch": 0.5866969869243889,
"grad_norm": 4.985034465789795,
"kl": 0.05804443359375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.875591516494751,
"reward_std": 0.030072015128098428,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.875591516494751,
"step": 258
},
{
"completion_length": 148.57291793823242,
"epoch": 0.5889710062535531,
"grad_norm": 27.399518966674805,
"kl": 0.05438232421875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.7720091938972473,
"reward_std": 0.14485601719934493,
"rewards/format_reward": 0.9583333432674408,
"rewards/segmentation_reward": 0.8136758357286453,
"step": 259
},
{
"completion_length": 146.89583587646484,
"epoch": 0.5912450255827174,
"grad_norm": 8.949258804321289,
"kl": 0.05517578125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.7998264729976654,
"reward_std": 0.11114408634603024,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8102431297302246,
"step": 260
},
{
"completion_length": 143.32292556762695,
"epoch": 0.5935190449118818,
"grad_norm": 13.319233894348145,
"kl": 0.0638427734375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8625755608081818,
"reward_std": 0.024278577242512256,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8625756055116653,
"step": 261
},
{
"completion_length": 151.6354217529297,
"epoch": 0.595793064241046,
"grad_norm": 5.921450138092041,
"kl": 0.0545654296875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8347079157829285,
"reward_std": 0.05886374693363905,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8451245874166489,
"step": 262
},
{
"completion_length": 150.2083396911621,
"epoch": 0.5980670835702103,
"grad_norm": 6.478372097015381,
"kl": 0.06402587890625,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.853881686925888,
"reward_std": 0.069986637448892,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8642983585596085,
"step": 263
},
{
"completion_length": 142.2291717529297,
"epoch": 0.6003411028993746,
"grad_norm": 44.52735900878906,
"kl": 0.06976318359375,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.8701772689819336,
"reward_std": 0.03132961760275066,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8701772391796112,
"step": 264
},
{
"completion_length": 145.3958396911621,
"epoch": 0.602615122228539,
"grad_norm": 9.911026000976562,
"kl": 0.054443359375,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8112512826919556,
"reward_std": 0.12766834167996421,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.8425012826919556,
"step": 265
},
{
"completion_length": 153.6354217529297,
"epoch": 0.6048891415577032,
"grad_norm": 5.161675453186035,
"kl": 0.05157470703125,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8424187302589417,
"reward_std": 0.021243932540528476,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8424187004566193,
"step": 266
},
{
"completion_length": 142.25000381469727,
"epoch": 0.6071631608868675,
"grad_norm": 10.04539966583252,
"kl": 0.05657958984375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8589707911014557,
"reward_std": 0.07298957108287141,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8798040300607681,
"step": 267
},
{
"completion_length": 140.06250381469727,
"epoch": 0.6094371802160319,
"grad_norm": 5.002673625946045,
"kl": 0.0604248046875,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8338092267513275,
"reward_std": 0.07467565825209022,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8442258834838867,
"step": 268
},
{
"completion_length": 143.37500762939453,
"epoch": 0.6117111995451961,
"grad_norm": 16.33036231994629,
"kl": 0.06146240234375,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.828411191701889,
"reward_std": 0.05990099138580263,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8388277888298035,
"step": 269
},
{
"completion_length": 145.7083396911621,
"epoch": 0.6139852188743604,
"grad_norm": 10.558090209960938,
"kl": 0.05633544921875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8714422285556793,
"reward_std": 0.04079840763006359,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8818589001893997,
"step": 270
},
{
"completion_length": 136.46875762939453,
"epoch": 0.6162592382035247,
"grad_norm": 6.936587333679199,
"kl": 0.0615234375,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8483183681964874,
"reward_std": 0.05946638010209426,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8587349951267242,
"step": 271
},
{
"completion_length": 136.8854217529297,
"epoch": 0.6185332575326891,
"grad_norm": 9.40880012512207,
"kl": 0.0555419921875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8671943843364716,
"reward_std": 0.026569546665996313,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8671943843364716,
"step": 272
},
{
"completion_length": 133.84375,
"epoch": 0.6208072768618533,
"grad_norm": 8.41716480255127,
"kl": 0.06396484375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8463688492774963,
"reward_std": 0.10014531551860273,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8672020584344864,
"step": 273
},
{
"completion_length": 137.4166717529297,
"epoch": 0.6230812961910176,
"grad_norm": 6.68651008605957,
"kl": 0.05877685546875,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8591451048851013,
"reward_std": 0.0608306503854692,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8695616871118546,
"step": 274
},
{
"completion_length": 139.6354217529297,
"epoch": 0.625355315520182,
"grad_norm": 5.125174522399902,
"kl": 0.05517578125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8548312783241272,
"reward_std": 0.020504672429524362,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8548312485218048,
"step": 275
},
{
"completion_length": 140.5104217529297,
"epoch": 0.6276293348493462,
"grad_norm": 8.44184684753418,
"kl": 0.05133056640625,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8494782745838165,
"reward_std": 0.04421432921662927,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8494782447814941,
"step": 276
},
{
"completion_length": 141.0520896911621,
"epoch": 0.6299033541785105,
"grad_norm": 5.259810447692871,
"kl": 0.055419921875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8360374867916107,
"reward_std": 0.060232745250687,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8464541882276535,
"step": 277
},
{
"completion_length": 135.2083396911621,
"epoch": 0.6321773735076748,
"grad_norm": 7.691401481628418,
"kl": 0.061279296875,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8679547905921936,
"reward_std": 0.026071164524182677,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8679548501968384,
"step": 278
},
{
"completion_length": 132.30208587646484,
"epoch": 0.6344513928368392,
"grad_norm": 33.14459228515625,
"kl": 0.05517578125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8575536906719208,
"reward_std": 0.037932454550173134,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8679703027009964,
"step": 279
},
{
"completion_length": 135.06250762939453,
"epoch": 0.6367254121660034,
"grad_norm": 14.222509384155273,
"kl": 0.0467529296875,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.8211901187896729,
"reward_std": 0.022689874283969402,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8211900591850281,
"step": 280
},
{
"completion_length": 138.9166717529297,
"epoch": 0.6389994314951677,
"grad_norm": 45.212120056152344,
"kl": 0.048828125,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8557825684547424,
"reward_std": 0.061675679637119174,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8661992251873016,
"step": 281
},
{
"completion_length": 133.5520896911621,
"epoch": 0.641273450824332,
"grad_norm": 6.831585884094238,
"kl": 0.058837890625,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.859096884727478,
"reward_std": 0.02432074275566265,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8590968549251556,
"step": 282
},
{
"completion_length": 134.60416984558105,
"epoch": 0.6435474701534963,
"grad_norm": 17.257526397705078,
"kl": 0.0516357421875,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8761568367481232,
"reward_std": 0.020475412253290415,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.876156821846962,
"step": 283
},
{
"completion_length": 133.95833587646484,
"epoch": 0.6458214894826606,
"grad_norm": 13.777560234069824,
"kl": 0.05450439453125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8327456414699554,
"reward_std": 0.03332398599013686,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8327456265687943,
"step": 284
},
{
"completion_length": 130.08333587646484,
"epoch": 0.6480955088118249,
"grad_norm": 8.314565658569336,
"kl": 0.06304931640625,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8160936534404755,
"reward_std": 0.05730089984717779,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8265102803707123,
"step": 285
},
{
"completion_length": 138.0416717529297,
"epoch": 0.6503695281409893,
"grad_norm": 8.033271789550781,
"kl": 0.052490234375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8485551476478577,
"reward_std": 0.0665385426254943,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8589717596769333,
"step": 286
},
{
"completion_length": 131.9479217529297,
"epoch": 0.6526435474701535,
"grad_norm": 4.302734851837158,
"kl": 0.0552978515625,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.814813256263733,
"reward_std": 0.07665527774952352,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8356466442346573,
"step": 287
},
{
"completion_length": 127.94791984558105,
"epoch": 0.6549175667993178,
"grad_norm": 6.902019023895264,
"kl": 0.05352783203125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8666083216667175,
"reward_std": 0.05546297336695716,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8770249783992767,
"step": 288
},
{
"completion_length": 128.13541793823242,
"epoch": 0.657191586128482,
"grad_norm": 5.1357293128967285,
"kl": 0.0509033203125,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8219444751739502,
"reward_std": 0.10313208564184606,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.842777818441391,
"step": 289
},
{
"completion_length": 127.03125381469727,
"epoch": 0.6594656054576464,
"grad_norm": 10.558165550231934,
"kl": 0.05340576171875,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8529814183712006,
"reward_std": 0.04027191852219403,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8529813587665558,
"step": 290
},
{
"completion_length": 123.98958778381348,
"epoch": 0.6617396247868107,
"grad_norm": 17.23190689086914,
"kl": 0.0595703125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.808007925748825,
"reward_std": 0.11614370718598366,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8288412094116211,
"step": 291
},
{
"completion_length": 124.13542175292969,
"epoch": 0.664013644115975,
"grad_norm": 5.512027263641357,
"kl": 0.04791259765625,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.84766486287117,
"reward_std": 0.08955034404061735,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8684981167316437,
"step": 292
},
{
"completion_length": 120.54166984558105,
"epoch": 0.6662876634451392,
"grad_norm": 8.465278625488281,
"kl": 0.05010986328125,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8660954236984253,
"reward_std": 0.049259885265200865,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8765120506286621,
"step": 293
},
{
"completion_length": 126.70833587646484,
"epoch": 0.6685616827743036,
"grad_norm": 11.483687400817871,
"kl": 0.05279541015625,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8239311873912811,
"reward_std": 0.1335212409030646,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.8551811873912811,
"step": 294
},
{
"completion_length": 126.12500190734863,
"epoch": 0.6708357021034679,
"grad_norm": 99337359065088.0,
"kl": 674309865472.063,
"learning_rate": 1e-06,
"loss": 27030312960.0,
"reward": 1.8664152026176453,
"reward_std": 0.04355062101967633,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8768318891525269,
"step": 295
},
{
"completion_length": 125.26041984558105,
"epoch": 0.6731097214326321,
"grad_norm": 9.260116577148438,
"kl": 0.053955078125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8506720662117004,
"reward_std": 0.07204537454526871,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8610887378454208,
"step": 296
},
{
"completion_length": 131.7604217529297,
"epoch": 0.6753837407617965,
"grad_norm": 11.006333351135254,
"kl": 0.05487060546875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8154217898845673,
"reward_std": 0.029030885722022504,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8154216706752777,
"step": 297
},
{
"completion_length": 129.21875,
"epoch": 0.6776577600909608,
"grad_norm": 22.507421493530273,
"kl": 0.057373046875,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.833427518606186,
"reward_std": 0.041746608447283506,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8334274739027023,
"step": 298
},
{
"completion_length": 126.59375381469727,
"epoch": 0.6799317794201251,
"grad_norm": 8.090750694274902,
"kl": 0.0552978515625,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8329502940177917,
"reward_std": 0.03257988323457539,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8329502940177917,
"step": 299
},
{
"completion_length": 127.93750381469727,
"epoch": 0.6822057987492893,
"grad_norm": 23.35614585876465,
"kl": 0.05328369140625,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8777255713939667,
"reward_std": 0.0277888648561202,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8777255564928055,
"step": 300
},
{
"completion_length": 124.33333587646484,
"epoch": 0.6844798180784537,
"grad_norm": 7.305347442626953,
"kl": 0.062744140625,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.887622207403183,
"reward_std": 0.008017042011488229,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.887622207403183,
"step": 301
},
{
"completion_length": 124.26041984558105,
"epoch": 0.686753837407618,
"grad_norm": 32.80826950073242,
"kl": 0.05267333984375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8523709774017334,
"reward_std": 0.01647485780995339,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8523710370063782,
"step": 302
},
{
"completion_length": 122.36458396911621,
"epoch": 0.6890278567367822,
"grad_norm": 9.616044044494629,
"kl": 0.0548095703125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8915051221847534,
"reward_std": 0.020660731475800276,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.891505092382431,
"step": 303
},
{
"completion_length": 125.66666793823242,
"epoch": 0.6913018760659465,
"grad_norm": 4.894202709197998,
"kl": 0.0616455078125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8347567915916443,
"reward_std": 0.023919553961604834,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8347567617893219,
"step": 304
},
{
"completion_length": 124.51042175292969,
"epoch": 0.6935758953951109,
"grad_norm": 50.18326950073242,
"kl": 0.09136962890625,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 1.8522542119026184,
"reward_std": 0.08253866015002131,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.873087465763092,
"step": 305
},
{
"completion_length": 125.41666984558105,
"epoch": 0.6958499147242752,
"grad_norm": 5.762409687042236,
"kl": 0.05657958984375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8508845269680023,
"reward_std": 0.06522158032748848,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8613012135028839,
"step": 306
},
{
"completion_length": 131.19791984558105,
"epoch": 0.6981239340534394,
"grad_norm": 6.682615756988525,
"kl": 0.0689697265625,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8200619518756866,
"reward_std": 0.07617322774603963,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8304786384105682,
"step": 307
},
{
"completion_length": 132.42708587646484,
"epoch": 0.7003979533826038,
"grad_norm": 19.110347747802734,
"kl": 0.0611572265625,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.883405864238739,
"reward_std": 0.03483639005571604,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.883405864238739,
"step": 308
},
{
"completion_length": 130.9791717529297,
"epoch": 0.7026719727117681,
"grad_norm": 14.51934814453125,
"kl": 0.05859375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.832915335893631,
"reward_std": 0.06287066219374537,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8433319926261902,
"step": 309
},
{
"completion_length": 126.01042175292969,
"epoch": 0.7049459920409323,
"grad_norm": 9.068413734436035,
"kl": 0.05810546875,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8837738931179047,
"reward_std": 0.02703522122465074,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8837738037109375,
"step": 310
},
{
"completion_length": 134.0729217529297,
"epoch": 0.7072200113700966,
"grad_norm": 9.09504222869873,
"kl": 0.0723876953125,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.8339940905570984,
"reward_std": 0.08121342983213253,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8548273891210556,
"step": 311
},
{
"completion_length": 136.13541984558105,
"epoch": 0.709494030699261,
"grad_norm": 4.718442440032959,
"kl": 0.05426025390625,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8730711042881012,
"reward_std": 0.03671956621110439,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.87307108938694,
"step": 312
},
{
"completion_length": 126.04166984558105,
"epoch": 0.7117680500284252,
"grad_norm": 12.670175552368164,
"kl": 0.05718994140625,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8251341879367828,
"reward_std": 0.04767660913057625,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8251341283321381,
"step": 313
},
{
"completion_length": 131.08333587646484,
"epoch": 0.7140420693575895,
"grad_norm": 7.425668716430664,
"kl": 0.0506591796875,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8413426876068115,
"reward_std": 0.0643298716749996,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8517593443393707,
"step": 314
},
{
"completion_length": 126.39583778381348,
"epoch": 0.7163160886867538,
"grad_norm": 7.450831413269043,
"kl": 0.05279541015625,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8399729132652283,
"reward_std": 0.03783059283159673,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8399728983640671,
"step": 315
},
{
"completion_length": 131.39583778381348,
"epoch": 0.7185901080159182,
"grad_norm": 4.984809398651123,
"kl": 0.0579833984375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8401748836040497,
"reward_std": 0.06397436745464802,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8505915254354477,
"step": 316
},
{
"completion_length": 126.29166984558105,
"epoch": 0.7208641273450824,
"grad_norm": 8.88703727722168,
"kl": 0.055908203125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8670341670513153,
"reward_std": 0.023501697811298072,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8670340925455093,
"step": 317
},
{
"completion_length": 132.0104217529297,
"epoch": 0.7231381466742467,
"grad_norm": 6.049837589263916,
"kl": 0.05718994140625,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8481946885585785,
"reward_std": 0.04346911353059113,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8481947034597397,
"step": 318
},
{
"completion_length": 132.4270896911621,
"epoch": 0.7254121660034111,
"grad_norm": 46.442176818847656,
"kl": 0.05584716796875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.858694463968277,
"reward_std": 0.023713725386187434,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8586944341659546,
"step": 319
},
{
"completion_length": 131.89584159851074,
"epoch": 0.7276861853325753,
"grad_norm": 28.615896224975586,
"kl": 0.0504150390625,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8294812142848969,
"reward_std": 0.058421910274773836,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8398977816104889,
"step": 320
},
{
"completion_length": 131.5104217529297,
"epoch": 0.7299602046617396,
"grad_norm": 4.829744338989258,
"kl": 0.0491943359375,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8195575773715973,
"reward_std": 0.05906218430027366,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8299742341041565,
"step": 321
},
{
"completion_length": 135.25000381469727,
"epoch": 0.7322342239909039,
"grad_norm": 8.953800201416016,
"kl": 0.0484619140625,
"learning_rate": 1e-06,
"loss": 0.0019,
"reward": 1.8626637756824493,
"reward_std": 0.028036643168888986,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8626636862754822,
"step": 322
},
{
"completion_length": 134.64583587646484,
"epoch": 0.7345082433200683,
"grad_norm": 5.906992435455322,
"kl": 0.06317138671875,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8335199654102325,
"reward_std": 0.03467556097893976,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8439366221427917,
"step": 323
},
{
"completion_length": 129.5520896911621,
"epoch": 0.7367822626492325,
"grad_norm": 5.112988471984863,
"kl": 0.0526123046875,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.836005687713623,
"reward_std": 0.07491261116228998,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8464223295450211,
"step": 324
},
{
"completion_length": 131.0833396911621,
"epoch": 0.7390562819783968,
"grad_norm": 5.518293380737305,
"kl": 0.0584716796875,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8426667749881744,
"reward_std": 0.04552039853297174,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.842666745185852,
"step": 325
},
{
"completion_length": 135.5,
"epoch": 0.7413303013075612,
"grad_norm": 4.705676555633545,
"kl": 0.053466796875,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8192458748817444,
"reward_std": 0.03886064630933106,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8192458003759384,
"step": 326
},
{
"completion_length": 134.08333778381348,
"epoch": 0.7436043206367254,
"grad_norm": 33.00435256958008,
"kl": 0.05438232421875,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8162838518619537,
"reward_std": 0.1155676506459713,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8371172249317169,
"step": 327
},
{
"completion_length": 134.50000381469727,
"epoch": 0.7458783399658897,
"grad_norm": 8.883432388305664,
"kl": 0.0660400390625,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8558038473129272,
"reward_std": 0.021242189570330083,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8558038026094437,
"step": 328
},
{
"completion_length": 130.9166717529297,
"epoch": 0.748152359295054,
"grad_norm": 7.608541965484619,
"kl": 0.0537109375,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 1.8631189167499542,
"reward_std": 0.0687082838267088,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8839522004127502,
"step": 329
},
{
"completion_length": 134.0416717529297,
"epoch": 0.7504263786242183,
"grad_norm": 12.23119068145752,
"kl": 0.05078125,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.7768693566322327,
"reward_std": 0.249573610490188,
"rewards/format_reward": 0.9375000149011612,
"rewards/segmentation_reward": 0.8393692970275879,
"step": 330
},
{
"completion_length": 130.4791717529297,
"epoch": 0.7527003979533826,
"grad_norm": 5.401615142822266,
"kl": 0.05450439453125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8891821205615997,
"reward_std": 0.011260898812906817,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8891821354627609,
"step": 331
},
{
"completion_length": 131.0416717529297,
"epoch": 0.7549744172825469,
"grad_norm": 13.699596405029297,
"kl": 0.053955078125,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8251034915447235,
"reward_std": 0.09959045611321926,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8459368050098419,
"step": 332
},
{
"completion_length": 127.47916984558105,
"epoch": 0.7572484366117112,
"grad_norm": 3.924161672592163,
"kl": 0.05859375,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8615702688694,
"reward_std": 0.023111989721655846,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8615703135728836,
"step": 333
},
{
"completion_length": 130.03125190734863,
"epoch": 0.7595224559408755,
"grad_norm": 4.831616401672363,
"kl": 0.05377197265625,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8001680374145508,
"reward_std": 0.11854788940399885,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.8314179629087448,
"step": 334
},
{
"completion_length": 134.46875762939453,
"epoch": 0.7617964752700398,
"grad_norm": 7.652170181274414,
"kl": 0.0491943359375,
"learning_rate": 1e-06,
"loss": 0.002,
"reward": 1.8629566133022308,
"reward_std": 0.019374964205780998,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8629565536975861,
"step": 335
},
{
"completion_length": 125.67708587646484,
"epoch": 0.7640704945992041,
"grad_norm": 9.895513534545898,
"kl": 0.05560302734375,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 1.8746117651462555,
"reward_std": 0.01222996957221767,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8746117502450943,
"step": 336
},
{
"completion_length": 124.05208587646484,
"epoch": 0.7663445139283684,
"grad_norm": 5.920944690704346,
"kl": 0.05902099609375,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8395130336284637,
"reward_std": 0.05145470690331422,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.849929690361023,
"step": 337
},
{
"completion_length": 132.37500381469727,
"epoch": 0.7686185332575327,
"grad_norm": 5.402987003326416,
"kl": 0.05596923828125,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8278735280036926,
"reward_std": 0.05184625834226608,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8278734982013702,
"step": 338
},
{
"completion_length": 127.58333778381348,
"epoch": 0.770892552586697,
"grad_norm": 5.004913806915283,
"kl": 0.06292724609375,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8682883381843567,
"reward_std": 0.04544829938095063,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8787050098180771,
"step": 339
},
{
"completion_length": 125.28125190734863,
"epoch": 0.7731665719158612,
"grad_norm": 10.156172752380371,
"kl": 0.062744140625,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8668439388275146,
"reward_std": 0.041374096646904945,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8668439537286758,
"step": 340
},
{
"completion_length": 131.6562557220459,
"epoch": 0.7754405912450256,
"grad_norm": 7.755644798278809,
"kl": 0.05865478515625,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8412960767745972,
"reward_std": 0.07608503196388483,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8517126888036728,
"step": 341
},
{
"completion_length": 131.5729217529297,
"epoch": 0.7777146105741899,
"grad_norm": 4.717071533203125,
"kl": 0.065185546875,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8332788348197937,
"reward_std": 0.11580408085137606,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8541121035814285,
"step": 342
},
{
"completion_length": 128.59375381469727,
"epoch": 0.7799886299033542,
"grad_norm": 5.577821731567383,
"kl": 0.06103515625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8615179657936096,
"reward_std": 0.032256070990115404,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8615180253982544,
"step": 343
},
{
"completion_length": 133.35417556762695,
"epoch": 0.7822626492325184,
"grad_norm": 17.56463050842285,
"kl": 0.0675048828125,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.826512634754181,
"reward_std": 0.04132456611841917,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8265126645565033,
"step": 344
},
{
"completion_length": 120.48958396911621,
"epoch": 0.7845366685616828,
"grad_norm": 10.41229248046875,
"kl": 0.06341552734375,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.867453545331955,
"reward_std": 0.03282071987632662,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8674535155296326,
"step": 345
},
{
"completion_length": 124.66666793823242,
"epoch": 0.7868106878908471,
"grad_norm": 11.105557441711426,
"kl": 0.064208984375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8204675316810608,
"reward_std": 0.08718774002045393,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8308842182159424,
"step": 346
},
{
"completion_length": 135.03125762939453,
"epoch": 0.7890847072200113,
"grad_norm": 7.016823768615723,
"kl": 0.064208984375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8432948589324951,
"reward_std": 0.04058923898264766,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8432948887348175,
"step": 347
},
{
"completion_length": 132.23958587646484,
"epoch": 0.7913587265491757,
"grad_norm": 10.676706314086914,
"kl": 0.05792236328125,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8691777288913727,
"reward_std": 0.027745802886784077,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8691777735948563,
"step": 348
},
{
"completion_length": 134.97917556762695,
"epoch": 0.79363274587834,
"grad_norm": 6.6281418800354,
"kl": 0.0599365234375,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.864767611026764,
"reward_std": 0.014376505510881543,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8647675514221191,
"step": 349
},
{
"completion_length": 133.29166793823242,
"epoch": 0.7959067652075043,
"grad_norm": 5.383996963500977,
"kl": 0.06011962890625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8472253382205963,
"reward_std": 0.045232664968352765,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8576419502496719,
"step": 350
},
{
"completion_length": 127.71875190734863,
"epoch": 0.7981807845366685,
"grad_norm": 7.840123176574707,
"kl": 0.071533203125,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.8401671946048737,
"reward_std": 0.060452125035226345,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8505838364362717,
"step": 351
},
{
"completion_length": 130.77083587646484,
"epoch": 0.8004548038658329,
"grad_norm": 8.090129852294922,
"kl": 0.0594482421875,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8789224326610565,
"reward_std": 0.0189595150295645,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8789223730564117,
"step": 352
},
{
"completion_length": 131.5312557220459,
"epoch": 0.8027288231949972,
"grad_norm": 5.633690357208252,
"kl": 0.06817626953125,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.818681389093399,
"reward_std": 0.13099218998104334,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.8499313741922379,
"step": 353
},
{
"completion_length": 129.14583587646484,
"epoch": 0.8050028425241614,
"grad_norm": 4.412927627563477,
"kl": 0.06451416015625,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8463831841945648,
"reward_std": 0.03683751542121172,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8463831394910812,
"step": 354
},
{
"completion_length": 126.17708587646484,
"epoch": 0.8072768618533257,
"grad_norm": 11.527948379516602,
"kl": 0.06689453125,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8371998071670532,
"reward_std": 0.021064158499939367,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8371998518705368,
"step": 355
},
{
"completion_length": 128.8854217529297,
"epoch": 0.8095508811824901,
"grad_norm": 48.39152908325195,
"kl": 0.05670166015625,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8901303112506866,
"reward_std": 0.009621757606510073,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8901302367448807,
"step": 356
},
{
"completion_length": 127.15625381469727,
"epoch": 0.8118249005116543,
"grad_norm": 58.79667663574219,
"kl": 0.06072998046875,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8344822525978088,
"reward_std": 0.07819899823516607,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8553156107664108,
"step": 357
},
{
"completion_length": 124.82292366027832,
"epoch": 0.8140989198408186,
"grad_norm": 4.849188327789307,
"kl": 0.065185546875,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8805176317691803,
"reward_std": 0.03933787811547518,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8805176019668579,
"step": 358
},
{
"completion_length": 127.5312557220459,
"epoch": 0.816372939169983,
"grad_norm": 9.841382026672363,
"kl": 0.06512451171875,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8762429058551788,
"reward_std": 0.012105958012398332,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8762429058551788,
"step": 359
},
{
"completion_length": 124.17708587646484,
"epoch": 0.8186469584991473,
"grad_norm": 23.35593032836914,
"kl": 0.06060791015625,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.815219759941101,
"reward_std": 0.10280088149011135,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8256364315748215,
"step": 360
},
{
"completion_length": 133.375,
"epoch": 0.8209209778283115,
"grad_norm": 8.626202583312988,
"kl": 0.05999755859375,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8821953237056732,
"reward_std": 0.03062002846854739,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8821953237056732,
"step": 361
},
{
"completion_length": 123.88541984558105,
"epoch": 0.8231949971574758,
"grad_norm": 8.920926094055176,
"kl": 0.06756591796875,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8656490445137024,
"reward_std": 0.016483795596286654,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.865649089217186,
"step": 362
},
{
"completion_length": 120.92708587646484,
"epoch": 0.8254690164866402,
"grad_norm": 7.630340576171875,
"kl": 0.0657958984375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8661229014396667,
"reward_std": 0.05962109373649582,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8765395879745483,
"step": 363
},
{
"completion_length": 126.13542175292969,
"epoch": 0.8277430358158044,
"grad_norm": 10.0298490524292,
"kl": 0.1300048828125,
"learning_rate": 1e-06,
"loss": 0.0052,
"reward": 1.853810042142868,
"reward_std": 0.027746433101128787,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8538100123405457,
"step": 364
},
{
"completion_length": 129.50000381469727,
"epoch": 0.8300170551449687,
"grad_norm": 23.108720779418945,
"kl": 0.05908203125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8352333307266235,
"reward_std": 0.060839211102575064,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8456498980522156,
"step": 365
},
{
"completion_length": 126.13541984558105,
"epoch": 0.832291074474133,
"grad_norm": 6.668464183807373,
"kl": 0.0648193359375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8593635261058807,
"reward_std": 0.015451492741703987,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8593635261058807,
"step": 366
},
{
"completion_length": 123.51041984558105,
"epoch": 0.8345650938032974,
"grad_norm": 7.906125545501709,
"kl": 0.0628662109375,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8826908469200134,
"reward_std": 0.012024826108245179,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8826908022165298,
"step": 367
},
{
"completion_length": 127.81250381469727,
"epoch": 0.8368391131324616,
"grad_norm": 5.663455486297607,
"kl": 0.07000732421875,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.816881150007248,
"reward_std": 0.06968936347402632,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8272978365421295,
"step": 368
},
{
"completion_length": 129.1979217529297,
"epoch": 0.8391131324616259,
"grad_norm": 4.211099147796631,
"kl": 0.05908203125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8330486714839935,
"reward_std": 0.11126681696623564,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8538819551467896,
"step": 369
},
{
"completion_length": 127.00000190734863,
"epoch": 0.8413871517907903,
"grad_norm": 9.754108428955078,
"kl": 0.06201171875,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.888169288635254,
"reward_std": 0.009245644323527813,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8881692886352539,
"step": 370
},
{
"completion_length": 132.10417366027832,
"epoch": 0.8436611711199545,
"grad_norm": 10.026612281799316,
"kl": 0.06280517578125,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.83915776014328,
"reward_std": 0.09443543804809451,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8599910587072372,
"step": 371
},
{
"completion_length": 122.82291793823242,
"epoch": 0.8459351904491188,
"grad_norm": 6.644383907318115,
"kl": 0.0699462890625,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.808215469121933,
"reward_std": 0.10915855201892555,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8290487676858902,
"step": 372
},
{
"completion_length": 133.25000190734863,
"epoch": 0.8482092097782831,
"grad_norm": 7.538217544555664,
"kl": 0.06182861328125,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8048012256622314,
"reward_std": 0.11374149052426219,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8256345838308334,
"step": 373
},
{
"completion_length": 139.2083396911621,
"epoch": 0.8504832291074474,
"grad_norm": 6.131237030029297,
"kl": 0.06085205078125,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.834800899028778,
"reward_std": 0.04532355163246393,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8348008543252945,
"step": 374
},
{
"completion_length": 122.94791984558105,
"epoch": 0.8527572484366117,
"grad_norm": 6.690881252288818,
"kl": 0.05902099609375,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.860317587852478,
"reward_std": 0.05016712564975023,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8707341998815536,
"step": 375
},
{
"completion_length": 131.50000381469727,
"epoch": 0.855031267765776,
"grad_norm": 7.440776348114014,
"kl": 0.0614013671875,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8719264268875122,
"reward_std": 0.0520896875532344,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8823430240154266,
"step": 376
},
{
"completion_length": 131.84375381469727,
"epoch": 0.8573052870949404,
"grad_norm": 8.669480323791504,
"kl": 0.057861328125,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8542674481868744,
"reward_std": 0.030696504516527057,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8542674630880356,
"step": 377
},
{
"completion_length": 134.1979217529297,
"epoch": 0.8595793064241046,
"grad_norm": 8.286240577697754,
"kl": 0.065673828125,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8401267230510712,
"reward_std": 0.07586855837143958,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8505433797836304,
"step": 378
},
{
"completion_length": 139.29166793823242,
"epoch": 0.8618533257532689,
"grad_norm": 8.042603492736816,
"kl": 0.0582275390625,
"learning_rate": 1e-06,
"loss": 0.0023,
"reward": 1.8034493625164032,
"reward_std": 0.12077387608587742,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8242826908826828,
"step": 379
},
{
"completion_length": 131.09375381469727,
"epoch": 0.8641273450824332,
"grad_norm": 6.3277788162231445,
"kl": 0.07427978515625,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.841260701417923,
"reward_std": 0.053736023139208555,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8412606567144394,
"step": 380
},
{
"completion_length": 132.83333587646484,
"epoch": 0.8664013644115975,
"grad_norm": 6.668425559997559,
"kl": 0.06573486328125,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.852523535490036,
"reward_std": 0.06596486712805927,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.862940177321434,
"step": 381
},
{
"completion_length": 129.06250381469727,
"epoch": 0.8686753837407618,
"grad_norm": 5.517845153808594,
"kl": 0.060546875,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8688926994800568,
"reward_std": 0.04422900633653626,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8793093860149384,
"step": 382
},
{
"completion_length": 133.6770839691162,
"epoch": 0.8709494030699261,
"grad_norm": 6.203193664550781,
"kl": 0.0697021484375,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.8271671533584595,
"reward_std": 0.06463960534892976,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8375838100910187,
"step": 383
},
{
"completion_length": 134.02083587646484,
"epoch": 0.8732234223990903,
"grad_norm": 10.909748077392578,
"kl": 0.07537841796875,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.8480697870254517,
"reward_std": 0.034777372784446925,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8480697721242905,
"step": 384
},
{
"completion_length": 129.8541717529297,
"epoch": 0.8754974417282547,
"grad_norm": 9.155665397644043,
"kl": 0.061279296875,
"learning_rate": 1e-06,
"loss": 0.0024,
"reward": 1.8443851470947266,
"reward_std": 0.027714062947779894,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8443851172924042,
"step": 385
},
{
"completion_length": 132.1458396911621,
"epoch": 0.877771461057419,
"grad_norm": 4.895893573760986,
"kl": 0.06463623046875,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8438906073570251,
"reward_std": 0.04654449052759446,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8543073236942291,
"step": 386
},
{
"completion_length": 133.4166717529297,
"epoch": 0.8800454803865833,
"grad_norm": 10.000784873962402,
"kl": 0.062255859375,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8418076932430267,
"reward_std": 0.0637913720565848,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8522243201732635,
"step": 387
},
{
"completion_length": 132.00000762939453,
"epoch": 0.8823194997157476,
"grad_norm": 7.047492027282715,
"kl": 0.06658935546875,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8548587560653687,
"reward_std": 0.05772953329142183,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8652754127979279,
"step": 388
},
{
"completion_length": 129.31250381469727,
"epoch": 0.8845935190449119,
"grad_norm": 6.7407989501953125,
"kl": 0.066650390625,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8317703306674957,
"reward_std": 0.09461293439380825,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8526036590337753,
"step": 389
},
{
"completion_length": 131.3645896911621,
"epoch": 0.8868675383740762,
"grad_norm": 6.323419570922852,
"kl": 0.072998046875,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.857729732990265,
"reward_std": 0.026556792086921632,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8577296435832977,
"step": 390
},
{
"completion_length": 132.5208396911621,
"epoch": 0.8891415577032404,
"grad_norm": 13.720745086669922,
"kl": 0.069580078125,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.7931525707244873,
"reward_std": 0.14119149651378393,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.8244025856256485,
"step": 391
},
{
"completion_length": 127.55208587646484,
"epoch": 0.8914155770324048,
"grad_norm": 11.731847763061523,
"kl": 0.0699462890625,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.8540717661380768,
"reward_std": 0.025957859819754958,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8540717363357544,
"step": 392
},
{
"completion_length": 134.59375381469727,
"epoch": 0.8936895963615691,
"grad_norm": 8.114005088806152,
"kl": 0.06787109375,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8252622783184052,
"reward_std": 0.06493170734029263,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8356789350509644,
"step": 393
},
{
"completion_length": 128.76041984558105,
"epoch": 0.8959636156907334,
"grad_norm": 4.420175552368164,
"kl": 0.07574462890625,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.858299344778061,
"reward_std": 0.05047441285569221,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8582993745803833,
"step": 394
},
{
"completion_length": 127.26041793823242,
"epoch": 0.8982376350198976,
"grad_norm": 9.239371299743652,
"kl": 0.0760498046875,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.8622342646121979,
"reward_std": 0.02814770070835948,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8622342348098755,
"step": 395
},
{
"completion_length": 132.2395839691162,
"epoch": 0.900511654349062,
"grad_norm": 7.392596244812012,
"kl": 0.0726318359375,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.855131834745407,
"reward_std": 0.051054751384072006,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8655484467744827,
"step": 396
},
{
"completion_length": 128.13541984558105,
"epoch": 0.9027856736782263,
"grad_norm": 8.125931739807129,
"kl": 0.07080078125,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.8950492441654205,
"reward_std": 0.01147704414324835,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.895049199461937,
"step": 397
},
{
"completion_length": 130.39583587646484,
"epoch": 0.9050596930073905,
"grad_norm": 5.162111759185791,
"kl": 0.0958251953125,
"learning_rate": 1e-06,
"loss": 0.0038,
"reward": 1.8326389491558075,
"reward_std": 0.12029724020976573,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8534722775220871,
"step": 398
},
{
"completion_length": 137.5520896911621,
"epoch": 0.9073337123365549,
"grad_norm": 5.906332015991211,
"kl": 0.0780029296875,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.83551886677742,
"reward_std": 0.04398070462048054,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8355187922716141,
"step": 399
},
{
"completion_length": 136.5729217529297,
"epoch": 0.9096077316657192,
"grad_norm": 6.234349250793457,
"kl": 0.073486328125,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.8535465598106384,
"reward_std": 0.029846468474715948,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8535465747117996,
"step": 400
},
{
"completion_length": 138.0208396911621,
"epoch": 0.9118817509948834,
"grad_norm": 9.130826950073242,
"kl": 0.07421875,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.8739716410636902,
"reward_std": 0.02106904413085431,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.873971700668335,
"step": 401
},
{
"completion_length": 134.43750381469727,
"epoch": 0.9141557703240477,
"grad_norm": 4.951981544494629,
"kl": 0.0703125,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.871123731136322,
"reward_std": 0.017409008694812655,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8711237162351608,
"step": 402
},
{
"completion_length": 131.08333778381348,
"epoch": 0.9164297896532121,
"grad_norm": 5.46921968460083,
"kl": 0.07666015625,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.8896593153476715,
"reward_std": 0.013306577457115054,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8896592557430267,
"step": 403
},
{
"completion_length": 130.71875381469727,
"epoch": 0.9187038089823764,
"grad_norm": 3.1999082565307617,
"kl": 0.078125,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.8919513523578644,
"reward_std": 0.020059253147337586,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8919513821601868,
"step": 404
},
{
"completion_length": 140.6041717529297,
"epoch": 0.9209778283115406,
"grad_norm": 9.87605094909668,
"kl": 0.07421875,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.8576882183551788,
"reward_std": 0.05526958662085235,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8681048899888992,
"step": 405
},
{
"completion_length": 141.71875762939453,
"epoch": 0.9232518476407049,
"grad_norm": 6.338070392608643,
"kl": 0.0753173828125,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.8482947051525116,
"reward_std": 0.03551662730751559,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8482946902513504,
"step": 406
},
{
"completion_length": 143.32291793823242,
"epoch": 0.9255258669698693,
"grad_norm": 10.858189582824707,
"kl": 0.06878662109375,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.8244743645191193,
"reward_std": 0.09086814895272255,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8453076481819153,
"step": 407
},
{
"completion_length": 135.8958396911621,
"epoch": 0.9277998862990335,
"grad_norm": 5.908721923828125,
"kl": 0.072509765625,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.7982184290885925,
"reward_std": 0.1342280562967062,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8190517276525497,
"step": 408
},
{
"completion_length": 136.5833396911621,
"epoch": 0.9300739056281978,
"grad_norm": 11.515835762023926,
"kl": 0.073486328125,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.7793036699295044,
"reward_std": 0.05075064115226269,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.7793037295341492,
"step": 409
},
{
"completion_length": 138.71875381469727,
"epoch": 0.9323479249573622,
"grad_norm": 5.331714153289795,
"kl": 0.0684814453125,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.843429058790207,
"reward_std": 0.02589858788996935,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8434290289878845,
"step": 410
},
{
"completion_length": 138.46875381469727,
"epoch": 0.9346219442865265,
"grad_norm": 30.081806182861328,
"kl": 0.06719970703125,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8344328999519348,
"reward_std": 0.08471645403187722,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8552662283182144,
"step": 411
},
{
"completion_length": 139.90625381469727,
"epoch": 0.9368959636156907,
"grad_norm": 4.920882701873779,
"kl": 0.072265625,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.8378893733024597,
"reward_std": 0.0527505818172358,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8483060598373413,
"step": 412
},
{
"completion_length": 135.1979217529297,
"epoch": 0.939169982944855,
"grad_norm": 4.576317310333252,
"kl": 0.07666015625,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.8669978380203247,
"reward_std": 0.016623229486867785,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8669977784156799,
"step": 413
},
{
"completion_length": 135.60416793823242,
"epoch": 0.9414440022740194,
"grad_norm": 14.46871280670166,
"kl": 0.07025146484375,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.7731802463531494,
"reward_std": 0.13738415925763547,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.804430216550827,
"step": 414
},
{
"completion_length": 141.68750381469727,
"epoch": 0.9437180216031836,
"grad_norm": 7.103405475616455,
"kl": 0.0738525390625,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.8259045779705048,
"reward_std": 0.059208789840340614,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8467378467321396,
"step": 415
},
{
"completion_length": 138.7083396911621,
"epoch": 0.9459920409323479,
"grad_norm": 5.577739238739014,
"kl": 0.07861328125,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.8508521616458893,
"reward_std": 0.04068222228670493,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8612687885761261,
"step": 416
},
{
"completion_length": 134.65625762939453,
"epoch": 0.9482660602615123,
"grad_norm": 4.289518356323242,
"kl": 0.074462890625,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.8845854997634888,
"reward_std": 0.01646020170301199,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8845854997634888,
"step": 417
},
{
"completion_length": 139.65625381469727,
"epoch": 0.9505400795906765,
"grad_norm": 8.5299072265625,
"kl": 0.0740966796875,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 1.8217022120952606,
"reward_std": 0.05600218917243183,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8321189731359482,
"step": 418
},
{
"completion_length": 138.50000762939453,
"epoch": 0.9528140989198408,
"grad_norm": 6.827524662017822,
"kl": 0.0794677734375,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 1.822449415922165,
"reward_std": 0.04770416300743818,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8224494010210037,
"step": 419
},
{
"completion_length": 142.55208587646484,
"epoch": 0.9550881182490051,
"grad_norm": 6.027355670928955,
"kl": 0.0709228515625,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.8189789950847626,
"reward_std": 0.09298859292175621,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8398122638463974,
"step": 420
},
{
"completion_length": 141.3020896911621,
"epoch": 0.9573621375781695,
"grad_norm": 5.657737731933594,
"kl": 0.0723876953125,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.8466951251029968,
"reward_std": 0.04600943787954748,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8466951251029968,
"step": 421
},
{
"completion_length": 140.84375762939453,
"epoch": 0.9596361569073337,
"grad_norm": 6.143070697784424,
"kl": 0.07037353515625,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.7793289721012115,
"reward_std": 0.14454051246866584,
"rewards/format_reward": 0.9583333432674408,
"rewards/segmentation_reward": 0.8209956139326096,
"step": 422
},
{
"completion_length": 143.00000381469727,
"epoch": 0.961910176236498,
"grad_norm": 6.810637474060059,
"kl": 0.0731201171875,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.861203372478485,
"reward_std": 0.02801788877695799,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8612033277750015,
"step": 423
},
{
"completion_length": 143.27083587646484,
"epoch": 0.9641841955656623,
"grad_norm": 4.375816822052002,
"kl": 0.06439208984375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8603475391864777,
"reward_std": 0.01995037216693163,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8603476583957672,
"step": 424
},
{
"completion_length": 139.7708396911621,
"epoch": 0.9664582148948266,
"grad_norm": 10.486807823181152,
"kl": 0.0780029296875,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.8208959996700287,
"reward_std": 0.10637146979570389,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8417292982339859,
"step": 425
},
{
"completion_length": 137.46875381469727,
"epoch": 0.9687322342239909,
"grad_norm": 3.8334126472473145,
"kl": 0.0692138671875,
"learning_rate": 1e-06,
"loss": 0.0028,
"reward": 1.8344232141971588,
"reward_std": 0.059240641247015446,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8448398411273956,
"step": 426
},
{
"completion_length": 135.53125381469727,
"epoch": 0.9710062535531552,
"grad_norm": 7.44934606552124,
"kl": 0.0675048828125,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.7985945045948029,
"reward_std": 0.12954094889573753,
"rewards/format_reward": 0.9687500298023224,
"rewards/segmentation_reward": 0.8298445492982864,
"step": 427
},
{
"completion_length": 141.2604217529297,
"epoch": 0.9732802728823196,
"grad_norm": 10.400691032409668,
"kl": 0.06573486328125,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8283280432224274,
"reward_std": 0.13098794838879257,
"rewards/format_reward": 0.9687500149011612,
"rewards/segmentation_reward": 0.859578013420105,
"step": 428
},
{
"completion_length": 137.05208587646484,
"epoch": 0.9755542922114838,
"grad_norm": 6.7781758308410645,
"kl": 0.06414794921875,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8502939641475677,
"reward_std": 0.01936683728126809,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8502939641475677,
"step": 429
},
{
"completion_length": 137.20833587646484,
"epoch": 0.9778283115406481,
"grad_norm": 8.260501861572266,
"kl": 0.0726318359375,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.8315411806106567,
"reward_std": 0.10588931851089001,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8523745536804199,
"step": 430
},
{
"completion_length": 135.75000762939453,
"epoch": 0.9801023308698124,
"grad_norm": 9.982911109924316,
"kl": 0.06793212890625,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8607007265090942,
"reward_std": 0.0624299687333405,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8711173385381699,
"step": 431
},
{
"completion_length": 144.5833396911621,
"epoch": 0.9823763501989767,
"grad_norm": 3.751823663711548,
"kl": 0.066162109375,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8520955741405487,
"reward_std": 0.05543442675843835,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8520955145359039,
"step": 432
},
{
"completion_length": 134.0520896911621,
"epoch": 0.984650369528141,
"grad_norm": 3.716749429702759,
"kl": 0.076904296875,
"learning_rate": 1e-06,
"loss": 0.0031,
"reward": 1.7991288006305695,
"reward_std": 0.10263975383713841,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8199621438980103,
"step": 433
},
{
"completion_length": 137.96875762939453,
"epoch": 0.9869243888573053,
"grad_norm": 8.06445598602295,
"kl": 0.06341552734375,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 1.8634448647499084,
"reward_std": 0.030836602323688567,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8634448498487473,
"step": 434
},
{
"completion_length": 136.0104217529297,
"epoch": 0.9891984081864695,
"grad_norm": 6.394524574279785,
"kl": 0.0859375,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 1.8284251689910889,
"reward_std": 0.10163544374518096,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8492584675550461,
"step": 435
},
{
"completion_length": 136.32292556762695,
"epoch": 0.9914724275156339,
"grad_norm": 5.4122314453125,
"kl": 0.0718994140625,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.8416071236133575,
"reward_std": 0.09167469386011362,
"rewards/format_reward": 0.9791666716337204,
"rewards/segmentation_reward": 0.8624404817819595,
"step": 436
},
{
"completion_length": 134.3645839691162,
"epoch": 0.9937464468447982,
"grad_norm": 6.90580940246582,
"kl": 0.06646728515625,
"learning_rate": 1e-06,
"loss": 0.0027,
"reward": 1.8837913274765015,
"reward_std": 0.00442745303735137,
"rewards/format_reward": 1.0,
"rewards/segmentation_reward": 0.8837913274765015,
"step": 437
},
{
"completion_length": 131.6666717529297,
"epoch": 0.9960204661739624,
"grad_norm": 3.684438943862915,
"kl": 0.065673828125,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 1.8093055188655853,
"reward_std": 0.07611760849249549,
"rewards/format_reward": 0.9791666865348816,
"rewards/segmentation_reward": 0.8301387876272202,
"step": 438
},
{
"completion_length": 134.8541717529297,
"epoch": 0.9982944855031268,
"grad_norm": 7.167064189910889,
"kl": 0.0731201171875,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 1.8386133015155792,
"reward_std": 0.07322599086910486,
"rewards/format_reward": 0.9895833432674408,
"rewards/segmentation_reward": 0.8490300327539444,
"step": 439
}
],
"logging_steps": 1.0,
"max_steps": 439,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}