| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9982944855031268, | |
| "eval_steps": 500, | |
| "global_step": 439, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 147.92708587646484, | |
| "epoch": 0.0022740193291642978, | |
| "grad_norm": 8.737942695617676, | |
| "kl": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 1.6511709690093994, | |
| "reward_std": 0.33823655918240547, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "rewards/segmentation_reward": 0.7345042824745178, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 144.85417556762695, | |
| "epoch": 0.0045480386583285955, | |
| "grad_norm": 8.010404586791992, | |
| "kl": 0.0012912750244140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.6437489092350006, | |
| "reward_std": 0.2732698582112789, | |
| "rewards/format_reward": 0.9166667014360428, | |
| "rewards/segmentation_reward": 0.7270822674036026, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 150.1041717529297, | |
| "epoch": 0.006822057987492893, | |
| "grad_norm": 12.285407066345215, | |
| "kl": 0.0012798309326171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.53373184800148, | |
| "reward_std": 0.43030911684036255, | |
| "rewards/format_reward": 0.8437500149011612, | |
| "rewards/segmentation_reward": 0.6899818480014801, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 147.12500381469727, | |
| "epoch": 0.009096077316657191, | |
| "grad_norm": 10.881176948547363, | |
| "kl": 0.0028820037841796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.6048874258995056, | |
| "reward_std": 0.3273423947393894, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "rewards/segmentation_reward": 0.7090541273355484, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 144.3958396911621, | |
| "epoch": 0.01137009664582149, | |
| "grad_norm": 12.837152481079102, | |
| "kl": 0.002300262451171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.611760675907135, | |
| "reward_std": 0.37994210980832577, | |
| "rewards/format_reward": 0.9062500149011612, | |
| "rewards/segmentation_reward": 0.7055106610059738, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 139.15625381469727, | |
| "epoch": 0.013644115974985787, | |
| "grad_norm": 6.645235061645508, | |
| "kl": 0.0039215087890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.726276457309723, | |
| "reward_std": 0.24428023397922516, | |
| "rewards/format_reward": 0.9479166865348816, | |
| "rewards/segmentation_reward": 0.7783599197864532, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 150.1145896911621, | |
| "epoch": 0.015918135304150087, | |
| "grad_norm": 12.681654930114746, | |
| "kl": 0.0052032470703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.7287286818027496, | |
| "reward_std": 0.23665708303451538, | |
| "rewards/format_reward": 0.9479166865348816, | |
| "rewards/segmentation_reward": 0.7808119505643845, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 136.68750381469727, | |
| "epoch": 0.018192154633314382, | |
| "grad_norm": 8.222872734069824, | |
| "kl": 0.00634002685546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.8023037910461426, | |
| "reward_std": 0.09579922584816813, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8127204030752182, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 145.3645896911621, | |
| "epoch": 0.02046617396247868, | |
| "grad_norm": 12.586159706115723, | |
| "kl": 0.0068511962890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.6984942257404327, | |
| "reward_std": 0.24017422273755074, | |
| "rewards/format_reward": 0.9375000298023224, | |
| "rewards/segmentation_reward": 0.7609941959381104, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 136.4895896911621, | |
| "epoch": 0.02274019329164298, | |
| "grad_norm": 19.224149703979492, | |
| "kl": 0.008087158203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.817267656326294, | |
| "reward_std": 0.10884078592061996, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8276843428611755, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 150.14583587646484, | |
| "epoch": 0.025014212620807278, | |
| "grad_norm": 17.966585159301758, | |
| "kl": 0.0106658935546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.6834727227687836, | |
| "reward_std": 0.3056778460741043, | |
| "rewards/format_reward": 0.927083358168602, | |
| "rewards/segmentation_reward": 0.7563893795013428, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 139.58333587646484, | |
| "epoch": 0.027288231949971573, | |
| "grad_norm": 12.995298385620117, | |
| "kl": 0.0103912353515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.7785775661468506, | |
| "reward_std": 0.1881927289068699, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.8202441930770874, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 134.56250762939453, | |
| "epoch": 0.029562251279135872, | |
| "grad_norm": 5.952467918395996, | |
| "kl": 0.0095977783203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.8030498623847961, | |
| "reward_std": 0.15592540614306927, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.8342998623847961, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 132.7291717529297, | |
| "epoch": 0.031836270608300174, | |
| "grad_norm": 14.455424308776855, | |
| "kl": 0.01409912109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.8657137751579285, | |
| "reward_std": 0.04672300070524216, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8657138496637344, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 134.8229217529297, | |
| "epoch": 0.03411028993746447, | |
| "grad_norm": 16.328163146972656, | |
| "kl": 0.015960693359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.7946673929691315, | |
| "reward_std": 0.1416209153831005, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.8259173631668091, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 129.48958778381348, | |
| "epoch": 0.036384309266628764, | |
| "grad_norm": 10.300512313842773, | |
| "kl": 0.016510009765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8089908957481384, | |
| "reward_std": 0.11957723228260875, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8298242688179016, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 130.70833587646484, | |
| "epoch": 0.038658328595793066, | |
| "grad_norm": 7.11127233505249, | |
| "kl": 0.01666259765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8404812514781952, | |
| "reward_std": 0.09966395457740873, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.86131452023983, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 131.60417366027832, | |
| "epoch": 0.04093234792495736, | |
| "grad_norm": 34.26894760131836, | |
| "kl": 0.015533447265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.8397277891635895, | |
| "reward_std": 0.0865055019967258, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8605611473321915, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 128.85416984558105, | |
| "epoch": 0.04320636725412166, | |
| "grad_norm": 8.23103141784668, | |
| "kl": 0.018310546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8531556725502014, | |
| "reward_std": 0.02550937162595801, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8531556576490402, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 130.83333587646484, | |
| "epoch": 0.04548038658328596, | |
| "grad_norm": 11.679638862609863, | |
| "kl": 0.0172882080078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8632822334766388, | |
| "reward_std": 0.04053949285298586, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8632822781801224, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 130.02083587646484, | |
| "epoch": 0.047754405912450254, | |
| "grad_norm": 57.03630447387695, | |
| "kl": 0.0178070068359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8496468663215637, | |
| "reward_std": 0.03478804882615805, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8496468216180801, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 130.65625190734863, | |
| "epoch": 0.050028425241614556, | |
| "grad_norm": 28.802846908569336, | |
| "kl": 0.0260009765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.8245560228824615, | |
| "reward_std": 0.1071182056912221, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.84538933634758, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 128.0937557220459, | |
| "epoch": 0.05230244457077885, | |
| "grad_norm": 10.76288890838623, | |
| "kl": 0.0197296142578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.7964556813240051, | |
| "reward_std": 0.129461950622499, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8172890096902847, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 130.0520896911621, | |
| "epoch": 0.054576463899943146, | |
| "grad_norm": 6.2558064460754395, | |
| "kl": 0.0218505859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.8779499530792236, | |
| "reward_std": 0.023469227773603052, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.877949982881546, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 126.92708587646484, | |
| "epoch": 0.05685048322910745, | |
| "grad_norm": 10.53512954711914, | |
| "kl": 0.017486572265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8650535941123962, | |
| "reward_std": 0.03197958506643772, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8650535494089127, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 128.20833778381348, | |
| "epoch": 0.059124502558271744, | |
| "grad_norm": 37.11606216430664, | |
| "kl": 0.0267791748046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.832018405199051, | |
| "reward_std": 0.07763887383043766, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8424350172281265, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 132.5625057220459, | |
| "epoch": 0.061398521887436046, | |
| "grad_norm": 26.71733856201172, | |
| "kl": 0.019134521484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8609023690223694, | |
| "reward_std": 0.05331907293293625, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8713190257549286, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 129.0000057220459, | |
| "epoch": 0.06367254121660035, | |
| "grad_norm": 7.346284866333008, | |
| "kl": 0.017791748046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.806743562221527, | |
| "reward_std": 0.08053719438612461, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8171601891517639, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 128.45833778381348, | |
| "epoch": 0.06594656054576464, | |
| "grad_norm": 8.270977020263672, | |
| "kl": 0.0204315185546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.840296596288681, | |
| "reward_std": 0.0486476831138134, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8402965515851974, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 125.03125190734863, | |
| "epoch": 0.06822057987492894, | |
| "grad_norm": 8.62176513671875, | |
| "kl": 0.01678466796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8195985853672028, | |
| "reward_std": 0.07711292989552021, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8300152719020844, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 133.76041793823242, | |
| "epoch": 0.07049459920409323, | |
| "grad_norm": 25.146360397338867, | |
| "kl": 0.017852783203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8119685649871826, | |
| "reward_std": 0.15049411728978157, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.853635236620903, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 127.13541984558105, | |
| "epoch": 0.07276861853325753, | |
| "grad_norm": 11.285983085632324, | |
| "kl": 0.020233154296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.8676734268665314, | |
| "reward_std": 0.04771583795081824, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8780900835990906, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 130.3541717529297, | |
| "epoch": 0.07504263786242182, | |
| "grad_norm": 58.88550567626953, | |
| "kl": 0.0172576904296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.797868400812149, | |
| "reward_std": 0.0878910388564691, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8082851022481918, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 128.39583587646484, | |
| "epoch": 0.07731665719158613, | |
| "grad_norm": 13.709405899047852, | |
| "kl": 0.022186279296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.8660337030887604, | |
| "reward_std": 0.04777739220298827, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8660337030887604, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 127.22916984558105, | |
| "epoch": 0.07959067652075043, | |
| "grad_norm": 7.4637908935546875, | |
| "kl": 0.018890380859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.7917229533195496, | |
| "reward_std": 0.0735958176665008, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8021395653486252, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 126.66666984558105, | |
| "epoch": 0.08186469584991472, | |
| "grad_norm": 8.999519348144531, | |
| "kl": 0.02191162109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.7737447619438171, | |
| "reward_std": 0.10960677545517683, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7945781201124191, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 125.78125190734863, | |
| "epoch": 0.08413871517907902, | |
| "grad_norm": 10.515003204345703, | |
| "kl": 0.016815185546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8419047594070435, | |
| "reward_std": 0.03469831729307771, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8419047296047211, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 127.05208396911621, | |
| "epoch": 0.08641273450824331, | |
| "grad_norm": 25.71062660217285, | |
| "kl": 0.020843505859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.8053939044475555, | |
| "reward_std": 0.0809064069762826, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8158105462789536, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 124.10416793823242, | |
| "epoch": 0.08868675383740762, | |
| "grad_norm": 11.600313186645508, | |
| "kl": 0.0166168212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.856435239315033, | |
| "reward_std": 0.07491008564829826, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8668518215417862, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 123.9687557220459, | |
| "epoch": 0.09096077316657192, | |
| "grad_norm": 10.360515594482422, | |
| "kl": 0.022216796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.8669567108154297, | |
| "reward_std": 0.01574411618639715, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8669566959142685, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 125.32292366027832, | |
| "epoch": 0.09323479249573621, | |
| "grad_norm": 9.638788223266602, | |
| "kl": 0.022430419921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.8379901051521301, | |
| "reward_std": 0.0877154991030693, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.8692402094602585, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 123.43750190734863, | |
| "epoch": 0.09550881182490051, | |
| "grad_norm": 11.703614234924316, | |
| "kl": 0.0201416015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.896949291229248, | |
| "reward_std": 0.022165193455293775, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8969493061304092, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 121.48958969116211, | |
| "epoch": 0.0977828311540648, | |
| "grad_norm": 41.82573318481445, | |
| "kl": 0.0218505859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.8304626643657684, | |
| "reward_std": 0.08185502019478008, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8512959778308868, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 126.96875381469727, | |
| "epoch": 0.10005685048322911, | |
| "grad_norm": 9.426851272583008, | |
| "kl": 0.02276611328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.8303613364696503, | |
| "reward_std": 0.10597201343625784, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8511946946382523, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 119.44791984558105, | |
| "epoch": 0.10233086981239341, | |
| "grad_norm": 7.771895408630371, | |
| "kl": 0.025543212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.840701162815094, | |
| "reward_std": 0.07642269739881158, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8511178195476532, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 125.02083587646484, | |
| "epoch": 0.1046048891415577, | |
| "grad_norm": 8.715611457824707, | |
| "kl": 0.022735595703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.8369653820991516, | |
| "reward_std": 0.03333436418324709, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.836965337395668, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 123.79166984558105, | |
| "epoch": 0.106878908470722, | |
| "grad_norm": 15.2009859085083, | |
| "kl": 0.026092529296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.8661520779132843, | |
| "reward_std": 0.019087713153567165, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8661520928144455, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 128.84375381469727, | |
| "epoch": 0.10915292779988629, | |
| "grad_norm": 14.767459869384766, | |
| "kl": 0.026611328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.820468544960022, | |
| "reward_std": 0.10572186904028058, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8413018435239792, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 122.78125190734863, | |
| "epoch": 0.1114269471290506, | |
| "grad_norm": 34.67581558227539, | |
| "kl": 0.0291748046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.8499588370323181, | |
| "reward_std": 0.06515861582010984, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8603754639625549, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 118.95833587646484, | |
| "epoch": 0.1137009664582149, | |
| "grad_norm": 23.052759170532227, | |
| "kl": 0.0341796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.851874828338623, | |
| "reward_std": 0.058454849757254124, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8622915297746658, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 118.75000190734863, | |
| "epoch": 0.11597498578737919, | |
| "grad_norm": 523.3176879882812, | |
| "kl": 0.0345458984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.835026115179062, | |
| "reward_std": 0.0287266579689458, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8350260555744171, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 126.53125190734863, | |
| "epoch": 0.11824900511654349, | |
| "grad_norm": 10.816275596618652, | |
| "kl": 0.040313720703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.755786508321762, | |
| "reward_std": 0.13628106890246272, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7870365083217621, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 125.81250190734863, | |
| "epoch": 0.12052302444570778, | |
| "grad_norm": 9.201781272888184, | |
| "kl": 0.03826904296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.8181837797164917, | |
| "reward_std": 0.06433252803981304, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8181838095188141, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 120.90625381469727, | |
| "epoch": 0.12279704377487209, | |
| "grad_norm": 15.950888633728027, | |
| "kl": 0.033416748046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.8185763359069824, | |
| "reward_std": 0.10011043888516724, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.839409664273262, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 125.96875381469727, | |
| "epoch": 0.12507106310403637, | |
| "grad_norm": 3.631739854812622, | |
| "kl": 0.03045654296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.85866180062294, | |
| "reward_std": 0.028809872455894947, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8586617559194565, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 120.3437557220459, | |
| "epoch": 0.1273450824332007, | |
| "grad_norm": 10.557564735412598, | |
| "kl": 0.034942626953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.8380783200263977, | |
| "reward_std": 0.07766020158305764, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8484949469566345, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 122.03125190734863, | |
| "epoch": 0.129619101762365, | |
| "grad_norm": 13.181836128234863, | |
| "kl": 0.032989501953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.8480607271194458, | |
| "reward_std": 0.06112843842129223, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.858477458357811, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 128.13541984558105, | |
| "epoch": 0.1318931210915293, | |
| "grad_norm": 9.518472671508789, | |
| "kl": 0.03125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.8364209532737732, | |
| "reward_std": 0.08701860439032316, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8468376249074936, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 121.95833778381348, | |
| "epoch": 0.13416714042069358, | |
| "grad_norm": 11.378217697143555, | |
| "kl": 0.03680419921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.818740427494049, | |
| "reward_std": 0.0671944273635745, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8395737260580063, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 121.41666984558105, | |
| "epoch": 0.13644115974985788, | |
| "grad_norm": 8.235391616821289, | |
| "kl": 0.035858154296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.8629357516765594, | |
| "reward_std": 0.04245928302407265, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8629357665777206, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 121.90625190734863, | |
| "epoch": 0.13871517907902217, | |
| "grad_norm": 9.360280990600586, | |
| "kl": 0.03076171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.8437756896018982, | |
| "reward_std": 0.03801816503982991, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8437757194042206, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 122.83333587646484, | |
| "epoch": 0.14098919840818647, | |
| "grad_norm": 11.74299144744873, | |
| "kl": 0.03961181640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8107888400554657, | |
| "reward_std": 0.09169099852442741, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8316220790147781, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 123.4062557220459, | |
| "epoch": 0.14326321773735076, | |
| "grad_norm": 10.003548622131348, | |
| "kl": 0.0406494140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8223926723003387, | |
| "reward_std": 0.07860782567877322, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.832809329032898, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 124.19792175292969, | |
| "epoch": 0.14553723706651506, | |
| "grad_norm": 8.069796562194824, | |
| "kl": 0.03680419921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.852884978055954, | |
| "reward_std": 0.06025872565805912, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8528849929571152, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 121.30208587646484, | |
| "epoch": 0.14781125639567935, | |
| "grad_norm": 7.896605014801025, | |
| "kl": 0.0350341796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.840564340353012, | |
| "reward_std": 0.06667589582502842, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8405643403530121, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 122.70833778381348, | |
| "epoch": 0.15008527572484365, | |
| "grad_norm": 6.732133865356445, | |
| "kl": 0.034759521484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.8494722843170166, | |
| "reward_std": 0.02315727563109249, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8494722992181778, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 125.42708587646484, | |
| "epoch": 0.15235929505400797, | |
| "grad_norm": 16.19176483154297, | |
| "kl": 0.027984619140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.8468118906021118, | |
| "reward_std": 0.03803225792944431, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.846811830997467, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 121.63541793823242, | |
| "epoch": 0.15463331438317227, | |
| "grad_norm": 8.936660766601562, | |
| "kl": 0.030914306640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.8819967806339264, | |
| "reward_std": 0.01786850136704743, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.881996750831604, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 128.8750057220459, | |
| "epoch": 0.15690733371233656, | |
| "grad_norm": 8.807208061218262, | |
| "kl": 0.04010009765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8393568098545074, | |
| "reward_std": 0.02671552257379517, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8393567949533463, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 127.10417175292969, | |
| "epoch": 0.15918135304150086, | |
| "grad_norm": 9.790465354919434, | |
| "kl": 0.032501220703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.8589565753936768, | |
| "reward_std": 0.07576595386490226, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8693732172250748, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 128.9895839691162, | |
| "epoch": 0.16145537237066515, | |
| "grad_norm": 132.23687744140625, | |
| "kl": 0.03436279296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.782070904970169, | |
| "reward_std": 0.050592198269441724, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7820708751678467, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 135.29167556762695, | |
| "epoch": 0.16372939169982945, | |
| "grad_norm": 9.341756820678711, | |
| "kl": 0.02923583984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.8484710454940796, | |
| "reward_std": 0.03445305596687831, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.848471000790596, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 126.71875381469727, | |
| "epoch": 0.16600341102899374, | |
| "grad_norm": 9.518696784973145, | |
| "kl": 0.029144287109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.8627153933048248, | |
| "reward_std": 0.046468528802506626, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8731320649385452, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 127.64583969116211, | |
| "epoch": 0.16827743035815804, | |
| "grad_norm": 17.517793655395508, | |
| "kl": 0.032470703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.8466759324073792, | |
| "reward_std": 0.027572712278924882, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8466758877038956, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 128.7291717529297, | |
| "epoch": 0.17055144968732233, | |
| "grad_norm": 8.034920692443848, | |
| "kl": 0.03302001953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.8586640655994415, | |
| "reward_std": 0.028676262591034174, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.858664020895958, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 128.2187557220459, | |
| "epoch": 0.17282546901648663, | |
| "grad_norm": 9.858979225158691, | |
| "kl": 0.030853271484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.8392693102359772, | |
| "reward_std": 0.050932126585394144, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8392692804336548, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 135.33333587646484, | |
| "epoch": 0.17509948834565095, | |
| "grad_norm": 10.723231315612793, | |
| "kl": 0.03717041015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.864184021949768, | |
| "reward_std": 0.021687635337002575, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8641840219497681, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 125.73958587646484, | |
| "epoch": 0.17737350767481525, | |
| "grad_norm": 7.419450283050537, | |
| "kl": 0.04266357421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.8617435693740845, | |
| "reward_std": 0.032619446399621665, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8617434948682785, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 135.34375381469727, | |
| "epoch": 0.17964752700397954, | |
| "grad_norm": 6.787795543670654, | |
| "kl": 0.0318603515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.8563005030155182, | |
| "reward_std": 0.01326985149353277, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.856300488114357, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 134.9791717529297, | |
| "epoch": 0.18192154633314384, | |
| "grad_norm": 12.446442604064941, | |
| "kl": 0.03302001953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.8413802683353424, | |
| "reward_std": 0.02990832203067839, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8413802236318588, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 131.05208587646484, | |
| "epoch": 0.18419556566230813, | |
| "grad_norm": 7.766218185424805, | |
| "kl": 0.031982421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.8655352890491486, | |
| "reward_std": 0.03306874120607972, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8655352592468262, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 130.08333778381348, | |
| "epoch": 0.18646958499147243, | |
| "grad_norm": 7.591801166534424, | |
| "kl": 0.035125732421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.8489001095294952, | |
| "reward_std": 0.038488025951664895, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8593167364597321, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 127.83333587646484, | |
| "epoch": 0.18874360432063672, | |
| "grad_norm": 8.222685813903809, | |
| "kl": 0.0341796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.8844203352928162, | |
| "reward_std": 0.03254084661602974, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.884420245885849, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 128.39583778381348, | |
| "epoch": 0.19101762364980102, | |
| "grad_norm": 16.960952758789062, | |
| "kl": 0.0391845703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8208307921886444, | |
| "reward_std": 0.055154044879600406, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8312473893165588, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 132.9583396911621, | |
| "epoch": 0.1932916429789653, | |
| "grad_norm": 5.855716705322266, | |
| "kl": 0.037109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.86465585231781, | |
| "reward_std": 0.028004995780065656, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8646559119224548, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 130.6666717529297, | |
| "epoch": 0.1955656623081296, | |
| "grad_norm": 9.59054946899414, | |
| "kl": 0.03985595703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.827312707901001, | |
| "reward_std": 0.08010158874094486, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8377293199300766, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 134.71875, | |
| "epoch": 0.19783968163729393, | |
| "grad_norm": 18.06949234008789, | |
| "kl": 0.0369873046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.8673449456691742, | |
| "reward_std": 0.04852295899763703, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8777615576982498, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 136.03125381469727, | |
| "epoch": 0.20011370096645822, | |
| "grad_norm": 7.5427937507629395, | |
| "kl": 0.0423583984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.8519779443740845, | |
| "reward_std": 0.06849909643642604, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8623945862054825, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 135.9479217529297, | |
| "epoch": 0.20238772029562252, | |
| "grad_norm": 12.732763290405273, | |
| "kl": 0.03277587890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.8459124863147736, | |
| "reward_std": 0.07695996854454279, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8563291132450104, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 134.12500381469727, | |
| "epoch": 0.20466173962478681, | |
| "grad_norm": 7.050863265991211, | |
| "kl": 0.0347900390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.837898164987564, | |
| "reward_std": 0.015308346832171082, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8378981947898865, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 133.28125381469727, | |
| "epoch": 0.2069357589539511, | |
| "grad_norm": 38.80855178833008, | |
| "kl": 0.038330078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.863549381494522, | |
| "reward_std": 0.03241122025065124, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8635492920875549, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 134.31250381469727, | |
| "epoch": 0.2092097782831154, | |
| "grad_norm": 12.962533950805664, | |
| "kl": 0.03558349609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.8355351388454437, | |
| "reward_std": 0.07070542359724641, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8459518104791641, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 132.1770896911621, | |
| "epoch": 0.2114837976122797, | |
| "grad_norm": 8.060502052307129, | |
| "kl": 0.0416259765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.847087800502777, | |
| "reward_std": 0.031698971055448055, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8470877408981323, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 135.55208587646484, | |
| "epoch": 0.213757816941444, | |
| "grad_norm": 14.366216659545898, | |
| "kl": 0.04425048828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8385899066925049, | |
| "reward_std": 0.06835441221483052, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8490065485239029, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 127.10416793823242, | |
| "epoch": 0.2160318362706083, | |
| "grad_norm": 26.700092315673828, | |
| "kl": 0.0390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8634757697582245, | |
| "reward_std": 0.008791875996394083, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8634757250547409, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 129.25000381469727, | |
| "epoch": 0.21830585559977259, | |
| "grad_norm": 4.697238445281982, | |
| "kl": 0.0408935546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8333858251571655, | |
| "reward_std": 0.08207701286301017, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8438025414943695, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 139.67709350585938, | |
| "epoch": 0.2205798749289369, | |
| "grad_norm": 18.640583038330078, | |
| "kl": 0.04376220703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.8212727904319763, | |
| "reward_std": 0.10704211867414415, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8421061336994171, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 132.62500381469727, | |
| "epoch": 0.2228538942581012, | |
| "grad_norm": 10.532288551330566, | |
| "kl": 0.0390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7843947410583496, | |
| "reward_std": 0.07167254062369466, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.79481141269207, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 133.50000381469727, | |
| "epoch": 0.2251279135872655, | |
| "grad_norm": 10.3671875, | |
| "kl": 0.037353515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.82876256108284, | |
| "reward_std": 0.10020078788511455, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.849595919251442, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 131.75000381469727, | |
| "epoch": 0.2274019329164298, | |
| "grad_norm": 13.147160530090332, | |
| "kl": 0.0452880859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8422828912734985, | |
| "reward_std": 0.08142339263577014, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8631161749362946, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 129.8229217529297, | |
| "epoch": 0.2296759522455941, | |
| "grad_norm": 16.519821166992188, | |
| "kl": 0.034637451171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.806572288274765, | |
| "reward_std": 0.04188450565561652, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8065722435712814, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 129.39583778381348, | |
| "epoch": 0.23194997157475838, | |
| "grad_norm": 12.152297019958496, | |
| "kl": 0.041015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8586591482162476, | |
| "reward_std": 0.06601439183577895, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8794925063848495, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 128.5000057220459, | |
| "epoch": 0.23422399090392268, | |
| "grad_norm": 8.767850875854492, | |
| "kl": 0.03546142578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.7989584803581238, | |
| "reward_std": 0.11034804070368409, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8197918385267258, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 129.6145839691162, | |
| "epoch": 0.23649801023308697, | |
| "grad_norm": 8.40770435333252, | |
| "kl": 0.03826904296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.8321227729320526, | |
| "reward_std": 0.06688260892406106, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8425393849611282, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 133.00000762939453, | |
| "epoch": 0.23877202956225127, | |
| "grad_norm": 66.30607604980469, | |
| "kl": 0.05133056640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.765193372964859, | |
| "reward_std": 0.14037537574768066, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.806860014796257, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 126.62500381469727, | |
| "epoch": 0.24104604889141557, | |
| "grad_norm": 10.172347068786621, | |
| "kl": 0.0382080078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.8895207047462463, | |
| "reward_std": 0.02245330944424495, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8895206451416016, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 135.06250762939453, | |
| "epoch": 0.2433200682205799, | |
| "grad_norm": 8.362109184265137, | |
| "kl": 0.041168212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.806322306394577, | |
| "reward_std": 0.12358620949089527, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/segmentation_reward": 0.8375722914934158, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 132.0104217529297, | |
| "epoch": 0.24559408754974418, | |
| "grad_norm": 8.960740089416504, | |
| "kl": 0.040069580078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8282636106014252, | |
| "reward_std": 0.08110124431550503, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8386802226305008, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 134.71875381469727, | |
| "epoch": 0.24786810687890848, | |
| "grad_norm": 20.807939529418945, | |
| "kl": 0.0418701171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8370684087276459, | |
| "reward_std": 0.03879292996134609, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8474850803613663, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 132.73958587646484, | |
| "epoch": 0.25014212620807275, | |
| "grad_norm": 19.63970184326172, | |
| "kl": 0.04046630859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8565462529659271, | |
| "reward_std": 0.05808442225679755, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8669629096984863, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 131.5104217529297, | |
| "epoch": 0.25241614553723707, | |
| "grad_norm": 13.530229568481445, | |
| "kl": 0.03961181640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.869715929031372, | |
| "reward_std": 0.028540480285300873, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8697158843278885, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 136.64583778381348, | |
| "epoch": 0.2546901648664014, | |
| "grad_norm": 27.007661819458008, | |
| "kl": 0.039306640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7972800433635712, | |
| "reward_std": 0.09525090921670198, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8181134164333344, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 135.8645896911621, | |
| "epoch": 0.25696418419556566, | |
| "grad_norm": 10.786030769348145, | |
| "kl": 0.03765869140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.8435862362384796, | |
| "reward_std": 0.07883737958036363, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8540029078722, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 126.04166984558105, | |
| "epoch": 0.25923820352473, | |
| "grad_norm": 9.88869857788086, | |
| "kl": 0.0394287109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8222769498825073, | |
| "reward_std": 0.07929788623005152, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8222769200801849, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 131.81250381469727, | |
| "epoch": 0.26151222285389425, | |
| "grad_norm": 45.116451263427734, | |
| "kl": 0.04217529296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.8684912621974945, | |
| "reward_std": 0.04528397601097822, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8684912025928497, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 131.7291717529297, | |
| "epoch": 0.2637862421830586, | |
| "grad_norm": 6.04058313369751, | |
| "kl": 0.047119140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.7711567878723145, | |
| "reward_std": 0.1284920796751976, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7919900417327881, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 127.82292175292969, | |
| "epoch": 0.26606026151222284, | |
| "grad_norm": 9.686407089233398, | |
| "kl": 0.0443115234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8393816649913788, | |
| "reward_std": 0.07739171921275556, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8497983366250992, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 126.60416984558105, | |
| "epoch": 0.26833428084138716, | |
| "grad_norm": 13.846648216247559, | |
| "kl": 0.069580078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.7880859076976776, | |
| "reward_std": 0.1653798259794712, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.8297525644302368, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 125.22916793823242, | |
| "epoch": 0.27060830017055143, | |
| "grad_norm": 14.052881240844727, | |
| "kl": 0.0400390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8796150386333466, | |
| "reward_std": 0.028668402694165707, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.879614993929863, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 120.08333778381348, | |
| "epoch": 0.27288231949971575, | |
| "grad_norm": 6.430720806121826, | |
| "kl": 0.04559326171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8220676481723785, | |
| "reward_std": 0.08830677217338234, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8429009765386581, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 124.82291984558105, | |
| "epoch": 0.27515633882888, | |
| "grad_norm": 19.418067932128906, | |
| "kl": 0.04193115234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.8649601340293884, | |
| "reward_std": 0.02245999814476818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8649601340293884, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 130.38541984558105, | |
| "epoch": 0.27743035815804434, | |
| "grad_norm": 32.728607177734375, | |
| "kl": 0.0396728515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8011479675769806, | |
| "reward_std": 0.10339335759636015, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.821981281042099, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 131.51041984558105, | |
| "epoch": 0.27970437748720867, | |
| "grad_norm": 10.91059398651123, | |
| "kl": 0.0413818359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.838850736618042, | |
| "reward_std": 0.02782218554057181, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.838850736618042, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 125.80208778381348, | |
| "epoch": 0.28197839681637293, | |
| "grad_norm": 8.754847526550293, | |
| "kl": 0.0377197265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.8477693796157837, | |
| "reward_std": 0.04511198558611795, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8581860810518265, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 131.50000190734863, | |
| "epoch": 0.28425241614553726, | |
| "grad_norm": 41.41383743286133, | |
| "kl": 0.03546142578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.8574239313602448, | |
| "reward_std": 0.02917448477819562, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8574239611625671, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 124.41666984558105, | |
| "epoch": 0.2865264354747015, | |
| "grad_norm": 8.908427238464355, | |
| "kl": 0.041259765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8553960621356964, | |
| "reward_std": 0.06732171808835119, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8658127784729004, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 138.18750381469727, | |
| "epoch": 0.28880045480386585, | |
| "grad_norm": 15.392871856689453, | |
| "kl": 0.033538818359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.8320258855819702, | |
| "reward_std": 0.06953636615071446, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8424425423145294, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 128.70833778381348, | |
| "epoch": 0.2910744741330301, | |
| "grad_norm": 5.235471248626709, | |
| "kl": 0.03948974609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.823939561843872, | |
| "reward_std": 0.10366934072226286, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8447728455066681, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 129.0625057220459, | |
| "epoch": 0.29334849346219444, | |
| "grad_norm": 18.025480270385742, | |
| "kl": 0.0374755859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.8658169209957123, | |
| "reward_std": 0.017597037134692073, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8658169209957123, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 132.44792366027832, | |
| "epoch": 0.2956225127913587, | |
| "grad_norm": 10.100214004516602, | |
| "kl": 0.03912353515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.819077491760254, | |
| "reward_std": 0.07401184504851699, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8294941633939743, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 131.10417556762695, | |
| "epoch": 0.297896532120523, | |
| "grad_norm": 10.517773628234863, | |
| "kl": 0.03955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8419454395771027, | |
| "reward_std": 0.06657789507880807, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8523620814085007, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 131.1250057220459, | |
| "epoch": 0.3001705514496873, | |
| "grad_norm": 15.198212623596191, | |
| "kl": 0.033355712890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.861166775226593, | |
| "reward_std": 0.02897452423349023, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8611667454242706, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 129.7708396911621, | |
| "epoch": 0.3024445707788516, | |
| "grad_norm": 10.940967559814453, | |
| "kl": 0.03472900390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.8530578315258026, | |
| "reward_std": 0.0734487110748887, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.873891144990921, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 127.71875381469727, | |
| "epoch": 0.30471859010801594, | |
| "grad_norm": 7.945354461669922, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.7999140620231628, | |
| "reward_std": 0.06981736817397177, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8207473605871201, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 133.4791717529297, | |
| "epoch": 0.3069926094371802, | |
| "grad_norm": 8.708114624023438, | |
| "kl": 0.041259765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8323033154010773, | |
| "reward_std": 0.07405243627727032, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8427200168371201, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 129.3541717529297, | |
| "epoch": 0.30926662876634453, | |
| "grad_norm": 39.540550231933594, | |
| "kl": 0.0394287109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8501177728176117, | |
| "reward_std": 0.06709112878888845, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8605344444513321, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 124.79166793823242, | |
| "epoch": 0.3115406480955088, | |
| "grad_norm": 6.668416976928711, | |
| "kl": 0.0390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8371129930019379, | |
| "reward_std": 0.05754397192504257, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8475296497344971, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 132.17708587646484, | |
| "epoch": 0.3138146674246731, | |
| "grad_norm": 18.357772827148438, | |
| "kl": 0.03692626953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.8030109107494354, | |
| "reward_std": 0.04487704858183861, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.803010955452919, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 123.05208969116211, | |
| "epoch": 0.3160886867538374, | |
| "grad_norm": 12.917668342590332, | |
| "kl": 0.04083251953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.819540798664093, | |
| "reward_std": 0.04616073609213345, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8299574255943298, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 128.93750381469727, | |
| "epoch": 0.3183627060830017, | |
| "grad_norm": 8.16073226928711, | |
| "kl": 0.03961181640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.869756668806076, | |
| "reward_std": 0.023664554115384817, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8697566390037537, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 130.42708587646484, | |
| "epoch": 0.320636725412166, | |
| "grad_norm": 19.300662994384766, | |
| "kl": 0.04595947265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8823592960834503, | |
| "reward_std": 0.027183939702808857, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8823592811822891, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 121.57292175292969, | |
| "epoch": 0.3229107447413303, | |
| "grad_norm": 8.414497375488281, | |
| "kl": 0.050537109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8237576484680176, | |
| "reward_std": 0.06241214391775429, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8341742604970932, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 127.01041984558105, | |
| "epoch": 0.3251847640704946, | |
| "grad_norm": 7.633127689361572, | |
| "kl": 0.041259765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.8560876250267029, | |
| "reward_std": 0.04744653377565555, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8665042370557785, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 125.28125190734863, | |
| "epoch": 0.3274587833996589, | |
| "grad_norm": 13.39786148071289, | |
| "kl": 0.0440673828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8448957204818726, | |
| "reward_std": 0.017382028454449028, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8448957204818726, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 120.63541984558105, | |
| "epoch": 0.3297328027288232, | |
| "grad_norm": 10.50205135345459, | |
| "kl": 0.0458984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.838788241147995, | |
| "reward_std": 0.08608005382120609, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8492048978805542, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 119.38541793823242, | |
| "epoch": 0.3320068220579875, | |
| "grad_norm": 5.443851470947266, | |
| "kl": 0.04302978515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.8452240228652954, | |
| "reward_std": 0.05149654616252519, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.855640709400177, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 122.40625190734863, | |
| "epoch": 0.3342808413871518, | |
| "grad_norm": 6.090450286865234, | |
| "kl": 0.0438232421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.856435388326645, | |
| "reward_std": 0.04737340519204736, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8564353585243225, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 119.72916984558105, | |
| "epoch": 0.3365548607163161, | |
| "grad_norm": 6.615400314331055, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.826998233795166, | |
| "reward_std": 0.06745404587127268, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8374148905277252, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 126.30208778381348, | |
| "epoch": 0.3388288800454804, | |
| "grad_norm": 12.282633781433105, | |
| "kl": 0.057373046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8556068539619446, | |
| "reward_std": 0.02449450278072618, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8556068539619446, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 125.31250381469727, | |
| "epoch": 0.34110289937464466, | |
| "grad_norm": 6.970208644866943, | |
| "kl": 0.0443115234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8693890571594238, | |
| "reward_std": 0.01952762738801539, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.869389072060585, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 125.59375, | |
| "epoch": 0.343376918703809, | |
| "grad_norm": 45.453697204589844, | |
| "kl": 0.0531005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8583467900753021, | |
| "reward_std": 0.051462399773299694, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8687634319067001, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 124.11458587646484, | |
| "epoch": 0.34565093803297325, | |
| "grad_norm": 20.52977752685547, | |
| "kl": 0.0479736328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.8123543560504913, | |
| "reward_std": 0.0828116275370121, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8227709978818893, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 118.43750190734863, | |
| "epoch": 0.3479249573621376, | |
| "grad_norm": 11.727397918701172, | |
| "kl": 0.04229736328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.867412656545639, | |
| "reward_std": 0.025605608825571835, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8674126118421555, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 124.67708587646484, | |
| "epoch": 0.3501989766913019, | |
| "grad_norm": 11.057373046875, | |
| "kl": 0.0531005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.869041621685028, | |
| "reward_std": 0.026430562138557434, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8690416067838669, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 129.88541984558105, | |
| "epoch": 0.35247299602046617, | |
| "grad_norm": 9.447416305541992, | |
| "kl": 0.04443359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.7960728704929352, | |
| "reward_std": 0.06973322853446007, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8064895123243332, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 126.4687557220459, | |
| "epoch": 0.3547470153496305, | |
| "grad_norm": 25.426660537719727, | |
| "kl": 0.19091796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0076, | |
| "reward": 1.8375684916973114, | |
| "reward_std": 0.05996000056620687, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8479850590229034, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 123.61458587646484, | |
| "epoch": 0.35702103467879476, | |
| "grad_norm": 7.227417469024658, | |
| "kl": 0.04864501953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8216514885425568, | |
| "reward_std": 0.07541643898002803, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8320681154727936, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 118.41666984558105, | |
| "epoch": 0.3592950540079591, | |
| "grad_norm": 7.97116756439209, | |
| "kl": 0.07366943359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.86759752035141, | |
| "reward_std": 0.014220859797205776, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8675975650548935, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 127.57291984558105, | |
| "epoch": 0.36156907333712335, | |
| "grad_norm": 9.09946346282959, | |
| "kl": 0.05206298828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8287563920021057, | |
| "reward_std": 0.02918955238419585, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8287564218044281, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 127.68750381469727, | |
| "epoch": 0.36384309266628767, | |
| "grad_norm": 37.89695358276367, | |
| "kl": 0.0693359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.8446174263954163, | |
| "reward_std": 0.03227622219128534, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8446174561977386, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 123.51041984558105, | |
| "epoch": 0.36611711199545194, | |
| "grad_norm": 10.888017654418945, | |
| "kl": 0.0460205078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8628927171230316, | |
| "reward_std": 0.022265097475610673, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8628927171230316, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 123.52083587646484, | |
| "epoch": 0.36839113132461626, | |
| "grad_norm": 31.744691848754883, | |
| "kl": 0.0421142578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.8360488712787628, | |
| "reward_std": 0.04995149944443256, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8464655578136444, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 124.39583778381348, | |
| "epoch": 0.3706651506537806, | |
| "grad_norm": 119.80357360839844, | |
| "kl": 0.0450439453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.870296448469162, | |
| "reward_std": 0.02627503650728613, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8702964633703232, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 126.23958587646484, | |
| "epoch": 0.37293916998294485, | |
| "grad_norm": 13.347270011901855, | |
| "kl": 0.0462646484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8231273889541626, | |
| "reward_std": 0.027737511321902275, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.823127418756485, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 121.46875381469727, | |
| "epoch": 0.3752131893121092, | |
| "grad_norm": 6.548112869262695, | |
| "kl": 0.046142578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8643255233764648, | |
| "reward_std": 0.020169232942862436, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8643255233764648, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 122.41666984558105, | |
| "epoch": 0.37748720864127344, | |
| "grad_norm": 6.858768463134766, | |
| "kl": 0.04974365234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8725146353244781, | |
| "reward_std": 0.01943917891185265, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.872514620423317, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 131.04166984558105, | |
| "epoch": 0.37976122797043776, | |
| "grad_norm": 8.755220413208008, | |
| "kl": 0.044677734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8255141377449036, | |
| "reward_std": 0.06648333976045251, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.835930809378624, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 126.77083778381348, | |
| "epoch": 0.38203524729960203, | |
| "grad_norm": 13.69814682006836, | |
| "kl": 0.04913330078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8552350401878357, | |
| "reward_std": 0.055759434937499464, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8656516671180725, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 123.25, | |
| "epoch": 0.38430926662876636, | |
| "grad_norm": 9.107439994812012, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.869709074497223, | |
| "reward_std": 0.008492362350807525, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8697090148925781, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 128.6145896911621, | |
| "epoch": 0.3865832859579306, | |
| "grad_norm": 11.448233604431152, | |
| "kl": 0.04742431640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.8550761342048645, | |
| "reward_std": 0.023954114876687527, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8550761044025421, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 126.64583396911621, | |
| "epoch": 0.38885730528709495, | |
| "grad_norm": 10.13017463684082, | |
| "kl": 0.04632568359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.8708954453468323, | |
| "reward_std": 0.03041057422524318, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8708954006433487, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 127.83333396911621, | |
| "epoch": 0.3911313246162592, | |
| "grad_norm": 22.73763084411621, | |
| "kl": 0.04736328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.8603352308273315, | |
| "reward_std": 0.02450424269773066, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8603352308273315, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 124.68750381469727, | |
| "epoch": 0.39340534394542354, | |
| "grad_norm": 9.25720500946045, | |
| "kl": 0.04620361328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.812343418598175, | |
| "reward_std": 0.02242008870234713, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8123434334993362, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 124.85417175292969, | |
| "epoch": 0.39567936327458786, | |
| "grad_norm": 15.994894981384277, | |
| "kl": 0.0506591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8353284001350403, | |
| "reward_std": 0.0779900832567364, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8457450717687607, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 130.52083778381348, | |
| "epoch": 0.3979533826037521, | |
| "grad_norm": 21.059858322143555, | |
| "kl": 0.0555419921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.7730466425418854, | |
| "reward_std": 0.08199177589267492, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.783463254570961, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 130.9062557220459, | |
| "epoch": 0.40022740193291645, | |
| "grad_norm": 28.807546615600586, | |
| "kl": 0.04840087890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.8387254476547241, | |
| "reward_std": 0.05101281497627497, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8491421639919281, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 132.1770896911621, | |
| "epoch": 0.4025014212620807, | |
| "grad_norm": 8.303629875183105, | |
| "kl": 0.0477294921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.8762327134609222, | |
| "reward_std": 0.021599826373858377, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8762326389551163, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 130.45834159851074, | |
| "epoch": 0.40477544059124504, | |
| "grad_norm": 5.3660359382629395, | |
| "kl": 0.04571533203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.8499539494514465, | |
| "reward_std": 0.0204410245642066, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.849953904747963, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 130.28125190734863, | |
| "epoch": 0.4070494599204093, | |
| "grad_norm": 9.386882781982422, | |
| "kl": 0.0966796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.824854463338852, | |
| "reward_std": 0.029640484135597944, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8248543739318848, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 125.65625, | |
| "epoch": 0.40932347924957363, | |
| "grad_norm": 10.826035499572754, | |
| "kl": 0.05224609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8699792921543121, | |
| "reward_std": 0.05197575513739139, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8803958892822266, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 134.8750057220459, | |
| "epoch": 0.4115974985787379, | |
| "grad_norm": 9.238359451293945, | |
| "kl": 0.05242919921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.881340116262436, | |
| "reward_std": 0.020922310650348663, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8813401609659195, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 127.14583778381348, | |
| "epoch": 0.4138715179079022, | |
| "grad_norm": 5.321325302124023, | |
| "kl": 0.04754638671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.850901871919632, | |
| "reward_std": 0.0263338660588488, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.850901871919632, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 127.5312557220459, | |
| "epoch": 0.4161455372370665, | |
| "grad_norm": 27.181861877441406, | |
| "kl": 0.0631103515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8269298076629639, | |
| "reward_std": 0.08564014174044132, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8373464494943619, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 124.05208587646484, | |
| "epoch": 0.4184195565662308, | |
| "grad_norm": 7.739187717437744, | |
| "kl": 0.05078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8764528036117554, | |
| "reward_std": 0.0346121295588091, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8764527887105942, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 127.40625190734863, | |
| "epoch": 0.42069357589539513, | |
| "grad_norm": 5.463039398193359, | |
| "kl": 0.0523681640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8427515029907227, | |
| "reward_std": 0.03878836310468614, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8427514582872391, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 127.60416984558105, | |
| "epoch": 0.4229675952245594, | |
| "grad_norm": 6.220778465270996, | |
| "kl": 0.04833984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.839966058731079, | |
| "reward_std": 0.0670549722854048, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8503826707601547, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 128.42708587646484, | |
| "epoch": 0.4252416145537237, | |
| "grad_norm": 7.694529056549072, | |
| "kl": 0.0487060546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.80168816447258, | |
| "reward_std": 0.09221077559050173, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8225214183330536, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 123.45833587646484, | |
| "epoch": 0.427515633882888, | |
| "grad_norm": 8.434104919433594, | |
| "kl": 0.0533447265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8583765625953674, | |
| "reward_std": 0.027193676389288157, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.858376607298851, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 121.63541793823242, | |
| "epoch": 0.4297896532120523, | |
| "grad_norm": 12.074097633361816, | |
| "kl": 0.06329345703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8214238584041595, | |
| "reward_std": 0.06923552230000496, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8214238435029984, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 124.48958587646484, | |
| "epoch": 0.4320636725412166, | |
| "grad_norm": 12.668107986450195, | |
| "kl": 0.0615234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8321338295936584, | |
| "reward_std": 0.042182555072940886, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.84255051612854, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 128.0208339691162, | |
| "epoch": 0.4343376918703809, | |
| "grad_norm": 9.82055950164795, | |
| "kl": 0.06005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.81814506649971, | |
| "reward_std": 0.06698361551389098, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8181450814008713, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 120.97916793823242, | |
| "epoch": 0.43661171119954517, | |
| "grad_norm": 16.151336669921875, | |
| "kl": 0.072265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.8266111612319946, | |
| "reward_std": 0.09967668447643518, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8370277732610703, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 122.00000381469727, | |
| "epoch": 0.4388857305287095, | |
| "grad_norm": 7.378772735595703, | |
| "kl": 0.06793212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.807835042476654, | |
| "reward_std": 0.12562582828104496, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8286683708429337, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 123.76042175292969, | |
| "epoch": 0.4411597498578738, | |
| "grad_norm": 26.526634216308594, | |
| "kl": 0.0714111328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.854819893836975, | |
| "reward_std": 0.039443244226276875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8548198789358139, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 121.32291984558105, | |
| "epoch": 0.4434337691870381, | |
| "grad_norm": 5.977930068969727, | |
| "kl": 0.070068359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.7950571477413177, | |
| "reward_std": 0.11713728122413158, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.815890446305275, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 129.34375190734863, | |
| "epoch": 0.4457077885162024, | |
| "grad_norm": 70.88825225830078, | |
| "kl": 0.06866455078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8642869293689728, | |
| "reward_std": 0.026344751473516226, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8642869293689728, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 128.20833778381348, | |
| "epoch": 0.4479818078453667, | |
| "grad_norm": 5.539989948272705, | |
| "kl": 0.0606689453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.789596974849701, | |
| "reward_std": 0.11505167232826352, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8104302436113358, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 120.04166984558105, | |
| "epoch": 0.450255827174531, | |
| "grad_norm": 16.326969146728516, | |
| "kl": 0.071533203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.8599906861782074, | |
| "reward_std": 0.05113175604492426, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8704073280096054, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 121.82291793823242, | |
| "epoch": 0.45252984650369527, | |
| "grad_norm": 9.708488464355469, | |
| "kl": 0.0889892578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.8156112134456635, | |
| "reward_std": 0.06598617471172474, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8260278552770615, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 123.58333396911621, | |
| "epoch": 0.4548038658328596, | |
| "grad_norm": 5.935286998748779, | |
| "kl": 0.078857421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.8384647369384766, | |
| "reward_std": 0.06912496162112802, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.848881334066391, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 119.64583778381348, | |
| "epoch": 0.45707788516202386, | |
| "grad_norm": 8.705190658569336, | |
| "kl": 0.06341552734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.848078191280365, | |
| "reward_std": 0.03278558413148858, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8480781614780426, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 122.05208396911621, | |
| "epoch": 0.4593519044911882, | |
| "grad_norm": 4.539862632751465, | |
| "kl": 0.06060791015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8337964713573456, | |
| "reward_std": 0.09542246071214322, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8546297699213028, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 119.18750381469727, | |
| "epoch": 0.46162592382035245, | |
| "grad_norm": 20.850038528442383, | |
| "kl": 0.060546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8311468660831451, | |
| "reward_std": 0.0869890945032239, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8415635526180267, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 118.44791793823242, | |
| "epoch": 0.46389994314951677, | |
| "grad_norm": 9.465960502624512, | |
| "kl": 0.05963134765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8948685824871063, | |
| "reward_std": 0.006126068299636245, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8948685228824615, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 124.72916793823242, | |
| "epoch": 0.4661739624786811, | |
| "grad_norm": 19.61353302001953, | |
| "kl": 0.05328369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8210335075855255, | |
| "reward_std": 0.03609966021031141, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8210335075855255, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 123.12500381469727, | |
| "epoch": 0.46844798180784536, | |
| "grad_norm": 6.793501377105713, | |
| "kl": 0.0574951171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.826656460762024, | |
| "reward_std": 0.05289078433997929, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8370731472969055, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 122.26041984558105, | |
| "epoch": 0.4707220011370097, | |
| "grad_norm": 5.112701892852783, | |
| "kl": 0.052734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.807763934135437, | |
| "reward_std": 0.06750181829556823, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8181806355714798, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 121.22916984558105, | |
| "epoch": 0.47299602046617395, | |
| "grad_norm": 6.380675315856934, | |
| "kl": 0.0511474609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8463412821292877, | |
| "reward_std": 0.03807840694207698, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8463412821292877, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 121.06250190734863, | |
| "epoch": 0.4752700397953383, | |
| "grad_norm": 24.312280654907227, | |
| "kl": 0.05584716796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8374760746955872, | |
| "reward_std": 0.05966222519055009, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8478926569223404, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 121.94792175292969, | |
| "epoch": 0.47754405912450254, | |
| "grad_norm": 7.5535478591918945, | |
| "kl": 0.05242919921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8417306244373322, | |
| "reward_std": 0.07789468741975725, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8521473109722137, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 120.58333587646484, | |
| "epoch": 0.47981807845366686, | |
| "grad_norm": 24.363685607910156, | |
| "kl": 0.05023193359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8678061068058014, | |
| "reward_std": 0.02573518455028534, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8678060472011566, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 125.44792175292969, | |
| "epoch": 0.48209209778283113, | |
| "grad_norm": 8.597858428955078, | |
| "kl": 0.0648193359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8275066912174225, | |
| "reward_std": 0.09166095149703324, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8483400493860245, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 128.0312557220459, | |
| "epoch": 0.48436611711199545, | |
| "grad_norm": 11.972718238830566, | |
| "kl": 0.057373046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8702231347560883, | |
| "reward_std": 0.019674736773595214, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8702231794595718, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 124.14583587646484, | |
| "epoch": 0.4866401364411598, | |
| "grad_norm": 8.804828643798828, | |
| "kl": 0.0577392578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8843638896942139, | |
| "reward_std": 0.012492099194787443, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8843639045953751, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 123.95833587646484, | |
| "epoch": 0.48891415577032404, | |
| "grad_norm": 5.491071701049805, | |
| "kl": 0.05499267578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8558919131755829, | |
| "reward_std": 0.0653155903564766, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8663085699081421, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 128.31250381469727, | |
| "epoch": 0.49118817509948837, | |
| "grad_norm": 6.622015476226807, | |
| "kl": 0.055419921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8662042915821075, | |
| "reward_std": 0.010809883824549615, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8662042170763016, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 122.77083778381348, | |
| "epoch": 0.49346219442865263, | |
| "grad_norm": 6.790284156799316, | |
| "kl": 0.05340576171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8543438911437988, | |
| "reward_std": 0.05247529596090317, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8647605180740356, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 123.45833587646484, | |
| "epoch": 0.49573621375781696, | |
| "grad_norm": 13.34268569946289, | |
| "kl": 0.05609130859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8597874641418457, | |
| "reward_std": 0.03391414089128375, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8597874045372009, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 124.67708778381348, | |
| "epoch": 0.4980102330869812, | |
| "grad_norm": 8.240407943725586, | |
| "kl": 0.058349609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8817353248596191, | |
| "reward_std": 0.012413767748512328, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.881735309958458, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 124.67708587646484, | |
| "epoch": 0.5002842524161455, | |
| "grad_norm": 17.483158111572266, | |
| "kl": 0.06695556640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8380079865455627, | |
| "reward_std": 0.07233457683469169, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.848424643278122, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 122.06250381469727, | |
| "epoch": 0.5025582717453099, | |
| "grad_norm": 7.1976494789123535, | |
| "kl": 0.0635986328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8233891129493713, | |
| "reward_std": 0.08511380999698304, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8442224413156509, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 124.62500190734863, | |
| "epoch": 0.5048322910744741, | |
| "grad_norm": 11.14084243774414, | |
| "kl": 0.05706787109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8576789200305939, | |
| "reward_std": 0.04037183988839388, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8576788157224655, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 127.32292366027832, | |
| "epoch": 0.5071063104036384, | |
| "grad_norm": 5.903656005859375, | |
| "kl": 0.0548095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8232994377613068, | |
| "reward_std": 0.04908563965000212, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8232994079589844, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 126.42708587646484, | |
| "epoch": 0.5093803297328028, | |
| "grad_norm": 15.478821754455566, | |
| "kl": 0.05560302734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8385266661643982, | |
| "reward_std": 0.030962634249590337, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.838526651263237, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 133.68750381469727, | |
| "epoch": 0.511654349061967, | |
| "grad_norm": 115.98380279541016, | |
| "kl": 0.05615234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.7851110100746155, | |
| "reward_std": 0.10919084469787776, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.8163610249757767, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 135.8541717529297, | |
| "epoch": 0.5139283683911313, | |
| "grad_norm": 11.387409210205078, | |
| "kl": 0.05108642578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8544709980487823, | |
| "reward_std": 0.05253174444078468, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8648876249790192, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 135.78125381469727, | |
| "epoch": 0.5162023877202956, | |
| "grad_norm": 5.589237689971924, | |
| "kl": 0.05682373046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8480293452739716, | |
| "reward_std": 0.04187517584068701, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8584460020065308, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 129.57291793823242, | |
| "epoch": 0.51847640704946, | |
| "grad_norm": 6.878526210784912, | |
| "kl": 0.06463623046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.797906219959259, | |
| "reward_std": 0.09693466546013951, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8187395334243774, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 133.1145896911621, | |
| "epoch": 0.5207504263786242, | |
| "grad_norm": 11.986654281616211, | |
| "kl": 0.05206298828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8702963292598724, | |
| "reward_std": 0.046940833679400384, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8807128965854645, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 137.68750762939453, | |
| "epoch": 0.5230244457077885, | |
| "grad_norm": 4.170595169067383, | |
| "kl": 0.05438232421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8651223182678223, | |
| "reward_std": 0.01505957031622529, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8651222884654999, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 134.43750381469727, | |
| "epoch": 0.5252984650369528, | |
| "grad_norm": 8.454867362976074, | |
| "kl": 0.060546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8617721498012543, | |
| "reward_std": 0.004831298429053277, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8617721498012543, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 130.88541793823242, | |
| "epoch": 0.5275724843661171, | |
| "grad_norm": 7.359889507293701, | |
| "kl": 0.05206298828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.875521332025528, | |
| "reward_std": 0.03221741976449266, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8755213916301727, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 137.62500381469727, | |
| "epoch": 0.5298465036952814, | |
| "grad_norm": 4.686023712158203, | |
| "kl": 0.0633544921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.836742788553238, | |
| "reward_std": 0.05397877559880726, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8471594154834747, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 133.71875762939453, | |
| "epoch": 0.5321205230244457, | |
| "grad_norm": 8.684386253356934, | |
| "kl": 0.05889892578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8528975546360016, | |
| "reward_std": 0.013704222801607102, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8528975248336792, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 138.2395896911621, | |
| "epoch": 0.5343945423536101, | |
| "grad_norm": 27.199432373046875, | |
| "kl": 0.060302734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.819983571767807, | |
| "reward_std": 0.07903883041581139, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.830400213599205, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 137.5729217529297, | |
| "epoch": 0.5366685616827743, | |
| "grad_norm": 7.134744644165039, | |
| "kl": 0.060791015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.848281979560852, | |
| "reward_std": 0.015959581825882196, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8482819348573685, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 132.91667366027832, | |
| "epoch": 0.5389425810119386, | |
| "grad_norm": 10.887066841125488, | |
| "kl": 0.060791015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8350262939929962, | |
| "reward_std": 0.06616777507588267, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8454429060220718, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 137.92708587646484, | |
| "epoch": 0.5412166003411029, | |
| "grad_norm": 30.873247146606445, | |
| "kl": 0.06396484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8556478321552277, | |
| "reward_std": 0.05000708991428837, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8660645484924316, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 142.15625762939453, | |
| "epoch": 0.5434906196702672, | |
| "grad_norm": 5.460704326629639, | |
| "kl": 0.0665283203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8754114508628845, | |
| "reward_std": 0.027424399624578655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8754114210605621, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 138.1875057220459, | |
| "epoch": 0.5457646389994315, | |
| "grad_norm": 4.744943618774414, | |
| "kl": 0.071044921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.874392807483673, | |
| "reward_std": 0.012488734326325357, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8743928372859955, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 140.36458587646484, | |
| "epoch": 0.5480386583285958, | |
| "grad_norm": 39.30780792236328, | |
| "kl": 0.066650390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8584297001361847, | |
| "reward_std": 0.05610931571573019, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8688463568687439, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 139.48958778381348, | |
| "epoch": 0.55031267765776, | |
| "grad_norm": 11.226171493530273, | |
| "kl": 0.0614013671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.860059529542923, | |
| "reward_std": 0.02563871028542053, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8600595146417618, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 141.8854217529297, | |
| "epoch": 0.5525866969869244, | |
| "grad_norm": 16.11907196044922, | |
| "kl": 0.06512451171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8452790975570679, | |
| "reward_std": 0.08041338669136167, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8661123663187027, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 136.5104217529297, | |
| "epoch": 0.5548607163160887, | |
| "grad_norm": 66.74971771240234, | |
| "kl": 0.0682373046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8137792348861694, | |
| "reward_std": 0.11896559037268162, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8346125185489655, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 141.0833396911621, | |
| "epoch": 0.557134735645253, | |
| "grad_norm": 8.579216957092285, | |
| "kl": 0.1153564453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0046, | |
| "reward": 1.8266068398952484, | |
| "reward_std": 0.12713953852653503, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.8578568696975708, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 153.37500762939453, | |
| "epoch": 0.5594087549744173, | |
| "grad_norm": 4.466084003448486, | |
| "kl": 0.0592041015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.782545268535614, | |
| "reward_std": 0.11807792168110609, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.813795268535614, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 149.6041717529297, | |
| "epoch": 0.5616827743035816, | |
| "grad_norm": 9.517090797424316, | |
| "kl": 0.0572509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7555188834667206, | |
| "reward_std": 0.17915836814790964, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.786768838763237, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 148.96875381469727, | |
| "epoch": 0.5639567936327459, | |
| "grad_norm": 10.732756614685059, | |
| "kl": 0.06365966796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8198718428611755, | |
| "reward_std": 0.11693388223648071, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8302884250879288, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 152.31250381469727, | |
| "epoch": 0.5662308129619101, | |
| "grad_norm": 15.704789161682129, | |
| "kl": 0.06011962890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7587114572525024, | |
| "reward_std": 0.13302453747019172, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7899614423513412, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 151.11458587646484, | |
| "epoch": 0.5685048322910745, | |
| "grad_norm": 10.12311840057373, | |
| "kl": 0.06243896484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7591370046138763, | |
| "reward_std": 0.19031737372279167, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/segmentation_reward": 0.7903869301080704, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 143.3020896911621, | |
| "epoch": 0.5707788516202388, | |
| "grad_norm": 9.594857215881348, | |
| "kl": 0.06365966796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8209541141986847, | |
| "reward_std": 0.11682178732007742, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8417873978614807, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 148.5, | |
| "epoch": 0.573052870949403, | |
| "grad_norm": 6.971501350402832, | |
| "kl": 0.05908203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7789467871189117, | |
| "reward_std": 0.16825164668262005, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.8101967573165894, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 152.1041717529297, | |
| "epoch": 0.5753268902785673, | |
| "grad_norm": 23.23783302307129, | |
| "kl": 0.059814453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7420052289962769, | |
| "reward_std": 0.18916566669940948, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.7836718857288361, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 145.42708587646484, | |
| "epoch": 0.5776009096077317, | |
| "grad_norm": 30.076963424682617, | |
| "kl": 0.05194091796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7838060855865479, | |
| "reward_std": 0.08882320672273636, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/segmentation_reward": 0.8150560706853867, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 152.81250762939453, | |
| "epoch": 0.579874928936896, | |
| "grad_norm": 8.252555847167969, | |
| "kl": 0.05322265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7935425341129303, | |
| "reward_std": 0.13829213567078114, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8143758326768875, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 151.12500381469727, | |
| "epoch": 0.5821489482660602, | |
| "grad_norm": 7.083897113800049, | |
| "kl": 0.05743408203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8427066802978516, | |
| "reward_std": 0.103471142007038, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8635399788618088, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 150.7291717529297, | |
| "epoch": 0.5844229675952246, | |
| "grad_norm": 20.999393463134766, | |
| "kl": 0.05621337890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8271246254444122, | |
| "reward_std": 0.07995186559855938, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8375412821769714, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 146.43750381469727, | |
| "epoch": 0.5866969869243889, | |
| "grad_norm": 4.985034465789795, | |
| "kl": 0.05804443359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.875591516494751, | |
| "reward_std": 0.030072015128098428, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.875591516494751, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 148.57291793823242, | |
| "epoch": 0.5889710062535531, | |
| "grad_norm": 27.399518966674805, | |
| "kl": 0.05438232421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.7720091938972473, | |
| "reward_std": 0.14485601719934493, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.8136758357286453, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 146.89583587646484, | |
| "epoch": 0.5912450255827174, | |
| "grad_norm": 8.949258804321289, | |
| "kl": 0.05517578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.7998264729976654, | |
| "reward_std": 0.11114408634603024, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8102431297302246, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 143.32292556762695, | |
| "epoch": 0.5935190449118818, | |
| "grad_norm": 13.319233894348145, | |
| "kl": 0.0638427734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8625755608081818, | |
| "reward_std": 0.024278577242512256, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8625756055116653, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 151.6354217529297, | |
| "epoch": 0.595793064241046, | |
| "grad_norm": 5.921450138092041, | |
| "kl": 0.0545654296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8347079157829285, | |
| "reward_std": 0.05886374693363905, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8451245874166489, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 150.2083396911621, | |
| "epoch": 0.5980670835702103, | |
| "grad_norm": 6.478372097015381, | |
| "kl": 0.06402587890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.853881686925888, | |
| "reward_std": 0.069986637448892, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8642983585596085, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 142.2291717529297, | |
| "epoch": 0.6003411028993746, | |
| "grad_norm": 44.52735900878906, | |
| "kl": 0.06976318359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.8701772689819336, | |
| "reward_std": 0.03132961760275066, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8701772391796112, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 145.3958396911621, | |
| "epoch": 0.602615122228539, | |
| "grad_norm": 9.911026000976562, | |
| "kl": 0.054443359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8112512826919556, | |
| "reward_std": 0.12766834167996421, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.8425012826919556, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 153.6354217529297, | |
| "epoch": 0.6048891415577032, | |
| "grad_norm": 5.161675453186035, | |
| "kl": 0.05157470703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8424187302589417, | |
| "reward_std": 0.021243932540528476, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8424187004566193, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 142.25000381469727, | |
| "epoch": 0.6071631608868675, | |
| "grad_norm": 10.04539966583252, | |
| "kl": 0.05657958984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8589707911014557, | |
| "reward_std": 0.07298957108287141, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8798040300607681, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 140.06250381469727, | |
| "epoch": 0.6094371802160319, | |
| "grad_norm": 5.002673625946045, | |
| "kl": 0.0604248046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8338092267513275, | |
| "reward_std": 0.07467565825209022, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8442258834838867, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 143.37500762939453, | |
| "epoch": 0.6117111995451961, | |
| "grad_norm": 16.33036231994629, | |
| "kl": 0.06146240234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.828411191701889, | |
| "reward_std": 0.05990099138580263, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8388277888298035, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 145.7083396911621, | |
| "epoch": 0.6139852188743604, | |
| "grad_norm": 10.558090209960938, | |
| "kl": 0.05633544921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8714422285556793, | |
| "reward_std": 0.04079840763006359, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8818589001893997, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 136.46875762939453, | |
| "epoch": 0.6162592382035247, | |
| "grad_norm": 6.936587333679199, | |
| "kl": 0.0615234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8483183681964874, | |
| "reward_std": 0.05946638010209426, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8587349951267242, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 136.8854217529297, | |
| "epoch": 0.6185332575326891, | |
| "grad_norm": 9.40880012512207, | |
| "kl": 0.0555419921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8671943843364716, | |
| "reward_std": 0.026569546665996313, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8671943843364716, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 133.84375, | |
| "epoch": 0.6208072768618533, | |
| "grad_norm": 8.41716480255127, | |
| "kl": 0.06396484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8463688492774963, | |
| "reward_std": 0.10014531551860273, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8672020584344864, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 137.4166717529297, | |
| "epoch": 0.6230812961910176, | |
| "grad_norm": 6.68651008605957, | |
| "kl": 0.05877685546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8591451048851013, | |
| "reward_std": 0.0608306503854692, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8695616871118546, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 139.6354217529297, | |
| "epoch": 0.625355315520182, | |
| "grad_norm": 5.125174522399902, | |
| "kl": 0.05517578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8548312783241272, | |
| "reward_std": 0.020504672429524362, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8548312485218048, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 140.5104217529297, | |
| "epoch": 0.6276293348493462, | |
| "grad_norm": 8.44184684753418, | |
| "kl": 0.05133056640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8494782745838165, | |
| "reward_std": 0.04421432921662927, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8494782447814941, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 141.0520896911621, | |
| "epoch": 0.6299033541785105, | |
| "grad_norm": 5.259810447692871, | |
| "kl": 0.055419921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8360374867916107, | |
| "reward_std": 0.060232745250687, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8464541882276535, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 135.2083396911621, | |
| "epoch": 0.6321773735076748, | |
| "grad_norm": 7.691401481628418, | |
| "kl": 0.061279296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8679547905921936, | |
| "reward_std": 0.026071164524182677, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8679548501968384, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 132.30208587646484, | |
| "epoch": 0.6344513928368392, | |
| "grad_norm": 33.14459228515625, | |
| "kl": 0.05517578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8575536906719208, | |
| "reward_std": 0.037932454550173134, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8679703027009964, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 135.06250762939453, | |
| "epoch": 0.6367254121660034, | |
| "grad_norm": 14.222509384155273, | |
| "kl": 0.0467529296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.8211901187896729, | |
| "reward_std": 0.022689874283969402, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8211900591850281, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 138.9166717529297, | |
| "epoch": 0.6389994314951677, | |
| "grad_norm": 45.212120056152344, | |
| "kl": 0.048828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8557825684547424, | |
| "reward_std": 0.061675679637119174, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8661992251873016, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 133.5520896911621, | |
| "epoch": 0.641273450824332, | |
| "grad_norm": 6.831585884094238, | |
| "kl": 0.058837890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.859096884727478, | |
| "reward_std": 0.02432074275566265, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8590968549251556, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 134.60416984558105, | |
| "epoch": 0.6435474701534963, | |
| "grad_norm": 17.257526397705078, | |
| "kl": 0.0516357421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8761568367481232, | |
| "reward_std": 0.020475412253290415, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.876156821846962, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 133.95833587646484, | |
| "epoch": 0.6458214894826606, | |
| "grad_norm": 13.777560234069824, | |
| "kl": 0.05450439453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8327456414699554, | |
| "reward_std": 0.03332398599013686, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8327456265687943, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 130.08333587646484, | |
| "epoch": 0.6480955088118249, | |
| "grad_norm": 8.314565658569336, | |
| "kl": 0.06304931640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8160936534404755, | |
| "reward_std": 0.05730089984717779, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8265102803707123, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 138.0416717529297, | |
| "epoch": 0.6503695281409893, | |
| "grad_norm": 8.033271789550781, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8485551476478577, | |
| "reward_std": 0.0665385426254943, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8589717596769333, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 131.9479217529297, | |
| "epoch": 0.6526435474701535, | |
| "grad_norm": 4.302734851837158, | |
| "kl": 0.0552978515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.814813256263733, | |
| "reward_std": 0.07665527774952352, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8356466442346573, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 127.94791984558105, | |
| "epoch": 0.6549175667993178, | |
| "grad_norm": 6.902019023895264, | |
| "kl": 0.05352783203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8666083216667175, | |
| "reward_std": 0.05546297336695716, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8770249783992767, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 128.13541793823242, | |
| "epoch": 0.657191586128482, | |
| "grad_norm": 5.1357293128967285, | |
| "kl": 0.0509033203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8219444751739502, | |
| "reward_std": 0.10313208564184606, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.842777818441391, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 127.03125381469727, | |
| "epoch": 0.6594656054576464, | |
| "grad_norm": 10.558165550231934, | |
| "kl": 0.05340576171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8529814183712006, | |
| "reward_std": 0.04027191852219403, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8529813587665558, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 123.98958778381348, | |
| "epoch": 0.6617396247868107, | |
| "grad_norm": 17.23190689086914, | |
| "kl": 0.0595703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.808007925748825, | |
| "reward_std": 0.11614370718598366, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8288412094116211, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 124.13542175292969, | |
| "epoch": 0.664013644115975, | |
| "grad_norm": 5.512027263641357, | |
| "kl": 0.04791259765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.84766486287117, | |
| "reward_std": 0.08955034404061735, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8684981167316437, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 120.54166984558105, | |
| "epoch": 0.6662876634451392, | |
| "grad_norm": 8.465278625488281, | |
| "kl": 0.05010986328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8660954236984253, | |
| "reward_std": 0.049259885265200865, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8765120506286621, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 126.70833587646484, | |
| "epoch": 0.6685616827743036, | |
| "grad_norm": 11.483687400817871, | |
| "kl": 0.05279541015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8239311873912811, | |
| "reward_std": 0.1335212409030646, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.8551811873912811, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 126.12500190734863, | |
| "epoch": 0.6708357021034679, | |
| "grad_norm": 99337359065088.0, | |
| "kl": 674309865472.063, | |
| "learning_rate": 1e-06, | |
| "loss": 27030312960.0, | |
| "reward": 1.8664152026176453, | |
| "reward_std": 0.04355062101967633, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8768318891525269, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 125.26041984558105, | |
| "epoch": 0.6731097214326321, | |
| "grad_norm": 9.260116577148438, | |
| "kl": 0.053955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8506720662117004, | |
| "reward_std": 0.07204537454526871, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8610887378454208, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 131.7604217529297, | |
| "epoch": 0.6753837407617965, | |
| "grad_norm": 11.006333351135254, | |
| "kl": 0.05487060546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8154217898845673, | |
| "reward_std": 0.029030885722022504, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8154216706752777, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 129.21875, | |
| "epoch": 0.6776577600909608, | |
| "grad_norm": 22.507421493530273, | |
| "kl": 0.057373046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.833427518606186, | |
| "reward_std": 0.041746608447283506, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8334274739027023, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 126.59375381469727, | |
| "epoch": 0.6799317794201251, | |
| "grad_norm": 8.090750694274902, | |
| "kl": 0.0552978515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8329502940177917, | |
| "reward_std": 0.03257988323457539, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8329502940177917, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 127.93750381469727, | |
| "epoch": 0.6822057987492893, | |
| "grad_norm": 23.35614585876465, | |
| "kl": 0.05328369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8777255713939667, | |
| "reward_std": 0.0277888648561202, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8777255564928055, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 124.33333587646484, | |
| "epoch": 0.6844798180784537, | |
| "grad_norm": 7.305347442626953, | |
| "kl": 0.062744140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.887622207403183, | |
| "reward_std": 0.008017042011488229, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.887622207403183, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 124.26041984558105, | |
| "epoch": 0.686753837407618, | |
| "grad_norm": 32.80826950073242, | |
| "kl": 0.05267333984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8523709774017334, | |
| "reward_std": 0.01647485780995339, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8523710370063782, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 122.36458396911621, | |
| "epoch": 0.6890278567367822, | |
| "grad_norm": 9.616044044494629, | |
| "kl": 0.0548095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8915051221847534, | |
| "reward_std": 0.020660731475800276, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.891505092382431, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 125.66666793823242, | |
| "epoch": 0.6913018760659465, | |
| "grad_norm": 4.894202709197998, | |
| "kl": 0.0616455078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8347567915916443, | |
| "reward_std": 0.023919553961604834, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8347567617893219, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 124.51042175292969, | |
| "epoch": 0.6935758953951109, | |
| "grad_norm": 50.18326950073242, | |
| "kl": 0.09136962890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.8522542119026184, | |
| "reward_std": 0.08253866015002131, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.873087465763092, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 125.41666984558105, | |
| "epoch": 0.6958499147242752, | |
| "grad_norm": 5.762409687042236, | |
| "kl": 0.05657958984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8508845269680023, | |
| "reward_std": 0.06522158032748848, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8613012135028839, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 131.19791984558105, | |
| "epoch": 0.6981239340534394, | |
| "grad_norm": 6.682615756988525, | |
| "kl": 0.0689697265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8200619518756866, | |
| "reward_std": 0.07617322774603963, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8304786384105682, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 132.42708587646484, | |
| "epoch": 0.7003979533826038, | |
| "grad_norm": 19.110347747802734, | |
| "kl": 0.0611572265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.883405864238739, | |
| "reward_std": 0.03483639005571604, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.883405864238739, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 130.9791717529297, | |
| "epoch": 0.7026719727117681, | |
| "grad_norm": 14.51934814453125, | |
| "kl": 0.05859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.832915335893631, | |
| "reward_std": 0.06287066219374537, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8433319926261902, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 126.01042175292969, | |
| "epoch": 0.7049459920409323, | |
| "grad_norm": 9.068413734436035, | |
| "kl": 0.05810546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8837738931179047, | |
| "reward_std": 0.02703522122465074, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8837738037109375, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 134.0729217529297, | |
| "epoch": 0.7072200113700966, | |
| "grad_norm": 9.09504222869873, | |
| "kl": 0.0723876953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.8339940905570984, | |
| "reward_std": 0.08121342983213253, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8548273891210556, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 136.13541984558105, | |
| "epoch": 0.709494030699261, | |
| "grad_norm": 4.718442440032959, | |
| "kl": 0.05426025390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8730711042881012, | |
| "reward_std": 0.03671956621110439, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.87307108938694, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 126.04166984558105, | |
| "epoch": 0.7117680500284252, | |
| "grad_norm": 12.670175552368164, | |
| "kl": 0.05718994140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8251341879367828, | |
| "reward_std": 0.04767660913057625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8251341283321381, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 131.08333587646484, | |
| "epoch": 0.7140420693575895, | |
| "grad_norm": 7.425668716430664, | |
| "kl": 0.0506591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8413426876068115, | |
| "reward_std": 0.0643298716749996, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8517593443393707, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 126.39583778381348, | |
| "epoch": 0.7163160886867538, | |
| "grad_norm": 7.450831413269043, | |
| "kl": 0.05279541015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8399729132652283, | |
| "reward_std": 0.03783059283159673, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8399728983640671, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 131.39583778381348, | |
| "epoch": 0.7185901080159182, | |
| "grad_norm": 4.984809398651123, | |
| "kl": 0.0579833984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8401748836040497, | |
| "reward_std": 0.06397436745464802, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8505915254354477, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 126.29166984558105, | |
| "epoch": 0.7208641273450824, | |
| "grad_norm": 8.88703727722168, | |
| "kl": 0.055908203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8670341670513153, | |
| "reward_std": 0.023501697811298072, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8670340925455093, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 132.0104217529297, | |
| "epoch": 0.7231381466742467, | |
| "grad_norm": 6.049837589263916, | |
| "kl": 0.05718994140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8481946885585785, | |
| "reward_std": 0.04346911353059113, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8481947034597397, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 132.4270896911621, | |
| "epoch": 0.7254121660034111, | |
| "grad_norm": 46.442176818847656, | |
| "kl": 0.05584716796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.858694463968277, | |
| "reward_std": 0.023713725386187434, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8586944341659546, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 131.89584159851074, | |
| "epoch": 0.7276861853325753, | |
| "grad_norm": 28.615896224975586, | |
| "kl": 0.0504150390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8294812142848969, | |
| "reward_std": 0.058421910274773836, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8398977816104889, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 131.5104217529297, | |
| "epoch": 0.7299602046617396, | |
| "grad_norm": 4.829744338989258, | |
| "kl": 0.0491943359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8195575773715973, | |
| "reward_std": 0.05906218430027366, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8299742341041565, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 135.25000381469727, | |
| "epoch": 0.7322342239909039, | |
| "grad_norm": 8.953800201416016, | |
| "kl": 0.0484619140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.8626637756824493, | |
| "reward_std": 0.028036643168888986, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8626636862754822, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 134.64583587646484, | |
| "epoch": 0.7345082433200683, | |
| "grad_norm": 5.906992435455322, | |
| "kl": 0.06317138671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8335199654102325, | |
| "reward_std": 0.03467556097893976, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8439366221427917, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 129.5520896911621, | |
| "epoch": 0.7367822626492325, | |
| "grad_norm": 5.112988471984863, | |
| "kl": 0.0526123046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.836005687713623, | |
| "reward_std": 0.07491261116228998, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8464223295450211, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 131.0833396911621, | |
| "epoch": 0.7390562819783968, | |
| "grad_norm": 5.518293380737305, | |
| "kl": 0.0584716796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8426667749881744, | |
| "reward_std": 0.04552039853297174, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.842666745185852, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 135.5, | |
| "epoch": 0.7413303013075612, | |
| "grad_norm": 4.705676555633545, | |
| "kl": 0.053466796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8192458748817444, | |
| "reward_std": 0.03886064630933106, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8192458003759384, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 134.08333778381348, | |
| "epoch": 0.7436043206367254, | |
| "grad_norm": 33.00435256958008, | |
| "kl": 0.05438232421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8162838518619537, | |
| "reward_std": 0.1155676506459713, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8371172249317169, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 134.50000381469727, | |
| "epoch": 0.7458783399658897, | |
| "grad_norm": 8.883432388305664, | |
| "kl": 0.0660400390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8558038473129272, | |
| "reward_std": 0.021242189570330083, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8558038026094437, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 130.9166717529297, | |
| "epoch": 0.748152359295054, | |
| "grad_norm": 7.608541965484619, | |
| "kl": 0.0537109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.8631189167499542, | |
| "reward_std": 0.0687082838267088, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8839522004127502, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 134.0416717529297, | |
| "epoch": 0.7504263786242183, | |
| "grad_norm": 12.23119068145752, | |
| "kl": 0.05078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.7768693566322327, | |
| "reward_std": 0.249573610490188, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "rewards/segmentation_reward": 0.8393692970275879, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 130.4791717529297, | |
| "epoch": 0.7527003979533826, | |
| "grad_norm": 5.401615142822266, | |
| "kl": 0.05450439453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8891821205615997, | |
| "reward_std": 0.011260898812906817, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8891821354627609, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 131.0416717529297, | |
| "epoch": 0.7549744172825469, | |
| "grad_norm": 13.699596405029297, | |
| "kl": 0.053955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8251034915447235, | |
| "reward_std": 0.09959045611321926, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8459368050098419, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 127.47916984558105, | |
| "epoch": 0.7572484366117112, | |
| "grad_norm": 3.924161672592163, | |
| "kl": 0.05859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8615702688694, | |
| "reward_std": 0.023111989721655846, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8615703135728836, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 130.03125190734863, | |
| "epoch": 0.7595224559408755, | |
| "grad_norm": 4.831616401672363, | |
| "kl": 0.05377197265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8001680374145508, | |
| "reward_std": 0.11854788940399885, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.8314179629087448, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 134.46875762939453, | |
| "epoch": 0.7617964752700398, | |
| "grad_norm": 7.652170181274414, | |
| "kl": 0.0491943359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.8629566133022308, | |
| "reward_std": 0.019374964205780998, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8629565536975861, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 125.67708587646484, | |
| "epoch": 0.7640704945992041, | |
| "grad_norm": 9.895513534545898, | |
| "kl": 0.05560302734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.8746117651462555, | |
| "reward_std": 0.01222996957221767, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8746117502450943, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 124.05208587646484, | |
| "epoch": 0.7663445139283684, | |
| "grad_norm": 5.920944690704346, | |
| "kl": 0.05902099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8395130336284637, | |
| "reward_std": 0.05145470690331422, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.849929690361023, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 132.37500381469727, | |
| "epoch": 0.7686185332575327, | |
| "grad_norm": 5.402987003326416, | |
| "kl": 0.05596923828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8278735280036926, | |
| "reward_std": 0.05184625834226608, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8278734982013702, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 127.58333778381348, | |
| "epoch": 0.770892552586697, | |
| "grad_norm": 5.004913806915283, | |
| "kl": 0.06292724609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8682883381843567, | |
| "reward_std": 0.04544829938095063, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8787050098180771, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 125.28125190734863, | |
| "epoch": 0.7731665719158612, | |
| "grad_norm": 10.156172752380371, | |
| "kl": 0.062744140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8668439388275146, | |
| "reward_std": 0.041374096646904945, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8668439537286758, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 131.6562557220459, | |
| "epoch": 0.7754405912450256, | |
| "grad_norm": 7.755644798278809, | |
| "kl": 0.05865478515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8412960767745972, | |
| "reward_std": 0.07608503196388483, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8517126888036728, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 131.5729217529297, | |
| "epoch": 0.7777146105741899, | |
| "grad_norm": 4.717071533203125, | |
| "kl": 0.065185546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8332788348197937, | |
| "reward_std": 0.11580408085137606, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8541121035814285, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 128.59375381469727, | |
| "epoch": 0.7799886299033542, | |
| "grad_norm": 5.577821731567383, | |
| "kl": 0.06103515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8615179657936096, | |
| "reward_std": 0.032256070990115404, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8615180253982544, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 133.35417556762695, | |
| "epoch": 0.7822626492325184, | |
| "grad_norm": 17.56463050842285, | |
| "kl": 0.0675048828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.826512634754181, | |
| "reward_std": 0.04132456611841917, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8265126645565033, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 120.48958396911621, | |
| "epoch": 0.7845366685616828, | |
| "grad_norm": 10.41229248046875, | |
| "kl": 0.06341552734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.867453545331955, | |
| "reward_std": 0.03282071987632662, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8674535155296326, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 124.66666793823242, | |
| "epoch": 0.7868106878908471, | |
| "grad_norm": 11.105557441711426, | |
| "kl": 0.064208984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8204675316810608, | |
| "reward_std": 0.08718774002045393, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8308842182159424, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 135.03125762939453, | |
| "epoch": 0.7890847072200113, | |
| "grad_norm": 7.016823768615723, | |
| "kl": 0.064208984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8432948589324951, | |
| "reward_std": 0.04058923898264766, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8432948887348175, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 132.23958587646484, | |
| "epoch": 0.7913587265491757, | |
| "grad_norm": 10.676706314086914, | |
| "kl": 0.05792236328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8691777288913727, | |
| "reward_std": 0.027745802886784077, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8691777735948563, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 134.97917556762695, | |
| "epoch": 0.79363274587834, | |
| "grad_norm": 6.6281418800354, | |
| "kl": 0.0599365234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.864767611026764, | |
| "reward_std": 0.014376505510881543, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8647675514221191, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 133.29166793823242, | |
| "epoch": 0.7959067652075043, | |
| "grad_norm": 5.383996963500977, | |
| "kl": 0.06011962890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8472253382205963, | |
| "reward_std": 0.045232664968352765, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8576419502496719, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 127.71875190734863, | |
| "epoch": 0.7981807845366685, | |
| "grad_norm": 7.840123176574707, | |
| "kl": 0.071533203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.8401671946048737, | |
| "reward_std": 0.060452125035226345, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8505838364362717, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 130.77083587646484, | |
| "epoch": 0.8004548038658329, | |
| "grad_norm": 8.090129852294922, | |
| "kl": 0.0594482421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8789224326610565, | |
| "reward_std": 0.0189595150295645, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8789223730564117, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 131.5312557220459, | |
| "epoch": 0.8027288231949972, | |
| "grad_norm": 5.633690357208252, | |
| "kl": 0.06817626953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.818681389093399, | |
| "reward_std": 0.13099218998104334, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.8499313741922379, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 129.14583587646484, | |
| "epoch": 0.8050028425241614, | |
| "grad_norm": 4.412927627563477, | |
| "kl": 0.06451416015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8463831841945648, | |
| "reward_std": 0.03683751542121172, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8463831394910812, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 126.17708587646484, | |
| "epoch": 0.8072768618533257, | |
| "grad_norm": 11.527948379516602, | |
| "kl": 0.06689453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8371998071670532, | |
| "reward_std": 0.021064158499939367, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8371998518705368, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 128.8854217529297, | |
| "epoch": 0.8095508811824901, | |
| "grad_norm": 48.39152908325195, | |
| "kl": 0.05670166015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8901303112506866, | |
| "reward_std": 0.009621757606510073, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8901302367448807, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 127.15625381469727, | |
| "epoch": 0.8118249005116543, | |
| "grad_norm": 58.79667663574219, | |
| "kl": 0.06072998046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8344822525978088, | |
| "reward_std": 0.07819899823516607, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8553156107664108, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 124.82292366027832, | |
| "epoch": 0.8140989198408186, | |
| "grad_norm": 4.849188327789307, | |
| "kl": 0.065185546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8805176317691803, | |
| "reward_std": 0.03933787811547518, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8805176019668579, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 127.5312557220459, | |
| "epoch": 0.816372939169983, | |
| "grad_norm": 9.841382026672363, | |
| "kl": 0.06512451171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8762429058551788, | |
| "reward_std": 0.012105958012398332, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8762429058551788, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 124.17708587646484, | |
| "epoch": 0.8186469584991473, | |
| "grad_norm": 23.35593032836914, | |
| "kl": 0.06060791015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.815219759941101, | |
| "reward_std": 0.10280088149011135, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8256364315748215, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 133.375, | |
| "epoch": 0.8209209778283115, | |
| "grad_norm": 8.626202583312988, | |
| "kl": 0.05999755859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8821953237056732, | |
| "reward_std": 0.03062002846854739, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8821953237056732, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 123.88541984558105, | |
| "epoch": 0.8231949971574758, | |
| "grad_norm": 8.920926094055176, | |
| "kl": 0.06756591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8656490445137024, | |
| "reward_std": 0.016483795596286654, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.865649089217186, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 120.92708587646484, | |
| "epoch": 0.8254690164866402, | |
| "grad_norm": 7.630340576171875, | |
| "kl": 0.0657958984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8661229014396667, | |
| "reward_std": 0.05962109373649582, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8765395879745483, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 126.13542175292969, | |
| "epoch": 0.8277430358158044, | |
| "grad_norm": 10.0298490524292, | |
| "kl": 0.1300048828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0052, | |
| "reward": 1.853810042142868, | |
| "reward_std": 0.027746433101128787, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8538100123405457, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 129.50000381469727, | |
| "epoch": 0.8300170551449687, | |
| "grad_norm": 23.108720779418945, | |
| "kl": 0.05908203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8352333307266235, | |
| "reward_std": 0.060839211102575064, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8456498980522156, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 126.13541984558105, | |
| "epoch": 0.832291074474133, | |
| "grad_norm": 6.668464183807373, | |
| "kl": 0.0648193359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8593635261058807, | |
| "reward_std": 0.015451492741703987, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8593635261058807, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 123.51041984558105, | |
| "epoch": 0.8345650938032974, | |
| "grad_norm": 7.906125545501709, | |
| "kl": 0.0628662109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8826908469200134, | |
| "reward_std": 0.012024826108245179, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8826908022165298, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 127.81250381469727, | |
| "epoch": 0.8368391131324616, | |
| "grad_norm": 5.663455486297607, | |
| "kl": 0.07000732421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.816881150007248, | |
| "reward_std": 0.06968936347402632, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8272978365421295, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 129.1979217529297, | |
| "epoch": 0.8391131324616259, | |
| "grad_norm": 4.211099147796631, | |
| "kl": 0.05908203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8330486714839935, | |
| "reward_std": 0.11126681696623564, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8538819551467896, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 127.00000190734863, | |
| "epoch": 0.8413871517907903, | |
| "grad_norm": 9.754108428955078, | |
| "kl": 0.06201171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.888169288635254, | |
| "reward_std": 0.009245644323527813, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8881692886352539, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 132.10417366027832, | |
| "epoch": 0.8436611711199545, | |
| "grad_norm": 10.026612281799316, | |
| "kl": 0.06280517578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.83915776014328, | |
| "reward_std": 0.09443543804809451, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8599910587072372, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 122.82291793823242, | |
| "epoch": 0.8459351904491188, | |
| "grad_norm": 6.644383907318115, | |
| "kl": 0.0699462890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.808215469121933, | |
| "reward_std": 0.10915855201892555, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8290487676858902, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 133.25000190734863, | |
| "epoch": 0.8482092097782831, | |
| "grad_norm": 7.538217544555664, | |
| "kl": 0.06182861328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8048012256622314, | |
| "reward_std": 0.11374149052426219, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8256345838308334, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 139.2083396911621, | |
| "epoch": 0.8504832291074474, | |
| "grad_norm": 6.131237030029297, | |
| "kl": 0.06085205078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.834800899028778, | |
| "reward_std": 0.04532355163246393, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8348008543252945, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 122.94791984558105, | |
| "epoch": 0.8527572484366117, | |
| "grad_norm": 6.690881252288818, | |
| "kl": 0.05902099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.860317587852478, | |
| "reward_std": 0.05016712564975023, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8707341998815536, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 131.50000381469727, | |
| "epoch": 0.855031267765776, | |
| "grad_norm": 7.440776348114014, | |
| "kl": 0.0614013671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8719264268875122, | |
| "reward_std": 0.0520896875532344, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8823430240154266, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 131.84375381469727, | |
| "epoch": 0.8573052870949404, | |
| "grad_norm": 8.669480323791504, | |
| "kl": 0.057861328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8542674481868744, | |
| "reward_std": 0.030696504516527057, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8542674630880356, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 134.1979217529297, | |
| "epoch": 0.8595793064241046, | |
| "grad_norm": 8.286240577697754, | |
| "kl": 0.065673828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8401267230510712, | |
| "reward_std": 0.07586855837143958, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8505433797836304, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 139.29166793823242, | |
| "epoch": 0.8618533257532689, | |
| "grad_norm": 8.042603492736816, | |
| "kl": 0.0582275390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.8034493625164032, | |
| "reward_std": 0.12077387608587742, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8242826908826828, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 131.09375381469727, | |
| "epoch": 0.8641273450824332, | |
| "grad_norm": 6.3277788162231445, | |
| "kl": 0.07427978515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.841260701417923, | |
| "reward_std": 0.053736023139208555, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8412606567144394, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 132.83333587646484, | |
| "epoch": 0.8664013644115975, | |
| "grad_norm": 6.668425559997559, | |
| "kl": 0.06573486328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.852523535490036, | |
| "reward_std": 0.06596486712805927, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.862940177321434, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 129.06250381469727, | |
| "epoch": 0.8686753837407618, | |
| "grad_norm": 5.517845153808594, | |
| "kl": 0.060546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8688926994800568, | |
| "reward_std": 0.04422900633653626, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8793093860149384, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 133.6770839691162, | |
| "epoch": 0.8709494030699261, | |
| "grad_norm": 6.203193664550781, | |
| "kl": 0.0697021484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.8271671533584595, | |
| "reward_std": 0.06463960534892976, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8375838100910187, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 134.02083587646484, | |
| "epoch": 0.8732234223990903, | |
| "grad_norm": 10.909748077392578, | |
| "kl": 0.07537841796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.8480697870254517, | |
| "reward_std": 0.034777372784446925, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8480697721242905, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 129.8541717529297, | |
| "epoch": 0.8754974417282547, | |
| "grad_norm": 9.155665397644043, | |
| "kl": 0.061279296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8443851470947266, | |
| "reward_std": 0.027714062947779894, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8443851172924042, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 132.1458396911621, | |
| "epoch": 0.877771461057419, | |
| "grad_norm": 4.895893573760986, | |
| "kl": 0.06463623046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8438906073570251, | |
| "reward_std": 0.04654449052759446, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8543073236942291, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 133.4166717529297, | |
| "epoch": 0.8800454803865833, | |
| "grad_norm": 10.000784873962402, | |
| "kl": 0.062255859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8418076932430267, | |
| "reward_std": 0.0637913720565848, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8522243201732635, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 132.00000762939453, | |
| "epoch": 0.8823194997157476, | |
| "grad_norm": 7.047492027282715, | |
| "kl": 0.06658935546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8548587560653687, | |
| "reward_std": 0.05772953329142183, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8652754127979279, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 129.31250381469727, | |
| "epoch": 0.8845935190449119, | |
| "grad_norm": 6.7407989501953125, | |
| "kl": 0.066650390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8317703306674957, | |
| "reward_std": 0.09461293439380825, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8526036590337753, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 131.3645896911621, | |
| "epoch": 0.8868675383740762, | |
| "grad_norm": 6.323419570922852, | |
| "kl": 0.072998046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.857729732990265, | |
| "reward_std": 0.026556792086921632, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8577296435832977, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 132.5208396911621, | |
| "epoch": 0.8891415577032404, | |
| "grad_norm": 13.720745086669922, | |
| "kl": 0.069580078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.7931525707244873, | |
| "reward_std": 0.14119149651378393, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.8244025856256485, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 127.55208587646484, | |
| "epoch": 0.8914155770324048, | |
| "grad_norm": 11.731847763061523, | |
| "kl": 0.0699462890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.8540717661380768, | |
| "reward_std": 0.025957859819754958, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8540717363357544, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 134.59375381469727, | |
| "epoch": 0.8936895963615691, | |
| "grad_norm": 8.114005088806152, | |
| "kl": 0.06787109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8252622783184052, | |
| "reward_std": 0.06493170734029263, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8356789350509644, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 128.76041984558105, | |
| "epoch": 0.8959636156907334, | |
| "grad_norm": 4.420175552368164, | |
| "kl": 0.07574462890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.858299344778061, | |
| "reward_std": 0.05047441285569221, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8582993745803833, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 127.26041793823242, | |
| "epoch": 0.8982376350198976, | |
| "grad_norm": 9.239371299743652, | |
| "kl": 0.0760498046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.8622342646121979, | |
| "reward_std": 0.02814770070835948, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8622342348098755, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 132.2395839691162, | |
| "epoch": 0.900511654349062, | |
| "grad_norm": 7.392596244812012, | |
| "kl": 0.0726318359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.855131834745407, | |
| "reward_std": 0.051054751384072006, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8655484467744827, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 128.13541984558105, | |
| "epoch": 0.9027856736782263, | |
| "grad_norm": 8.125931739807129, | |
| "kl": 0.07080078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.8950492441654205, | |
| "reward_std": 0.01147704414324835, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.895049199461937, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 130.39583587646484, | |
| "epoch": 0.9050596930073905, | |
| "grad_norm": 5.162111759185791, | |
| "kl": 0.0958251953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "reward": 1.8326389491558075, | |
| "reward_std": 0.12029724020976573, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8534722775220871, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 137.5520896911621, | |
| "epoch": 0.9073337123365549, | |
| "grad_norm": 5.906332015991211, | |
| "kl": 0.0780029296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.83551886677742, | |
| "reward_std": 0.04398070462048054, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8355187922716141, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 136.5729217529297, | |
| "epoch": 0.9096077316657192, | |
| "grad_norm": 6.234349250793457, | |
| "kl": 0.073486328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.8535465598106384, | |
| "reward_std": 0.029846468474715948, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8535465747117996, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 138.0208396911621, | |
| "epoch": 0.9118817509948834, | |
| "grad_norm": 9.130826950073242, | |
| "kl": 0.07421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.8739716410636902, | |
| "reward_std": 0.02106904413085431, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.873971700668335, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 134.43750381469727, | |
| "epoch": 0.9141557703240477, | |
| "grad_norm": 4.951981544494629, | |
| "kl": 0.0703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.871123731136322, | |
| "reward_std": 0.017409008694812655, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8711237162351608, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 131.08333778381348, | |
| "epoch": 0.9164297896532121, | |
| "grad_norm": 5.46921968460083, | |
| "kl": 0.07666015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.8896593153476715, | |
| "reward_std": 0.013306577457115054, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8896592557430267, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 130.71875381469727, | |
| "epoch": 0.9187038089823764, | |
| "grad_norm": 3.1999082565307617, | |
| "kl": 0.078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.8919513523578644, | |
| "reward_std": 0.020059253147337586, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8919513821601868, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 140.6041717529297, | |
| "epoch": 0.9209778283115406, | |
| "grad_norm": 9.87605094909668, | |
| "kl": 0.07421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.8576882183551788, | |
| "reward_std": 0.05526958662085235, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8681048899888992, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 141.71875762939453, | |
| "epoch": 0.9232518476407049, | |
| "grad_norm": 6.338070392608643, | |
| "kl": 0.0753173828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.8482947051525116, | |
| "reward_std": 0.03551662730751559, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8482946902513504, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 143.32291793823242, | |
| "epoch": 0.9255258669698693, | |
| "grad_norm": 10.858189582824707, | |
| "kl": 0.06878662109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.8244743645191193, | |
| "reward_std": 0.09086814895272255, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8453076481819153, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 135.8958396911621, | |
| "epoch": 0.9277998862990335, | |
| "grad_norm": 5.908721923828125, | |
| "kl": 0.072509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.7982184290885925, | |
| "reward_std": 0.1342280562967062, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8190517276525497, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 136.5833396911621, | |
| "epoch": 0.9300739056281978, | |
| "grad_norm": 11.515835762023926, | |
| "kl": 0.073486328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.7793036699295044, | |
| "reward_std": 0.05075064115226269, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7793037295341492, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 138.71875381469727, | |
| "epoch": 0.9323479249573622, | |
| "grad_norm": 5.331714153289795, | |
| "kl": 0.0684814453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.843429058790207, | |
| "reward_std": 0.02589858788996935, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8434290289878845, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 138.46875381469727, | |
| "epoch": 0.9346219442865265, | |
| "grad_norm": 30.081806182861328, | |
| "kl": 0.06719970703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8344328999519348, | |
| "reward_std": 0.08471645403187722, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8552662283182144, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 139.90625381469727, | |
| "epoch": 0.9368959636156907, | |
| "grad_norm": 4.920882701873779, | |
| "kl": 0.072265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.8378893733024597, | |
| "reward_std": 0.0527505818172358, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8483060598373413, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 135.1979217529297, | |
| "epoch": 0.939169982944855, | |
| "grad_norm": 4.576317310333252, | |
| "kl": 0.07666015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.8669978380203247, | |
| "reward_std": 0.016623229486867785, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8669977784156799, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 135.60416793823242, | |
| "epoch": 0.9414440022740194, | |
| "grad_norm": 14.46871280670166, | |
| "kl": 0.07025146484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.7731802463531494, | |
| "reward_std": 0.13738415925763547, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.804430216550827, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 141.68750381469727, | |
| "epoch": 0.9437180216031836, | |
| "grad_norm": 7.103405475616455, | |
| "kl": 0.0738525390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.8259045779705048, | |
| "reward_std": 0.059208789840340614, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8467378467321396, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 138.7083396911621, | |
| "epoch": 0.9459920409323479, | |
| "grad_norm": 5.577739238739014, | |
| "kl": 0.07861328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.8508521616458893, | |
| "reward_std": 0.04068222228670493, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8612687885761261, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 134.65625762939453, | |
| "epoch": 0.9482660602615123, | |
| "grad_norm": 4.289518356323242, | |
| "kl": 0.074462890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.8845854997634888, | |
| "reward_std": 0.01646020170301199, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8845854997634888, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 139.65625381469727, | |
| "epoch": 0.9505400795906765, | |
| "grad_norm": 8.5299072265625, | |
| "kl": 0.0740966796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.8217022120952606, | |
| "reward_std": 0.05600218917243183, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8321189731359482, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 138.50000762939453, | |
| "epoch": 0.9528140989198408, | |
| "grad_norm": 6.827524662017822, | |
| "kl": 0.0794677734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.822449415922165, | |
| "reward_std": 0.04770416300743818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8224494010210037, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 142.55208587646484, | |
| "epoch": 0.9550881182490051, | |
| "grad_norm": 6.027355670928955, | |
| "kl": 0.0709228515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.8189789950847626, | |
| "reward_std": 0.09298859292175621, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8398122638463974, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 141.3020896911621, | |
| "epoch": 0.9573621375781695, | |
| "grad_norm": 5.657737731933594, | |
| "kl": 0.0723876953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.8466951251029968, | |
| "reward_std": 0.04600943787954748, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8466951251029968, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 140.84375762939453, | |
| "epoch": 0.9596361569073337, | |
| "grad_norm": 6.143070697784424, | |
| "kl": 0.07037353515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.7793289721012115, | |
| "reward_std": 0.14454051246866584, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.8209956139326096, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 143.00000381469727, | |
| "epoch": 0.961910176236498, | |
| "grad_norm": 6.810637474060059, | |
| "kl": 0.0731201171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.861203372478485, | |
| "reward_std": 0.02801788877695799, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8612033277750015, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 143.27083587646484, | |
| "epoch": 0.9641841955656623, | |
| "grad_norm": 4.375816822052002, | |
| "kl": 0.06439208984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8603475391864777, | |
| "reward_std": 0.01995037216693163, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8603476583957672, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 139.7708396911621, | |
| "epoch": 0.9664582148948266, | |
| "grad_norm": 10.486807823181152, | |
| "kl": 0.0780029296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.8208959996700287, | |
| "reward_std": 0.10637146979570389, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8417292982339859, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 137.46875381469727, | |
| "epoch": 0.9687322342239909, | |
| "grad_norm": 3.8334126472473145, | |
| "kl": 0.0692138671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.8344232141971588, | |
| "reward_std": 0.059240641247015446, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8448398411273956, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 135.53125381469727, | |
| "epoch": 0.9710062535531552, | |
| "grad_norm": 7.44934606552124, | |
| "kl": 0.0675048828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7985945045948029, | |
| "reward_std": 0.12954094889573753, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.8298445492982864, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 141.2604217529297, | |
| "epoch": 0.9732802728823196, | |
| "grad_norm": 10.400691032409668, | |
| "kl": 0.06573486328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8283280432224274, | |
| "reward_std": 0.13098794838879257, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.859578013420105, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 137.05208587646484, | |
| "epoch": 0.9755542922114838, | |
| "grad_norm": 6.7781758308410645, | |
| "kl": 0.06414794921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8502939641475677, | |
| "reward_std": 0.01936683728126809, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8502939641475677, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 137.20833587646484, | |
| "epoch": 0.9778283115406481, | |
| "grad_norm": 8.260501861572266, | |
| "kl": 0.0726318359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.8315411806106567, | |
| "reward_std": 0.10588931851089001, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8523745536804199, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 135.75000762939453, | |
| "epoch": 0.9801023308698124, | |
| "grad_norm": 9.982911109924316, | |
| "kl": 0.06793212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8607007265090942, | |
| "reward_std": 0.0624299687333405, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8711173385381699, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 144.5833396911621, | |
| "epoch": 0.9823763501989767, | |
| "grad_norm": 3.751823663711548, | |
| "kl": 0.066162109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8520955741405487, | |
| "reward_std": 0.05543442675843835, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8520955145359039, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 134.0520896911621, | |
| "epoch": 0.984650369528141, | |
| "grad_norm": 3.716749429702759, | |
| "kl": 0.076904296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.7991288006305695, | |
| "reward_std": 0.10263975383713841, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8199621438980103, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 137.96875762939453, | |
| "epoch": 0.9869243888573053, | |
| "grad_norm": 8.06445598602295, | |
| "kl": 0.06341552734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.8634448647499084, | |
| "reward_std": 0.030836602323688567, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8634448498487473, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 136.0104217529297, | |
| "epoch": 0.9891984081864695, | |
| "grad_norm": 6.394524574279785, | |
| "kl": 0.0859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.8284251689910889, | |
| "reward_std": 0.10163544374518096, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8492584675550461, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 136.32292556762695, | |
| "epoch": 0.9914724275156339, | |
| "grad_norm": 5.4122314453125, | |
| "kl": 0.0718994140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.8416071236133575, | |
| "reward_std": 0.09167469386011362, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8624404817819595, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 134.3645839691162, | |
| "epoch": 0.9937464468447982, | |
| "grad_norm": 6.90580940246582, | |
| "kl": 0.06646728515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8837913274765015, | |
| "reward_std": 0.00442745303735137, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8837913274765015, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 131.6666717529297, | |
| "epoch": 0.9960204661739624, | |
| "grad_norm": 3.684438943862915, | |
| "kl": 0.065673828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.8093055188655853, | |
| "reward_std": 0.07611760849249549, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.8301387876272202, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 134.8541717529297, | |
| "epoch": 0.9982944855031268, | |
| "grad_norm": 7.167064189910889, | |
| "kl": 0.0731201171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.8386133015155792, | |
| "reward_std": 0.07322599086910486, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8490300327539444, | |
| "step": 439 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 439, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |