{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 458.96876525878906, "epoch": 0.0026666666666666666, "grad_norm": 0.46892526745796204, "kl": 0.0, "learning_rate": 5.263157894736843e-07, "loss": 0.0151, "reward": 0.27287947945296764, "reward_std": 0.4559166468679905, "rewards/accuracy_reward": 0.15178571781143546, "rewards/format_reward": 0.03571428800933063, "rewards/tag_count_reward": 0.08537946850992739, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 484.49109649658203, "epoch": 0.005333333333333333, "grad_norm": 0.3571339547634125, "kl": 0.0, "learning_rate": 1.0526315789473685e-06, "loss": 0.0171, "reward": 0.35435269586741924, "reward_std": 0.45219872146844864, "rewards/accuracy_reward": 0.23437501164153218, "rewards/format_reward": 0.03571428777649999, "rewards/tag_count_reward": 0.08426339691504836, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 485.7589530944824, "epoch": 0.008, "grad_norm": 0.40610507130622864, "kl": 0.00011050701141357422, "learning_rate": 1.5789473684210526e-06, "loss": 0.0203, "reward": 0.28962054662406445, "reward_std": 0.3996347077190876, "rewards/accuracy_reward": 0.18303572479635477, "rewards/format_reward": 0.02232142980210483, "rewards/tag_count_reward": 0.084263397147879, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 465.26341247558594, "epoch": 0.010666666666666666, "grad_norm": 0.40742260217666626, "kl": 0.00010585784912109375, "learning_rate": 2.105263157894737e-06, "loss": 0.0039, "reward": 0.2533482275903225, "reward_std": 0.40483200177550316, "rewards/accuracy_reward": 0.16517857555299997, "rewards/format_reward": 0.02232142980210483, "rewards/tag_count_reward": 0.06584821734577417, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 509.9531440734863, "epoch": 0.013333333333333334, "grad_norm": 0.39532214403152466, "kl": 0.0002484321594238281, "learning_rate": 2.631578947368421e-06, "loss": 0.022, "reward": 0.3203125111758709, "reward_std": 0.46360545977950096, "rewards/accuracy_reward": 0.16741072293370962, "rewards/format_reward": 0.046875001629814506, "rewards/tag_count_reward": 0.10602679150179029, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 471.4799270629883, "epoch": 0.016, "grad_norm": 0.4526801109313965, "kl": 0.002498626708984375, "learning_rate": 3.157894736842105e-06, "loss": 0.0443, "reward": 0.3766741268336773, "reward_std": 0.5414610058069229, "rewards/accuracy_reward": 0.12500000651925802, "rewards/format_reward": 0.07812500395812094, "rewards/tag_count_reward": 0.17354911426082253, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 464.9776954650879, "epoch": 0.018666666666666668, "grad_norm": 0.6029561758041382, "kl": 0.0282135009765625, "learning_rate": 3.6842105263157896e-06, "loss": 0.0547, "reward": 0.4609375260770321, "reward_std": 0.5835300870239735, "rewards/accuracy_reward": 0.15848214970901608, "rewards/format_reward": 0.10044643096625805, "rewards/tag_count_reward": 0.2020089365541935, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 486.8638610839844, "epoch": 0.021333333333333333, "grad_norm": 12.084760665893555, "kl": 0.3585205078125, "learning_rate": 4.210526315789474e-06, "loss": 0.0975, "reward": 0.6953125298023224, "reward_std": 0.6895988658070564, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.16964286379516125, "rewards/tag_count_reward": 0.3314732275903225, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 505.02457427978516, "epoch": 0.024, "grad_norm": 2.077465772628784, "kl": 0.099151611328125, "learning_rate": 4.736842105263158e-06, "loss": 0.0874, "reward": 0.6886161044239998, "reward_std": 0.7351350113749504, "rewards/accuracy_reward": 0.22991072619333863, "rewards/format_reward": 0.1830357238650322, "rewards/tag_count_reward": 0.27566965483129025, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 424.9910888671875, "epoch": 0.02666666666666667, "grad_norm": 0.5744335651397705, "kl": 0.0402679443359375, "learning_rate": 5.263157894736842e-06, "loss": 0.0758, "reward": 0.8264509364962578, "reward_std": 0.7150396555662155, "rewards/accuracy_reward": 0.341517873108387, "rewards/format_reward": 0.17187500931322575, "rewards/tag_count_reward": 0.3130580522119999, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 500.03350830078125, "epoch": 0.029333333333333333, "grad_norm": 0.587772786617279, "kl": 0.0297393798828125, "learning_rate": 5.789473684210527e-06, "loss": 0.0688, "reward": 0.651785746216774, "reward_std": 0.6237742006778717, "rewards/accuracy_reward": 0.2366071566939354, "rewards/format_reward": 0.16964286658912897, "rewards/tag_count_reward": 0.2455357275903225, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 492.5357360839844, "epoch": 0.032, "grad_norm": 0.4513722360134125, "kl": 0.020191192626953125, "learning_rate": 6.31578947368421e-06, "loss": 0.0719, "reward": 0.7382812947034836, "reward_std": 0.6591350436210632, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.20535715576261282, "rewards/tag_count_reward": 0.28292411752045155, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 421.35493087768555, "epoch": 0.034666666666666665, "grad_norm": 0.4251323342323303, "kl": 0.0242767333984375, "learning_rate": 6.842105263157896e-06, "loss": 0.0909, "reward": 0.8208705708384514, "reward_std": 0.655682947486639, "rewards/accuracy_reward": 0.37053573969751596, "rewards/format_reward": 0.18080357927829027, "rewards/tag_count_reward": 0.26953126303851604, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 452.12055587768555, "epoch": 0.037333333333333336, "grad_norm": 0.46205392479896545, "kl": 0.01677703857421875, "learning_rate": 7.368421052631579e-06, "loss": 0.1193, "reward": 1.0200893357396126, "reward_std": 0.7653229907155037, "rewards/accuracy_reward": 0.3325893059372902, "rewards/format_reward": 0.3102678768336773, "rewards/tag_count_reward": 0.37723215483129025, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 448.9710006713867, "epoch": 0.04, "grad_norm": 0.4762975573539734, "kl": 0.0328216552734375, "learning_rate": 7.894736842105265e-06, "loss": 0.1402, "reward": 1.0072545260190964, "reward_std": 0.7546271607279778, "rewards/accuracy_reward": 0.3125000186264515, "rewards/format_reward": 0.29464287124574184, "rewards/tag_count_reward": 0.400111623108387, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 385.267879486084, "epoch": 0.042666666666666665, "grad_norm": 0.7390420436859131, "kl": 0.057586669921875, "learning_rate": 8.421052631578948e-06, "loss": 0.1962, "reward": 1.1467634439468384, "reward_std": 0.855265200138092, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.4508928768336773, "rewards/tag_count_reward": 0.5373884215950966, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 313.7522487640381, "epoch": 0.04533333333333334, "grad_norm": 5.1166672706604, "kl": 0.2215576171875, "learning_rate": 8.947368421052632e-06, "loss": 0.2362, "reward": 1.5094866752624512, "reward_std": 0.7659965306520462, "rewards/accuracy_reward": 0.20758929196745157, "rewards/format_reward": 0.5825893096625805, "rewards/tag_count_reward": 0.7193080633878708, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 257.66518783569336, "epoch": 0.048, "grad_norm": 227.28211975097656, "kl": 4.34381103515625, "learning_rate": 9.473684210526315e-06, "loss": 0.1844, "reward": 1.6919643580913544, "reward_std": 0.5936451926827431, "rewards/accuracy_reward": 0.10937500605359674, "rewards/format_reward": 0.7366071790456772, "rewards/tag_count_reward": 0.8459821790456772, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 265.9732208251953, "epoch": 0.050666666666666665, "grad_norm": 1.1222331523895264, "kl": 0.14581298828125, "learning_rate": 1e-05, "loss": 0.1309, "reward": 1.865513488650322, "reward_std": 0.505423042923212, "rewards/accuracy_reward": 0.08705357438884676, "rewards/format_reward": 0.8459821864962578, "rewards/tag_count_reward": 0.9324777275323868, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 284.7544746398926, "epoch": 0.05333333333333334, "grad_norm": 1.1658185720443726, "kl": 0.17352294921875, "learning_rate": 1.0526315789473684e-05, "loss": 0.0248, "reward": 1.9095983058214188, "reward_std": 0.4629954472184181, "rewards/accuracy_reward": 0.07812500209547579, "rewards/format_reward": 0.8772321939468384, "rewards/tag_count_reward": 0.954241119325161, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 362.5602798461914, "epoch": 0.056, "grad_norm": 2.576747179031372, "kl": 0.215087890625, "learning_rate": 1.105263157894737e-05, "loss": -0.0262, "reward": 1.9508929550647736, "reward_std": 0.45306090638041496, "rewards/accuracy_reward": 0.08928571990691125, "rewards/format_reward": 0.8928571939468384, "rewards/tag_count_reward": 0.9687500447034836, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 402.1875190734863, "epoch": 0.058666666666666666, "grad_norm": 0.5915634036064148, "kl": 0.10125732421875, "learning_rate": 1.1578947368421053e-05, "loss": -0.024, "reward": 1.9921875894069672, "reward_std": 0.47527335956692696, "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.8973214775323868, "rewards/tag_count_reward": 0.9609375521540642, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 407.0379638671875, "epoch": 0.06133333333333333, "grad_norm": 1.3497282266616821, "kl": 0.19049072265625, "learning_rate": 1.2105263157894737e-05, "loss": -0.0088, "reward": 1.5842634737491608, "reward_std": 0.6860349476337433, "rewards/accuracy_reward": 0.3080357313156128, "rewards/format_reward": 0.4776786006987095, "rewards/tag_count_reward": 0.7985491454601288, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 391.67635345458984, "epoch": 0.064, "grad_norm": 0.8537726402282715, "kl": 0.16400146484375, "learning_rate": 1.263157894736842e-05, "loss": 0.0033, "reward": 1.8833706229925156, "reward_std": 0.6259407699108124, "rewards/accuracy_reward": 0.22098215599544346, "rewards/format_reward": 0.7433036118745804, "rewards/tag_count_reward": 0.9190848618745804, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 454.6205635070801, "epoch": 0.06666666666666667, "grad_norm": 1.942647099494934, "kl": 0.2576904296875, "learning_rate": 1.3157894736842108e-05, "loss": 0.0536, "reward": 1.9882813543081284, "reward_std": 0.48262083530426025, "rewards/accuracy_reward": 0.17187500931322575, "rewards/format_reward": 0.863839328289032, "rewards/tag_count_reward": 0.9525670036673546, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 542.8727912902832, "epoch": 0.06933333333333333, "grad_norm": 2.0896551609039307, "kl": 0.12127685546875, "learning_rate": 1.3684210526315791e-05, "loss": 0.0504, "reward": 1.8939732909202576, "reward_std": 0.5775122344493866, "rewards/accuracy_reward": 0.16294643306173384, "rewards/format_reward": 0.8125000298023224, "rewards/tag_count_reward": 0.918526828289032, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 586.6004753112793, "epoch": 0.072, "grad_norm": 1.5025179386138916, "kl": 0.171630859375, "learning_rate": 1.4210526315789475e-05, "loss": 0.0595, "reward": 1.727678656578064, "reward_std": 0.6498573049902916, "rewards/accuracy_reward": 0.1250000053551048, "rewards/format_reward": 0.714285746216774, "rewards/tag_count_reward": 0.8883929029107094, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 566.8102912902832, "epoch": 0.07466666666666667, "grad_norm": 32.01606369018555, "kl": 3.818359375, "learning_rate": 1.4736842105263159e-05, "loss": 0.2776, "reward": 1.5814732760190964, "reward_std": 0.7524110227823257, "rewards/accuracy_reward": 0.19419643934816122, "rewards/format_reward": 0.589285746216774, "rewards/tag_count_reward": 0.7979911044239998, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 560.9710121154785, "epoch": 0.07733333333333334, "grad_norm": 27.419784545898438, "kl": 2.98828125, "learning_rate": 1.5263157894736846e-05, "loss": 0.3264, "reward": 1.4570313096046448, "reward_std": 0.8036252707242966, "rewards/accuracy_reward": 0.18526786705479026, "rewards/format_reward": 0.5111607350409031, "rewards/tag_count_reward": 0.7606027126312256, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 678.9933395385742, "epoch": 0.08, "grad_norm": 5.0629682540893555, "kl": 0.550537109375, "learning_rate": 1.578947368421053e-05, "loss": 0.2401, "reward": 1.18526791036129, "reward_std": 0.7161316871643066, "rewards/accuracy_reward": 0.14062500419095159, "rewards/format_reward": 0.3348214440047741, "rewards/tag_count_reward": 0.7098214626312256, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 660.4018173217773, "epoch": 0.08266666666666667, "grad_norm": 37.217041015625, "kl": 0.595703125, "learning_rate": 1.6315789473684213e-05, "loss": 0.142, "reward": 1.1584821939468384, "reward_std": 0.6607379615306854, "rewards/accuracy_reward": 0.15625000931322575, "rewards/format_reward": 0.2745535895228386, "rewards/tag_count_reward": 0.7276786118745804, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 658.4107513427734, "epoch": 0.08533333333333333, "grad_norm": 6019.00439453125, "kl": 20.4326171875, "learning_rate": 1.6842105263157896e-05, "loss": 1.3873, "reward": 0.9341518431901932, "reward_std": 0.5787044316530228, "rewards/accuracy_reward": 0.058035716181620955, "rewards/format_reward": 0.14508929196745157, "rewards/tag_count_reward": 0.7310268208384514, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 580.5580596923828, "epoch": 0.088, "grad_norm": 290.9708557128906, "kl": 8.6171875, "learning_rate": 1.736842105263158e-05, "loss": 0.4883, "reward": 0.7516741380095482, "reward_std": 0.5007706061005592, "rewards/accuracy_reward": 0.029017858440056443, "rewards/format_reward": 0.07812500349245965, "rewards/tag_count_reward": 0.644531287252903, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 583.0335083007812, "epoch": 0.09066666666666667, "grad_norm": 16.34986114501953, "kl": 1.13134765625, "learning_rate": 1.7894736842105264e-05, "loss": 0.0601, "reward": 0.7003348544239998, "reward_std": 0.5002335086464882, "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.07589286053553224, "rewards/tag_count_reward": 0.577566996216774, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 507.6719093322754, "epoch": 0.09333333333333334, "grad_norm": 7.823866367340088, "kl": 1.912109375, "learning_rate": 1.8421052631578947e-05, "loss": -0.0674, "reward": 0.6406250298023224, "reward_std": 0.5031706914305687, "rewards/accuracy_reward": 0.02232142980210483, "rewards/format_reward": 0.08928571757860482, "rewards/tag_count_reward": 0.5290178842842579, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 355.6808223724365, "epoch": 0.096, "grad_norm": 4.270777225494385, "kl": 1.470703125, "learning_rate": 1.894736842105263e-05, "loss": -0.2041, "reward": 0.6618303954601288, "reward_std": 0.5147153101861477, "rewards/accuracy_reward": 0.026785714784637094, "rewards/format_reward": 0.10044643189758062, "rewards/tag_count_reward": 0.5345982424914837, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 317.5245666503906, "epoch": 0.09866666666666667, "grad_norm": 4.440794467926025, "kl": 1.41943359375, "learning_rate": 1.9473684210526318e-05, "loss": -0.1957, "reward": 0.758370578289032, "reward_std": 0.5318198576569557, "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.1361607201397419, "rewards/tag_count_reward": 0.6065848544239998, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 293.32813835144043, "epoch": 0.10133333333333333, "grad_norm": 1485.8994140625, "kl": 12.12353515625, "learning_rate": 2e-05, "loss": 0.1922, "reward": 1.1914063096046448, "reward_std": 0.6419440135359764, "rewards/accuracy_reward": 0.015625000931322575, "rewards/format_reward": 0.5312500298023224, "rewards/tag_count_reward": 0.6445312723517418, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 209.48884773254395, "epoch": 0.104, "grad_norm": 990314.5, "kl": 6982.73046875, "learning_rate": 1.999956548296958e-05, "loss": 215.6623, "reward": 0.6473214477300644, "reward_std": 0.5485238395631313, "rewards/accuracy_reward": 0.017857143888249993, "rewards/format_reward": 0.13169643469154835, "rewards/tag_count_reward": 0.4977678842842579, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 172.20759868621826, "epoch": 0.10666666666666667, "grad_norm": 9381907.0, "kl": 130954.7265625, "learning_rate": 1.9998261969639324e-05, "loss": 5173.1377, "reward": 0.4804687649011612, "reward_std": 0.39905556850135326, "rewards/accuracy_reward": 0.04464286030270159, "rewards/format_reward": 0.049107144586741924, "rewards/tag_count_reward": 0.3867187649011612, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 208.4575958251953, "epoch": 0.10933333333333334, "grad_norm": 174.25518798828125, "kl": 8.283203125, "learning_rate": 1.9996089573288985e-05, "loss": -0.2933, "reward": 0.5055803842842579, "reward_std": 0.4746779501438141, "rewards/accuracy_reward": 0.0669642889406532, "rewards/format_reward": 0.04687500232830644, "rewards/tag_count_reward": 0.3917410895228386, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 123.5022382736206, "epoch": 0.112, "grad_norm": 135551.15625, "kl": 210.92578125, "learning_rate": 1.99930484827072e-05, "loss": 11.072, "reward": 0.3387276977300644, "reward_std": 0.37313414365053177, "rewards/accuracy_reward": 0.02901785890571773, "rewards/format_reward": 0.026785715715959668, "rewards/tag_count_reward": 0.282924123108387, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 228.55134963989258, "epoch": 0.11466666666666667, "grad_norm": 366.1275939941406, "kl": 15.40625, "learning_rate": 1.9989138962175105e-05, "loss": -0.214, "reward": 0.4938616268336773, "reward_std": 0.5711234211921692, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.13392857741564512, "rewards/tag_count_reward": 0.3152901902794838, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 247.72992134094238, "epoch": 0.11733333333333333, "grad_norm": 53.45908737182617, "kl": 10.09765625, "learning_rate": 1.9984361351443343e-05, "loss": -0.4247, "reward": 0.5357142984867096, "reward_std": 0.6442215740680695, "rewards/accuracy_reward": 0.07142857532016933, "rewards/format_reward": 0.15625000558793545, "rewards/tag_count_reward": 0.3080357313156128, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 267.9419746398926, "epoch": 0.12, "grad_norm": 8.117420196533203, "kl": 4.306640625, "learning_rate": 1.9978716065702566e-05, "loss": -0.3767, "reward": 0.5859375260770321, "reward_std": 0.5742851197719574, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.1718750074505806, "rewards/tag_count_reward": 0.3738839440047741, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 327.2009162902832, "epoch": 0.12266666666666666, "grad_norm": 390668.9375, "kl": 652.412109375, "learning_rate": 1.9972203595547334e-05, "loss": 27.4051, "reward": 0.820870578289032, "reward_std": 0.7370434999465942, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.3504464440047741, "rewards/tag_count_reward": 0.4168526977300644, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 308.95314025878906, "epoch": 0.12533333333333332, "grad_norm": 3.3901305198669434, "kl": 1.37255859375, "learning_rate": 1.996482450693348e-05, "loss": -0.235, "reward": 0.9084821864962578, "reward_std": 0.7239489033818245, "rewards/accuracy_reward": 0.04241071594879031, "rewards/format_reward": 0.3526785932481289, "rewards/tag_count_reward": 0.5133928842842579, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 269.2343864440918, "epoch": 0.128, "grad_norm": 13.403223037719727, "kl": 0.79248046875, "learning_rate": 1.9956579441128942e-05, "loss": -0.1234, "reward": 0.627790205180645, "reward_std": 0.3243283350020647, "rewards/accuracy_reward": 0.040178572526201606, "rewards/format_reward": 0.017857144121080637, "rewards/tag_count_reward": 0.5697544887661934, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 244.00894165039062, "epoch": 0.13066666666666665, "grad_norm": 1.7707425355911255, "kl": 0.734130859375, "learning_rate": 1.994746911465802e-05, "loss": -0.0886, "reward": 0.530133955180645, "reward_std": 0.24073401279747486, "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.5122768133878708, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 277.7120609283447, "epoch": 0.13333333333333333, "grad_norm": 1.7800672054290771, "kl": 1.8212890625, "learning_rate": 1.9937494319239112e-05, "loss": -0.1572, "reward": 0.5022321678698063, "reward_std": 0.2955008540302515, "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.4888393059372902, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 295.51340675354004, "epoch": 0.136, "grad_norm": 10.020119667053223, "kl": 5.904296875, "learning_rate": 1.9926655921715924e-05, "loss": -0.3163, "reward": 0.3733259104192257, "reward_std": 0.2695033699274063, "rewards/accuracy_reward": 0.026785715948790312, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3465401940047741, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 441.823673248291, "epoch": 0.13866666666666666, "grad_norm": 2.06915283203125, "kl": 2.1552734375, "learning_rate": 1.9914954863982106e-05, "loss": -0.3148, "reward": 0.2756696492433548, "reward_std": 0.17585041373968124, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.266741082072258, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 938.0960235595703, "epoch": 0.14133333333333334, "grad_norm": 0.31667816638946533, "kl": 0.596923828125, "learning_rate": 1.990239216289944e-05, "loss": -0.1824, "reward": 0.23493304289877415, "reward_std": 0.07539755944162607, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23046875931322575, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 930.7031631469727, "epoch": 0.144, "grad_norm": 0.257773756980896, "kl": 0.6866455078125, "learning_rate": 1.9888968910209433e-05, "loss": -0.1836, "reward": 0.22935268841683865, "reward_std": 0.062135092448443174, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.22935268841683865, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 959.5089569091797, "epoch": 0.14666666666666667, "grad_norm": 0.38278838992118835, "kl": 0.61932373046875, "learning_rate": 1.9874686272438467e-05, "loss": -0.1345, "reward": 0.23437500931322575, "reward_std": 0.04414202296175063, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23437500931322575, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 989.2812881469727, "epoch": 0.14933333333333335, "grad_norm": 5.235660552978516, "kl": 0.56573486328125, "learning_rate": 1.9859545490796414e-05, "loss": -0.0736, "reward": 0.23772322572767735, "reward_std": 0.04692125436849892, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.235491082072258, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 984.7567443847656, "epoch": 0.152, "grad_norm": 0.7513731718063354, "kl": 1.2568359375, "learning_rate": 1.9843547881068763e-05, "loss": -0.0667, "reward": 0.23493304289877415, "reward_std": 0.0440019175875932, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23493304289877415, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 1004.8973541259766, "epoch": 0.15466666666666667, "grad_norm": 2176257.75, "kl": 24204.224578857422, "learning_rate": 1.9826694833502295e-05, "loss": 964.5377, "reward": 0.23995536752045155, "reward_std": 0.0476194906514138, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23995536752045155, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 1013.3593978881836, "epoch": 0.15733333333333333, "grad_norm": 3.9350333213806152, "kl": 6.3824462890625, "learning_rate": 1.9808987812684247e-05, "loss": -0.0163, "reward": 0.239397332072258, "reward_std": 0.04491220973432064, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.239397332072258, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 1002.2120819091797, "epoch": 0.16, "grad_norm": 10.338529586791992, "kl": 2.3958740234375, "learning_rate": 1.979042835741503e-05, "loss": -0.0206, "reward": 0.2265625111758709, "reward_std": 0.07814983604475856, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2265625111758709, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 1009.8348388671875, "epoch": 0.16266666666666665, "grad_norm": 0.21856586635112762, "kl": 0.22381591796875, "learning_rate": 1.9771018080574534e-05, "loss": -0.0197, "reward": 0.22767858020961285, "reward_std": 0.058380599366500974, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.22767858020961285, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 1009.4732360839844, "epoch": 0.16533333333333333, "grad_norm": 0.38580521941185, "kl": 0.169189453125, "learning_rate": 1.9750758668981925e-05, "loss": -0.0383, "reward": 0.24051340483129025, "reward_std": 0.03663408872671425, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24051340483129025, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 979.4420013427734, "epoch": 0.168, "grad_norm": 0.6577736735343933, "kl": 0.9158935546875, "learning_rate": 1.9729651883249075e-05, "loss": -0.0914, "reward": 0.2399553656578064, "reward_std": 0.03222780209034681, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2399553656578064, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 1006.5223541259766, "epoch": 0.17066666666666666, "grad_norm": 10.438541412353516, "kl": 1.66058349609375, "learning_rate": 1.9707699557627554e-05, "loss": -0.005, "reward": 0.2460937574505806, "reward_std": 0.014615848893299699, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2460937574505806, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 996.0647659301758, "epoch": 0.17333333333333334, "grad_norm": 0.5891804099082947, "kl": 0.95574951171875, "learning_rate": 1.968490359984923e-05, "loss": -0.0622, "reward": 0.2421875074505806, "reward_std": 0.024542340775951743, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2421875074505806, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 996.9174499511719, "epoch": 0.176, "grad_norm": 0.47327500581741333, "kl": 0.97607421875, "learning_rate": 1.9661265990960486e-05, "loss": -0.0374, "reward": 0.24386161379516125, "reward_std": 0.021628810092806816, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24386161379516125, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 1009.5178833007812, "epoch": 0.17866666666666667, "grad_norm": 6.1433186531066895, "kl": 15.404144287109375, "learning_rate": 1.9636788785150037e-05, "loss": -0.0088, "reward": 0.24609375558793545, "reward_std": 0.013276896439492702, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24609375558793545, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 1018.4620666503906, "epoch": 0.18133333333333335, "grad_norm": 1.1267215013504028, "kl": 0.652923583984375, "learning_rate": 1.9611474109570446e-05, "loss": -0.007, "reward": 0.2483258955180645, "reward_std": 0.006263935239985585, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2483258955180645, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 1019.7433166503906, "epoch": 0.184, "grad_norm": 0.2426103949546814, "kl": 0.08197021484375, "learning_rate": 1.9585324164153236e-05, "loss": -0.0126, "reward": 0.2477678582072258, "reward_std": 0.008351913653314114, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2477678582072258, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 1015.9888534545898, "epoch": 0.18666666666666668, "grad_norm": 0.12586605548858643, "kl": 0.143280029296875, "learning_rate": 1.9558341221417744e-05, "loss": -0.0217, "reward": 0.2472098283469677, "reward_std": 0.009100939147174358, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2472098283469677, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 1018.0803756713867, "epoch": 0.18933333333333333, "grad_norm": 0.38826289772987366, "kl": 0.079498291015625, "learning_rate": 1.9530527626273592e-05, "loss": -0.0119, "reward": 0.246651791036129, "reward_std": 0.01252787047997117, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.246651791036129, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 1015.8348388671875, "epoch": 0.192, "grad_norm": 615.1087036132812, "kl": 32.953125, "learning_rate": 1.9501885795816937e-05, "loss": 1.3001, "reward": 0.246651791036129, "reward_std": 0.01252787047997117, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.246651791036129, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 1017.616081237793, "epoch": 0.19466666666666665, "grad_norm": 0.04497808218002319, "kl": 0.08001708984375, "learning_rate": 1.9472418219120403e-05, "loss": -0.0054, "reward": 0.2494419664144516, "reward_std": 0.0020879784133285284, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2494419664144516, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 1018.8951110839844, "epoch": 0.19733333333333333, "grad_norm": 2.6043214797973633, "kl": 0.47564697265625, "learning_rate": 1.9442127457016768e-05, "loss": -0.0041, "reward": 0.24720982648432255, "reward_std": 0.010439892299473286, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24720982648432255, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 1001.9643249511719, "epoch": 0.2, "grad_norm": 0.7235760688781738, "kl": 0.35595703125, "learning_rate": 1.9411016141876438e-05, "loss": -0.0072, "reward": 0.2460937574505806, "reward_std": 0.01745285326614976, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2460937574505806, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 1001.7567443847656, "epoch": 0.20266666666666666, "grad_norm": 11.015027046203613, "kl": 0.2857666015625, "learning_rate": 1.9379086977378664e-05, "loss": -0.0117, "reward": 0.2466517947614193, "reward_std": 0.015364875085651875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2466517947614193, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 980.8705825805664, "epoch": 0.20533333333333334, "grad_norm": 0.5128747224807739, "kl": 0.2799072265625, "learning_rate": 1.9346342738276593e-05, "loss": -0.0305, "reward": 0.24665179289877415, "reward_std": 0.032296012388542295, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24441965110599995, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 942.6451416015625, "epoch": 0.208, "grad_norm": 8.115665435791016, "kl": 1.64599609375, "learning_rate": 1.9312786270156135e-05, "loss": 0.0306, "reward": 0.24497768841683865, "reward_std": 0.03677184786647558, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24274554289877415, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 947.0089797973633, "epoch": 0.21066666666666667, "grad_norm": 86.71783447265625, "kl": 10.751220703125, "learning_rate": 1.927842048918867e-05, "loss": 0.3956, "reward": 0.2527901902794838, "reward_std": 0.041396952932700515, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2483259029686451, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 976.4687881469727, "epoch": 0.21333333333333335, "grad_norm": 0.8117868304252625, "kl": 1.150146484375, "learning_rate": 1.9243248381877605e-05, "loss": 0.0132, "reward": 0.2594866156578064, "reward_std": 0.061963471584022045, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2483258992433548, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 999.3281707763672, "epoch": 0.216, "grad_norm": 0.5710753798484802, "kl": 0.2978515625, "learning_rate": 1.9207273004798873e-05, "loss": -0.0116, "reward": 0.25279018841683865, "reward_std": 0.04546203720383346, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24832589738070965, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 1006.6428985595703, "epoch": 0.21866666666666668, "grad_norm": 0.4451937973499298, "kl": 0.3206787109375, "learning_rate": 1.9170497484335276e-05, "loss": -0.0188, "reward": 0.2695312611758709, "reward_std": 0.09501239191740751, "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2516741156578064, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1012.850471496582, "epoch": 0.22133333333333333, "grad_norm": 0.6105583310127258, "kl": 0.556640625, "learning_rate": 1.9132925016404805e-05, "loss": 0.0129, "reward": 0.3136160857975483, "reward_std": 0.1892830766737461, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2645089440047741, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 1011.8192291259766, "epoch": 0.224, "grad_norm": 0.4263147711753845, "kl": 1.17919921875, "learning_rate": 1.9094558866182892e-05, "loss": 0.0313, "reward": 0.4051339440047741, "reward_std": 0.2580750435590744, "rewards/accuracy_reward": 0.11830357857979834, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2868303693830967, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 994.6518173217773, "epoch": 0.22666666666666666, "grad_norm": 4.379977226257324, "kl": 2.775390625, "learning_rate": 1.9055402367818673e-05, "loss": 0.0242, "reward": 0.4547991268336773, "reward_std": 0.2754308069124818, "rewards/accuracy_reward": 0.17633929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2784598357975483, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 981.888427734375, "epoch": 0.22933333333333333, "grad_norm": 0.5411613583564758, "kl": 1.14501953125, "learning_rate": 1.901545892414523e-05, "loss": 0.0135, "reward": 0.474888414144516, "reward_std": 0.298506336286664, "rewards/accuracy_reward": 0.18973215203732252, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2851562649011612, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 979.1473693847656, "epoch": 0.232, "grad_norm": 0.31463608145713806, "kl": 1.203125, "learning_rate": 1.897473200638386e-05, "loss": -0.0059, "reward": 0.4665178768336773, "reward_std": 0.2942663496360183, "rewards/accuracy_reward": 0.19866072945296764, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2678571566939354, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 1000.0179061889648, "epoch": 0.23466666666666666, "grad_norm": 0.2776876986026764, "kl": 0.96484375, "learning_rate": 1.8933225153842446e-05, "loss": 0.0193, "reward": 0.4268973432481289, "reward_std": 0.25522872246801853, "rewards/accuracy_reward": 0.16517858393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2617187574505806, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 996.5000305175781, "epoch": 0.23733333333333334, "grad_norm": 0.5454741716384888, "kl": 1.11767578125, "learning_rate": 1.8890941973607843e-05, "loss": 0.0167, "reward": 0.4676339514553547, "reward_std": 0.2707557659596205, "rewards/accuracy_reward": 0.22544644214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2421875074505806, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 996.5893325805664, "epoch": 0.24, "grad_norm": 0.45834293961524963, "kl": 1.054443359375, "learning_rate": 1.8847886140232438e-05, "loss": 0.0215, "reward": 0.3158482275903225, "reward_std": 0.2969865184277296, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.16852679289877415, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 927.3036041259766, "epoch": 0.24266666666666667, "grad_norm": 2.235508918762207, "kl": 2.0927734375, "learning_rate": 1.8804061395414795e-05, "loss": 0.0831, "reward": 0.2265625149011612, "reward_std": 0.2394270822405815, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1484375074505806, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 954.2366485595703, "epoch": 0.24533333333333332, "grad_norm": 0.3937546908855438, "kl": 1.47265625, "learning_rate": 1.875947154767452e-05, "loss": 0.0661, "reward": 0.19140626024454832, "reward_std": 0.23799890838563442, "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.12890625651925802, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 1003.2478103637695, "epoch": 0.248, "grad_norm": 0.41460925340652466, "kl": 1.29296875, "learning_rate": 1.8714120472021252e-05, "loss": 0.0501, "reward": 0.23828126303851604, "reward_std": 0.28448878042399883, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1400669701397419, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 995.8393096923828, "epoch": 0.25066666666666665, "grad_norm": 0.31394830346107483, "kl": 1.2431640625, "learning_rate": 1.8668012109617933e-05, "loss": 0.0441, "reward": 0.25558036752045155, "reward_std": 0.28702029772102833, "rewards/accuracy_reward": 0.10491071850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1506696492433548, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 1013.6808471679688, "epoch": 0.25333333333333335, "grad_norm": 0.26716697216033936, "kl": 1.11474609375, "learning_rate": 1.862115046743831e-05, "loss": 0.039, "reward": 0.30747769959270954, "reward_std": 0.3026493303477764, "rewards/accuracy_reward": 0.12053572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1869419738650322, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 1017.1518173217773, "epoch": 0.256, "grad_norm": 0.28958389163017273, "kl": 1.10400390625, "learning_rate": 1.85735396179187e-05, "loss": 0.0416, "reward": 0.4603794813156128, "reward_std": 0.3755842447280884, "rewards/accuracy_reward": 0.20312500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25725447572767735, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 1013.7857437133789, "epoch": 0.25866666666666666, "grad_norm": 0.2677743434906006, "kl": 1.009765625, "learning_rate": 1.8525183698604098e-05, "loss": 0.0371, "reward": 0.4949777014553547, "reward_std": 0.3382803946733475, "rewards/accuracy_reward": 0.18303572293370962, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3119419813156128, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 1012.8259201049805, "epoch": 0.2613333333333333, "grad_norm": 0.2341116964817047, "kl": 0.853759765625, "learning_rate": 1.8476086911788588e-05, "loss": 0.0321, "reward": 0.510044664144516, "reward_std": 0.2946113534271717, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3448660857975483, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 1005.520133972168, "epoch": 0.264, "grad_norm": 0.4626868665218353, "kl": 1.1552734375, "learning_rate": 1.8426253524150176e-05, "loss": 0.0344, "reward": 0.5691964514553547, "reward_std": 0.34905775636434555, "rewards/accuracy_reward": 0.21205358253791928, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.357142873108387, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 1007.6183471679688, "epoch": 0.26666666666666666, "grad_norm": 0.4349505603313446, "kl": 0.7802734375, "learning_rate": 1.8375687866379988e-05, "loss": 0.0271, "reward": 0.5998884215950966, "reward_std": 0.33171170204877853, "rewards/accuracy_reward": 0.22544644167646766, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.3722098395228386, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 1005.4777221679688, "epoch": 0.2693333333333333, "grad_norm": 0.4926176071166992, "kl": 0.667236328125, "learning_rate": 1.8324394332805913e-05, "loss": 0.0161, "reward": 0.6283482350409031, "reward_std": 0.330145962536335, "rewards/accuracy_reward": 0.24776786658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3805803768336773, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 1007.0223617553711, "epoch": 0.272, "grad_norm": 0.5684264898300171, "kl": 0.4881591796875, "learning_rate": 1.8272377381010726e-05, "loss": 0.0033, "reward": 0.6183036044239998, "reward_std": 0.32388063333928585, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3683035932481289, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 1007.1875457763672, "epoch": 0.27466666666666667, "grad_norm": 4.337119102478027, "kl": 0.94384765625, "learning_rate": 1.8219641531444713e-05, "loss": 0.031, "reward": 0.7594866454601288, "reward_std": 0.3420899845659733, "rewards/accuracy_reward": 0.3883928768336773, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3710937723517418, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 1001.1563034057617, "epoch": 0.2773333333333333, "grad_norm": 0.5998754501342773, "kl": 0.375, "learning_rate": 1.8166191367032828e-05, "loss": 0.0018, "reward": 0.6891741305589676, "reward_std": 0.38426094129681587, "rewards/accuracy_reward": 0.3147321566939354, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.3722098357975483, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 1015.3370895385742, "epoch": 0.28, "grad_norm": 0.2921030521392822, "kl": 0.419189453125, "learning_rate": 1.811203153277641e-05, "loss": 0.0097, "reward": 0.8236607536673546, "reward_std": 0.32553235441446304, "rewards/accuracy_reward": 0.4330357424914837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250149011612, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 994.6964645385742, "epoch": 0.2826666666666667, "grad_norm": 3.2729032039642334, "kl": 0.99658203125, "learning_rate": 1.8057166735349533e-05, "loss": 0.0083, "reward": 0.7879464700818062, "reward_std": 0.33463743701577187, "rewards/accuracy_reward": 0.3906250149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3973214440047741, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 983.4241485595703, "epoch": 0.2853333333333333, "grad_norm": 1.0045377016067505, "kl": 0.7274169921875, "learning_rate": 1.800160174268996e-05, "loss": -0.0159, "reward": 0.6489955708384514, "reward_std": 0.3681374154984951, "rewards/accuracy_reward": 0.25446429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3945312686264515, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 986.0915603637695, "epoch": 0.288, "grad_norm": 0.5037855505943298, "kl": 0.580078125, "learning_rate": 1.7945341383584818e-05, "loss": -0.0129, "reward": 0.7650670036673546, "reward_std": 0.39130162820219994, "rewards/accuracy_reward": 0.35044644586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4146205559372902, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 965.9866561889648, "epoch": 0.2906666666666667, "grad_norm": 1.3588380813598633, "kl": 1.8997802734375, "learning_rate": 1.7888390547250944e-05, "loss": 0.032, "reward": 0.7505580708384514, "reward_std": 0.3513164669275284, "rewards/accuracy_reward": 0.3325893059372902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4179687649011612, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 946.988883972168, "epoch": 0.29333333333333333, "grad_norm": 0.42907217144966125, "kl": 1.02099609375, "learning_rate": 1.7830754182909985e-05, "loss": -0.002, "reward": 0.8191964626312256, "reward_std": 0.3892271090298891, "rewards/accuracy_reward": 0.39732144633308053, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4218750186264515, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 908.0603103637695, "epoch": 0.296, "grad_norm": 0.554644763469696, "kl": 1.03662109375, "learning_rate": 1.7772437299358324e-05, "loss": -0.0007, "reward": 0.7488839663565159, "reward_std": 0.34201290644705296, "rewards/accuracy_reward": 0.3102678705472499, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4386160932481289, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 912.4442443847656, "epoch": 0.2986666666666667, "grad_norm": 0.5132027268409729, "kl": 1.64404296875, "learning_rate": 1.771344496453177e-05, "loss": 0.0156, "reward": 0.7243304029107094, "reward_std": 0.36312241293489933, "rewards/accuracy_reward": 0.28348215762525797, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4408482313156128, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 946.7210235595703, "epoch": 0.30133333333333334, "grad_norm": 2.337268114089966, "kl": 4.0595703125, "learning_rate": 1.7653782305065158e-05, "loss": 0.0725, "reward": 0.7773437798023224, "reward_std": 0.35461460426449776, "rewards/accuracy_reward": 0.2991071557626128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4782366268336773, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 971.0312957763672, "epoch": 0.304, "grad_norm": 0.5845390558242798, "kl": 2.72265625, "learning_rate": 1.7593454505846807e-05, "loss": 0.0799, "reward": 0.781808078289032, "reward_std": 0.3098542857915163, "rewards/accuracy_reward": 0.3125000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4693080484867096, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 999.4821701049805, "epoch": 0.30666666666666664, "grad_norm": 0.36925044655799866, "kl": 1.072998046875, "learning_rate": 1.753246680956795e-05, "loss": 0.0012, "reward": 0.6941964626312256, "reward_std": 0.3417879194021225, "rewards/accuracy_reward": 0.27455358393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4196428768336773, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 997.1049423217773, "epoch": 0.30933333333333335, "grad_norm": 0.7683995366096497, "kl": 0.99810791015625, "learning_rate": 1.7470824516267125e-05, "loss": -0.0258, "reward": 0.5870536006987095, "reward_std": 0.3048571478575468, "rewards/accuracy_reward": 0.16741072060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4196428805589676, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 1004.1183319091797, "epoch": 0.312, "grad_norm": 0.6834697127342224, "kl": 0.769775390625, "learning_rate": 1.7408532982869573e-05, "loss": -0.0129, "reward": 0.6238839514553547, "reward_std": 0.32993016950786114, "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4341518059372902, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 856.0893325805664, "epoch": 0.31466666666666665, "grad_norm": 18.79123306274414, "kl": 7.33984375, "learning_rate": 1.7345597622721727e-05, "loss": 0.1389, "reward": 0.5736607387661934, "reward_std": 0.3069217577576637, "rewards/accuracy_reward": 0.12723214901052415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4464285895228386, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 857.9777145385742, "epoch": 0.31733333333333336, "grad_norm": 6.959547996520996, "kl": 5.00390625, "learning_rate": 1.7282023905120743e-05, "loss": 0.0227, "reward": 0.4944196678698063, "reward_std": 0.23757354356348515, "rewards/accuracy_reward": 0.05133928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4430803768336773, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 914.3504867553711, "epoch": 0.32, "grad_norm": 2.665106773376465, "kl": 0.91259765625, "learning_rate": 1.721781735483921e-05, "loss": -0.0911, "reward": 0.5279018022119999, "reward_std": 0.2325394507497549, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4676339514553547, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 966.8370971679688, "epoch": 0.32266666666666666, "grad_norm": 1.2509655952453613, "kl": 0.35968017578125, "learning_rate": 1.7152983551645054e-05, "loss": -0.06, "reward": 0.5563616342842579, "reward_std": 0.25420672446489334, "rewards/accuracy_reward": 0.08705357438884676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4693080596625805, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 981.8326263427734, "epoch": 0.3253333333333333, "grad_norm": 0.5648319125175476, "kl": 0.5328369140625, "learning_rate": 1.708752812981659e-05, "loss": -0.0362, "reward": 0.5742187760770321, "reward_std": 0.26508820056915283, "rewards/accuracy_reward": 0.10491071850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4693080522119999, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 959.6272735595703, "epoch": 0.328, "grad_norm": 10.74463939666748, "kl": 1.951416015625, "learning_rate": 1.702145677765293e-05, "loss": -0.0225, "reward": 0.6300223544239998, "reward_std": 0.28988964296877384, "rewards/accuracy_reward": 0.13616071711294353, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4938616305589676, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 980.8192443847656, "epoch": 0.33066666666666666, "grad_norm": 4.069360256195068, "kl": 1.291015625, "learning_rate": 1.6954775236979616e-05, "loss": -0.0099, "reward": 0.5474330633878708, "reward_std": 0.2495187409222126, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482700914144516, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 950.1986999511719, "epoch": 0.3333333333333333, "grad_norm": 8.897069931030273, "kl": 2.5947265625, "learning_rate": 1.6887489302649657e-05, "loss": 0.0202, "reward": 0.6049107387661934, "reward_std": 0.31144498474895954, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107350409031, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 987.8058395385742, "epoch": 0.336, "grad_norm": 2.9771108627319336, "kl": 1.3916015625, "learning_rate": 1.6819604822039924e-05, "loss": 0.0017, "reward": 0.5251116268336773, "reward_std": 0.21684774663299322, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687723517418, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 982.4085311889648, "epoch": 0.33866666666666667, "grad_norm": 1.1636186838150024, "kl": 1.030029296875, "learning_rate": 1.6751127694543012e-05, "loss": -0.02, "reward": 0.5340402089059353, "reward_std": 0.21295001544058323, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.487165205180645, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 980.8795166015625, "epoch": 0.3413333333333333, "grad_norm": 1.9692587852478027, "kl": 9.68798828125, "learning_rate": 1.6682063871054534e-05, "loss": -0.0215, "reward": 0.5200893133878708, "reward_std": 0.2094257827848196, "rewards/accuracy_reward": 0.037946430733427405, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4821428768336773, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 1005.0111923217773, "epoch": 0.344, "grad_norm": 1.1740566492080688, "kl": 0.3739013671875, "learning_rate": 1.661241935345599e-05, "loss": -0.0071, "reward": 0.6735491380095482, "reward_std": 0.2998756691813469, "rewards/accuracy_reward": 0.16517857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5083705559372902, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 991.6607360839844, "epoch": 0.3466666666666667, "grad_norm": 0.4999859929084778, "kl": 0.33416748046875, "learning_rate": 1.654220019409317e-05, "loss": -0.0413, "reward": 0.6049107499420643, "reward_std": 0.21885671466588974, "rewards/accuracy_reward": 0.10044643492437899, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5044643096625805, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 1006.4777069091797, "epoch": 0.34933333333333333, "grad_norm": 0.31324487924575806, "kl": 0.33984375, "learning_rate": 1.6471412495250195e-05, "loss": -0.0189, "reward": 0.580915205180645, "reward_std": 0.22965830191969872, "rewards/accuracy_reward": 0.07589285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5050223432481289, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 1021.9040222167969, "epoch": 0.352, "grad_norm": 0.3059490919113159, "kl": 0.1947021484375, "learning_rate": 1.640006240861921e-05, "loss": 0.0013, "reward": 0.6244419887661934, "reward_std": 0.23875875398516655, "rewards/accuracy_reward": 0.10937500582076609, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5150669887661934, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 1007.4710083007812, "epoch": 0.3546666666666667, "grad_norm": 0.9267669916152954, "kl": 0.306396484375, "learning_rate": 1.632815613476576e-05, "loss": -0.0092, "reward": 0.6741071715950966, "reward_std": 0.3328205347061157, "rewards/accuracy_reward": 0.19419643934816122, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107313156128, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 746.9375305175781, "epoch": 0.35733333333333334, "grad_norm": 6.944976329803467, "kl": 1.9111328125, "learning_rate": 1.6255699922589968e-05, "loss": -0.0158, "reward": 0.5117187686264515, "reward_std": 0.33955446630716324, "rewards/accuracy_reward": 0.14732143585570157, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.3599330522119999, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 688.2656555175781, "epoch": 0.36, "grad_norm": 4.551051139831543, "kl": 1.47119140625, "learning_rate": 1.6182700068783463e-05, "loss": 0.074, "reward": 0.4464285969734192, "reward_std": 0.32823895290493965, "rewards/accuracy_reward": 0.10714286146685481, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.3370535857975483, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 932.7768249511719, "epoch": 0.3626666666666667, "grad_norm": 0.7402526140213013, "kl": 0.3092041015625, "learning_rate": 1.610916291728218e-05, "loss": 0.015, "reward": 0.6032366417348385, "reward_std": 0.4040101356804371, "rewards/accuracy_reward": 0.19866072572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4045759104192257, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 916.5134353637695, "epoch": 0.36533333333333334, "grad_norm": 0.6370754837989807, "kl": 0.3028564453125, "learning_rate": 1.6035094858715065e-05, "loss": -0.0375, "reward": 0.7421875298023224, "reward_std": 0.3760041519999504, "rewards/accuracy_reward": 0.2968750149011612, "rewards/format_reward": 0.006696428870782256, "rewards/tag_count_reward": 0.4386160969734192, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 890.5982513427734, "epoch": 0.368, "grad_norm": 0.6831709742546082, "kl": 0.2908935546875, "learning_rate": 1.5960502329848683e-05, "loss": -0.0319, "reward": 0.8917411044239998, "reward_std": 0.4364234544336796, "rewards/accuracy_reward": 0.39508930407464504, "rewards/format_reward": 0.024553572293370962, "rewards/tag_count_reward": 0.4720982387661934, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 804.1205749511719, "epoch": 0.37066666666666664, "grad_norm": 2.1319992542266846, "kl": 0.320068359375, "learning_rate": 1.588539181302786e-05, "loss": 0.0253, "reward": 0.7901785969734192, "reward_std": 0.4901719093322754, "rewards/accuracy_reward": 0.24776787287555635, "rewards/format_reward": 0.04017857415601611, "rewards/tag_count_reward": 0.5022321678698063, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 786.3839645385742, "epoch": 0.37333333333333335, "grad_norm": 2.234715700149536, "kl": 0.26708984375, "learning_rate": 1.580976983561235e-05, "loss": -0.0489, "reward": 0.8493303880095482, "reward_std": 0.43167896941304207, "rewards/accuracy_reward": 0.3013392947614193, "rewards/format_reward": 0.008928571827709675, "rewards/tag_count_reward": 0.5390625074505806, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 816.1518325805664, "epoch": 0.376, "grad_norm": 2.293884038925171, "kl": 0.3765869140625, "learning_rate": 1.5733642969409553e-05, "loss": -0.0492, "reward": 0.7857143208384514, "reward_std": 0.39178847149014473, "rewards/accuracy_reward": 0.26116072526201606, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.522321455180645, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 867.8973693847656, "epoch": 0.37866666666666665, "grad_norm": 2.1033823490142822, "kl": 0.519775390625, "learning_rate": 1.5657017830103448e-05, "loss": -0.0506, "reward": 0.7667411044239998, "reward_std": 0.34294718876481056, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625223517418, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 819.8661117553711, "epoch": 0.38133333333333336, "grad_norm": 5.280317306518555, "kl": 0.734130859375, "learning_rate": 1.5579901076679625e-05, "loss": -0.0823, "reward": 0.89620541036129, "reward_std": 0.4438219405710697, "rewards/accuracy_reward": 0.39955358766019344, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4966518133878708, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 853.4085235595703, "epoch": 0.384, "grad_norm": 6.651998519897461, "kl": 1.65087890625, "learning_rate": 1.5502299410846626e-05, "loss": -0.071, "reward": 0.7762277126312256, "reward_std": 0.3538948893547058, "rewards/accuracy_reward": 0.22098215576261282, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5552455745637417, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 825.1585235595703, "epoch": 0.38666666666666666, "grad_norm": 45.774906158447266, "kl": 7.41796875, "learning_rate": 1.5424219576453526e-05, "loss": 0.0825, "reward": 0.7812500298023224, "reward_std": 0.35702329128980637, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5401785969734192, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 843.4263687133789, "epoch": 0.3893333333333333, "grad_norm": 10.040907859802246, "kl": 3.3037109375, "learning_rate": 1.5345668358903886e-05, "loss": -0.0851, "reward": 0.786272369325161, "reward_std": 0.3718103840947151, "rewards/accuracy_reward": 0.2120535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5742187723517418, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 788.7656555175781, "epoch": 0.392, "grad_norm": 3.9412412643432617, "kl": 1.4208984375, "learning_rate": 1.5266652584566056e-05, "loss": -0.1977, "reward": 0.7784598544239998, "reward_std": 0.39768509939312935, "rewards/accuracy_reward": 0.23883929662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5396205633878708, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 778.2053909301758, "epoch": 0.39466666666666667, "grad_norm": 2.3464877605438232, "kl": 1.646484375, "learning_rate": 1.5187179120179969e-05, "loss": -0.2273, "reward": 0.7354910895228386, "reward_std": 0.42870184034109116, "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.545758955180645, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 763.8794937133789, "epoch": 0.3973333333333333, "grad_norm": 1.7182663679122925, "kl": 1.0068359375, "learning_rate": 1.5107254872260366e-05, "loss": -0.2177, "reward": 0.8147321864962578, "reward_std": 0.4319792427122593, "rewards/accuracy_reward": 0.28571429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5290178842842579, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 848.1295013427734, "epoch": 0.4, "grad_norm": 1.2562867403030396, "kl": 0.520263671875, "learning_rate": 1.5026886786496624e-05, "loss": -0.1592, "reward": 0.9051339626312256, "reward_std": 0.38791827112436295, "rewards/accuracy_reward": 0.3348214477300644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125223517418, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 835.6518173217773, "epoch": 0.4026666666666667, "grad_norm": 0.8011857867240906, "kl": 0.93359375, "learning_rate": 1.4946081847149134e-05, "loss": -0.171, "reward": 0.926339328289032, "reward_std": 0.43096619471907616, "rewards/accuracy_reward": 0.37276787124574184, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714626312256, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 866.0201263427734, "epoch": 0.4053333333333333, "grad_norm": 2.4044995307922363, "kl": 2.04150390625, "learning_rate": 1.4864847076442358e-05, "loss": -0.1295, "reward": 0.9380580633878708, "reward_std": 0.3936535269021988, "rewards/accuracy_reward": 0.35714287497103214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580915205180645, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 850.8460159301758, "epoch": 0.408, "grad_norm": 3.49764084815979, "kl": 2.35498046875, "learning_rate": 1.4783189533954555e-05, "loss": -0.1393, "reward": 0.8465402200818062, "reward_std": 0.36915882118046284, "rewards/accuracy_reward": 0.2656250116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580915205180645, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 844.9888763427734, "epoch": 0.4106666666666667, "grad_norm": 0.9199227690696716, "kl": 0.605712890625, "learning_rate": 1.4701116316004307e-05, "loss": -0.1487, "reward": 0.847656287252903, "reward_std": 0.41640398278832436, "rewards/accuracy_reward": 0.25446430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5931919738650322, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 894.4620895385742, "epoch": 0.41333333333333333, "grad_norm": 0.9275841116905212, "kl": 0.5400390625, "learning_rate": 1.46186345550338e-05, "loss": -0.1346, "reward": 0.851562537252903, "reward_std": 0.40566281601786613, "rewards/accuracy_reward": 0.24330358393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.608258955180645, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 778.1741333007812, "epoch": 0.416, "grad_norm": 1.4222042560577393, "kl": 1.9130859375, "learning_rate": 1.4535751418989e-05, "loss": -0.1921, "reward": 0.694196455180645, "reward_std": 0.39460423216223717, "rewards/accuracy_reward": 0.15401786495931447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5401785969734192, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 691.5089569091797, "epoch": 0.4186666666666667, "grad_norm": 2.3009650707244873, "kl": 2.8173828125, "learning_rate": 1.4452474110696738e-05, "loss": -0.1628, "reward": 0.5496652014553547, "reward_std": 0.3982691466808319, "rewards/accuracy_reward": 0.10491072060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4447544813156128, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 749.8437805175781, "epoch": 0.42133333333333334, "grad_norm": 1.4146252870559692, "kl": 2.4423828125, "learning_rate": 1.4368809867238754e-05, "loss": -0.139, "reward": 0.4933035932481289, "reward_std": 0.3439667113125324, "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.419642873108387, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 777.3147735595703, "epoch": 0.424, "grad_norm": 1.5294604301452637, "kl": 2.662109375, "learning_rate": 1.4284765959322772e-05, "loss": -0.0536, "reward": 0.4246651977300644, "reward_std": 0.28294636122882366, "rewards/accuracy_reward": 0.037946430733427405, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3867187649011612, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 840.5290603637695, "epoch": 0.4266666666666667, "grad_norm": 1.1588491201400757, "kl": 2.80078125, "learning_rate": 1.4200349690650654e-05, "loss": -0.0394, "reward": 0.4810268096625805, "reward_std": 0.33573491498827934, "rewards/accuracy_reward": 0.06919643329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4118303768336773, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 892.8437881469727, "epoch": 0.42933333333333334, "grad_norm": 1.1430574655532837, "kl": 2.9453125, "learning_rate": 1.411556839728367e-05, "loss": 0.0025, "reward": 0.5619419924914837, "reward_std": 0.360417190939188, "rewards/accuracy_reward": 0.0915178598370403, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4704241268336773, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 945.9308395385742, "epoch": 0.432, "grad_norm": 2.161261796951294, "kl": 3.875, "learning_rate": 1.4030429447004992e-05, "loss": 0.0688, "reward": 0.6428571678698063, "reward_std": 0.3830233383923769, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5223214514553547, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 992.6518249511719, "epoch": 0.43466666666666665, "grad_norm": 1.7473344802856445, "kl": 3.423828125, "learning_rate": 1.3944940238679384e-05, "loss": 0.0891, "reward": 0.6618303805589676, "reward_std": 0.2981398981064558, "rewards/accuracy_reward": 0.06250000395812094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5993303805589676, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 1013.8393096923828, "epoch": 0.43733333333333335, "grad_norm": 7.500110149383545, "kl": 6.62255859375, "learning_rate": 1.3859108201610236e-05, "loss": 0.254, "reward": 0.729352705180645, "reward_std": 0.35707739368081093, "rewards/accuracy_reward": 0.1294642894063145, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5998884215950966, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 1014.9397506713867, "epoch": 0.44, "grad_norm": 5.974610805511475, "kl": 4.771484375, "learning_rate": 1.3772940794893916e-05, "loss": 0.1753, "reward": 0.7025669887661934, "reward_std": 0.322284035384655, "rewards/accuracy_reward": 0.09821429080329835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6043526902794838, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 1019.9397506713867, "epoch": 0.44266666666666665, "grad_norm": 7.7188496589660645, "kl": 3.6796875, "learning_rate": 1.368644550677157e-05, "loss": 0.1398, "reward": 0.7522321790456772, "reward_std": 0.3287883847951889, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6339285969734192, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 1020.819206237793, "epoch": 0.44533333333333336, "grad_norm": 15.867236137390137, "kl": 8.28125, "learning_rate": 1.3599629853978342e-05, "loss": 0.3249, "reward": 0.7204241380095482, "reward_std": 0.32472028210759163, "rewards/accuracy_reward": 0.10491072037257254, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6155134215950966, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 1020.0424194335938, "epoch": 0.448, "grad_norm": 6.8363213539123535, "kl": 4.9951171875, "learning_rate": 1.3512501381090158e-05, "loss": 0.1957, "reward": 0.6914062798023224, "reward_std": 0.3072166871279478, "rewards/accuracy_reward": 0.0848214307334274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6065848544239998, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 1016.553596496582, "epoch": 0.45066666666666666, "grad_norm": 3.5362603664398193, "kl": 6.8984375, "learning_rate": 1.3425067659868084e-05, "loss": 0.2479, "reward": 0.6785714477300644, "reward_std": 0.3066155780106783, "rewards/accuracy_reward": 0.07142857532016933, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6071428805589676, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 1011.7210006713867, "epoch": 0.4533333333333333, "grad_norm": 10.573477745056152, "kl": 3.390625, "learning_rate": 1.3337336288600297e-05, "loss": 0.1204, "reward": 0.7215402126312256, "reward_std": 0.33272249065339565, "rewards/accuracy_reward": 0.11160714761354029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6099330708384514, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 1019.7209930419922, "epoch": 0.456, "grad_norm": 8.912219047546387, "kl": 6.275390625, "learning_rate": 1.324931489144178e-05, "loss": 0.248, "reward": 0.816406287252903, "reward_std": 0.3113563619554043, "rewards/accuracy_reward": 0.16294643562287092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6534598544239998, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 1019.9620819091797, "epoch": 0.45866666666666667, "grad_norm": 5.703995704650879, "kl": 3.966796875, "learning_rate": 1.3161011117751756e-05, "loss": 0.1519, "reward": 0.7667411044239998, "reward_std": 0.31835605204105377, "rewards/accuracy_reward": 0.12723214970901608, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6395089477300644, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 1018.3928680419922, "epoch": 0.4613333333333333, "grad_norm": 8.292807579040527, "kl": 8.216796875, "learning_rate": 1.3072432641428931e-05, "loss": 0.1965, "reward": 0.7946428954601288, "reward_std": 0.3237072564661503, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651785969734192, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 1022.4241104125977, "epoch": 0.464, "grad_norm": 5.680503845214844, "kl": 2.45263671875, "learning_rate": 1.2983587160244602e-05, "loss": 0.0982, "reward": 0.7505580708384514, "reward_std": 0.24723401945084333, "rewards/accuracy_reward": 0.08035714528523386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6702009215950966, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 1023.0937576293945, "epoch": 0.4666666666666667, "grad_norm": 2.7311763763427734, "kl": 2.314453125, "learning_rate": 1.2894482395173695e-05, "loss": 0.0906, "reward": 0.8197545111179352, "reward_std": 0.3316057715564966, "rewards/accuracy_reward": 0.1450892926659435, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.674665205180645, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 1020.8437652587891, "epoch": 0.4693333333333333, "grad_norm": 7.489380836486816, "kl": 3.44140625, "learning_rate": 1.2805126089723798e-05, "loss": 0.1294, "reward": 0.8493303954601288, "reward_std": 0.35581264086067677, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6752232536673546, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.472, "grad_norm": 2.636575937271118, "kl": 2.30322265625, "learning_rate": 1.2715526009262209e-05, "loss": 0.0922, "reward": 0.7790178805589676, "reward_std": 0.2882107999175787, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750298023224, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 1020.0401840209961, "epoch": 0.4746666666666667, "grad_norm": 4.1205291748046875, "kl": 0.9132080078125, "learning_rate": 1.2625689940341102e-05, "loss": 0.0322, "reward": 0.8643973618745804, "reward_std": 0.3784267157316208, "rewards/accuracy_reward": 0.20535715203732252, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.659040205180645, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 1022.0803756713867, "epoch": 0.47733333333333333, "grad_norm": 4.4023823738098145, "kl": 0.935791015625, "learning_rate": 1.2535625690020861e-05, "loss": 0.0334, "reward": 0.8593750521540642, "reward_std": 0.3589657451957464, "rewards/accuracy_reward": 0.20982143888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6495535969734192, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.48, "grad_norm": 2.203847885131836, "kl": 1.26123046875, "learning_rate": 1.24453410851916e-05, "loss": 0.0504, "reward": 0.825334869325161, "reward_std": 0.3454053979367018, "rewards/accuracy_reward": 0.17857143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6467634066939354, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4826666666666667, "grad_norm": 0.9205332398414612, "kl": 0.9808349609375, "learning_rate": 1.2354843971892998e-05, "loss": 0.0393, "reward": 0.8794643357396126, "reward_std": 0.3468211852014065, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6651786118745804, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.48533333333333334, "grad_norm": 1.2223032712936401, "kl": 0.931640625, "learning_rate": 1.2264142214632441e-05, "loss": 0.0373, "reward": 0.8850446715950966, "reward_std": 0.36373014003038406, "rewards/accuracy_reward": 0.2142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6707589626312256, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.488, "grad_norm": 0.24835845828056335, "kl": 0.541015625, "learning_rate": 1.2173243695701575e-05, "loss": 0.0217, "reward": 0.981026828289032, "reward_std": 0.37699259258806705, "rewards/accuracy_reward": 0.2924107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6886161044239998, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 1021.8236694335938, "epoch": 0.49066666666666664, "grad_norm": 7.40866756439209, "kl": 0.44091796875, "learning_rate": 1.2082156314491298e-05, "loss": 0.0111, "reward": 0.8722098544239998, "reward_std": 0.2936495263129473, "rewards/accuracy_reward": 0.17857143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6936384290456772, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 1022.1406326293945, "epoch": 0.49333333333333335, "grad_norm": 0.5567572712898254, "kl": 0.3126220703125, "learning_rate": 1.1990887986805295e-05, "loss": 0.0092, "reward": 0.9202009439468384, "reward_std": 0.38105889968574047, "rewards/accuracy_reward": 0.22991072572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6902902126312256, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 1021.8236694335938, "epoch": 0.496, "grad_norm": 0.5544495582580566, "kl": 0.29736328125, "learning_rate": 1.1899446644172106e-05, "loss": 0.0076, "reward": 0.9854911267757416, "reward_std": 0.37811920419335365, "rewards/accuracy_reward": 0.290178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125298023224, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.49866666666666665, "grad_norm": 0.2723735570907593, "kl": 0.351318359375, "learning_rate": 1.1807840233155863e-05, "loss": 0.0141, "reward": 0.9441964626312256, "reward_std": 0.40811292454600334, "rewards/accuracy_reward": 0.29017858766019344, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6540178880095482, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 1021.8482208251953, "epoch": 0.5013333333333333, "grad_norm": 0.5237597823143005, "kl": 0.347900390625, "learning_rate": 1.1716076714665701e-05, "loss": 0.0104, "reward": 0.832031287252903, "reward_std": 0.3854901008307934, "rewards/accuracy_reward": 0.20312500838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6289062723517418, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.504, "grad_norm": 0.7326854467391968, "kl": 0.361572265625, "learning_rate": 1.1624164063263931e-05, "loss": 0.0145, "reward": 0.8487723544239998, "reward_std": 0.40539926290512085, "rewards/accuracy_reward": 0.23437501792795956, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6143973395228386, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 1021.8415222167969, "epoch": 0.5066666666666667, "grad_norm": 0.1683388352394104, "kl": 0.231689453125, "learning_rate": 1.1532110266473026e-05, "loss": 0.0062, "reward": 0.9581473767757416, "reward_std": 0.41959446854889393, "rewards/accuracy_reward": 0.28571429708972573, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6724330633878708, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5093333333333333, "grad_norm": 0.4431297481060028, "kl": 0.2105712890625, "learning_rate": 1.1439923324081465e-05, "loss": 0.0084, "reward": 0.9871652275323868, "reward_std": 0.39653555303812027, "rewards/accuracy_reward": 0.3080357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.679129496216774, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.512, "grad_norm": 0.293804407119751, "kl": 0.230224609375, "learning_rate": 1.1347611247448544e-05, "loss": 0.0092, "reward": 0.9916295036673546, "reward_std": 0.41540490463376045, "rewards/accuracy_reward": 0.34821430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.643415205180645, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5146666666666667, "grad_norm": 0.5157522559165955, "kl": 0.388671875, "learning_rate": 1.1255182058808143e-05, "loss": 0.0155, "reward": 0.8850446864962578, "reward_std": 0.3409024402499199, "rewards/accuracy_reward": 0.28125001676380634, "rewards/format_reward": 0.011160714784637094, "rewards/tag_count_reward": 0.592633955180645, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5173333333333333, "grad_norm": 2.269129514694214, "kl": 0.740234375, "learning_rate": 1.1162643790571574e-05, "loss": 0.0296, "reward": 0.934709869325161, "reward_std": 0.5415398068726063, "rewards/accuracy_reward": 0.22321429615840316, "rewards/format_reward": 0.15178571827709675, "rewards/tag_count_reward": 0.5597098469734192, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 1022.5959854125977, "epoch": 0.52, "grad_norm": 2.012619972229004, "kl": 0.505859375, "learning_rate": 1.1070004484629543e-05, "loss": 0.0196, "reward": 0.9972098618745804, "reward_std": 0.5713630616664886, "rewards/accuracy_reward": 0.19196429569274187, "rewards/format_reward": 0.18080358020961285, "rewards/tag_count_reward": 0.624441996216774, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 1022.0870590209961, "epoch": 0.5226666666666666, "grad_norm": 2.0236079692840576, "kl": 0.2779541015625, "learning_rate": 1.0977272191653272e-05, "loss": 0.0071, "reward": 1.0167411044239998, "reward_std": 0.4556749537587166, "rewards/accuracy_reward": 0.26339287124574184, "rewards/format_reward": 0.07366071781143546, "rewards/tag_count_reward": 0.6796875298023224, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5253333333333333, "grad_norm": 2.6833431720733643, "kl": 0.3123779296875, "learning_rate": 1.0884454970394871e-05, "loss": 0.0125, "reward": 1.047433078289032, "reward_std": 0.45435722172260284, "rewards/accuracy_reward": 0.2901785857975483, "rewards/format_reward": 0.06026785960420966, "rewards/tag_count_reward": 0.6969866380095482, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 1015.2656478881836, "epoch": 0.528, "grad_norm": 1.6515158414840698, "kl": 0.37060546875, "learning_rate": 1.0791560886987016e-05, "loss": -0.0019, "reward": 1.0569196939468384, "reward_std": 0.4782850816845894, "rewards/accuracy_reward": 0.26116072689183056, "rewards/format_reward": 0.06919643143191934, "rewards/tag_count_reward": 0.7265625223517418, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5306666666666666, "grad_norm": 0.6246099472045898, "kl": 0.23095703125, "learning_rate": 1.069859801424196e-05, "loss": 0.0092, "reward": 1.0781250596046448, "reward_std": 0.46683337539434433, "rewards/accuracy_reward": 0.24553573061712086, "rewards/format_reward": 0.07812500419095159, "rewards/tag_count_reward": 0.7544643133878708, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5333333333333333, "grad_norm": 0.20519134402275085, "kl": 0.2835693359375, "learning_rate": 1.0605574430949983e-05, "loss": 0.0113, "reward": 1.1729911118745804, "reward_std": 0.5779759176075459, "rewards/accuracy_reward": 0.27678572945296764, "rewards/format_reward": 0.13392857555299997, "rewards/tag_count_reward": 0.7622768208384514, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.536, "grad_norm": 0.5384864807128906, "kl": 0.3441162109375, "learning_rate": 1.0512498221177319e-05, "loss": 0.0138, "reward": 1.323102742433548, "reward_std": 0.7179402336478233, "rewards/accuracy_reward": 0.2812500149011612, "rewards/format_reward": 0.2656250111758709, "rewards/tag_count_reward": 0.776227705180645, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 1021.7790222167969, "epoch": 0.5386666666666666, "grad_norm": 0.9296804070472717, "kl": 0.4293212890625, "learning_rate": 1.0419377473563621e-05, "loss": 0.0124, "reward": 1.3621652275323868, "reward_std": 0.7298394441604614, "rewards/accuracy_reward": 0.20982143934816122, "rewards/format_reward": 0.3727678768336773, "rewards/tag_count_reward": 0.7795759290456772, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5413333333333333, "grad_norm": 1.218420147895813, "kl": 0.484619140625, "learning_rate": 1.0326220280619036e-05, "loss": 0.0194, "reward": 1.6858259737491608, "reward_std": 0.8232333958148956, "rewards/accuracy_reward": 0.38392858393490314, "rewards/format_reward": 0.5379464626312256, "rewards/tag_count_reward": 0.7639509364962578, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.544, "grad_norm": 0.30742985010147095, "kl": 0.3109130859375, "learning_rate": 1.0233034738020933e-05, "loss": 0.0124, "reward": 1.7661831378936768, "reward_std": 0.720308743417263, "rewards/accuracy_reward": 0.2700892984867096, "rewards/format_reward": 0.714285746216774, "rewards/tag_count_reward": 0.7818080708384514, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5466666666666666, "grad_norm": 0.2768275737762451, "kl": 0.3094482421875, "learning_rate": 1.0139828943910358e-05, "loss": 0.0124, "reward": 1.6763393431901932, "reward_std": 0.7501099109649658, "rewards/accuracy_reward": 0.26116073061712086, "rewards/format_reward": 0.698660746216774, "rewards/tag_count_reward": 0.7165178880095482, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5493333333333333, "grad_norm": 0.7091500759124756, "kl": 0.293701171875, "learning_rate": 1.004661099818829e-05, "loss": 0.0118, "reward": 1.9626117050647736, "reward_std": 0.7170767486095428, "rewards/accuracy_reward": 0.4062500111758709, "rewards/format_reward": 0.8303571864962578, "rewards/tag_count_reward": 0.7260045036673546, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.552, "grad_norm": 1.4856609106063843, "kl": 0.3824462890625, "learning_rate": 9.953389001811716e-06, "loss": 0.0153, "reward": 1.7338170409202576, "reward_std": 0.6667964346706867, "rewards/accuracy_reward": 0.2924107313156128, "rewards/format_reward": 0.785714328289032, "rewards/tag_count_reward": 0.655691996216774, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5546666666666666, "grad_norm": 0.4151708483695984, "kl": 0.361083984375, "learning_rate": 9.860171056089646e-06, "loss": 0.0145, "reward": 1.8677456378936768, "reward_std": 0.6513328105211258, "rewards/accuracy_reward": 0.3906250223517418, "rewards/format_reward": 0.828125037252903, "rewards/tag_count_reward": 0.6489955559372902, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 1022.0312576293945, "epoch": 0.5573333333333333, "grad_norm": 0.8962245583534241, "kl": 0.468994140625, "learning_rate": 9.766965261979072e-06, "loss": 0.0162, "reward": 1.7784598916769028, "reward_std": 0.6881442964076996, "rewards/accuracy_reward": 0.28125000838190317, "rewards/format_reward": 0.7946429029107094, "rewards/tag_count_reward": 0.702566996216774, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 1020.3080520629883, "epoch": 0.56, "grad_norm": 0.398674875497818, "kl": 0.547607421875, "learning_rate": 9.673779719380967e-06, "loss": 0.0146, "reward": 1.7405134588479996, "reward_std": 0.7453675791621208, "rewards/accuracy_reward": 0.23660715529695153, "rewards/format_reward": 0.7566964626312256, "rewards/tag_count_reward": 0.7472098544239998, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 1019.8259048461914, "epoch": 0.5626666666666666, "grad_norm": 0.8340033292770386, "kl": 0.69189453125, "learning_rate": 9.580622526436382e-06, "loss": 0.0191, "reward": 1.5396205931901932, "reward_std": 0.7650129646062851, "rewards/accuracy_reward": 0.18973215157166123, "rewards/format_reward": 0.5781250223517418, "rewards/tag_count_reward": 0.7717634290456772, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5653333333333334, "grad_norm": 1.366788387298584, "kl": 0.626708984375, "learning_rate": 9.487501778822685e-06, "loss": 0.0251, "reward": 1.494977742433548, "reward_std": 0.7519652545452118, "rewards/accuracy_reward": 0.13169643515720963, "rewards/format_reward": 0.5736607387661934, "rewards/tag_count_reward": 0.7896205708384514, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.568, "grad_norm": 0.7474933862686157, "kl": 0.3994140625, "learning_rate": 9.394425569050018e-06, "loss": 0.016, "reward": 1.5239956080913544, "reward_std": 0.747960276901722, "rewards/accuracy_reward": 0.11830357694998384, "rewards/format_reward": 0.589285746216774, "rewards/tag_count_reward": 0.816406287252903, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5706666666666667, "grad_norm": 0.7150735259056091, "kl": 0.3843994140625, "learning_rate": 9.30140198575804e-06, "loss": 0.0154, "reward": 1.7003348916769028, "reward_std": 0.7763254791498184, "rewards/accuracy_reward": 0.2366071545984596, "rewards/format_reward": 0.6651786118745804, "rewards/tag_count_reward": 0.7985491454601288, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5733333333333334, "grad_norm": 0.3341580629348755, "kl": 0.252685546875, "learning_rate": 9.208439113012984e-06, "loss": 0.0101, "reward": 1.6869420260190964, "reward_std": 0.6856246665120125, "rewards/accuracy_reward": 0.15401786495931447, "rewards/format_reward": 0.7209821790456772, "rewards/tag_count_reward": 0.8119420036673546, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 1021.8214340209961, "epoch": 0.576, "grad_norm": 0.24437068402767181, "kl": 0.2630615234375, "learning_rate": 9.115545029605129e-06, "loss": 0.0061, "reward": 1.7594867050647736, "reward_std": 0.6430581621825695, "rewards/accuracy_reward": 0.16294643678702414, "rewards/format_reward": 0.781250037252903, "rewards/tag_count_reward": 0.8152902200818062, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5786666666666667, "grad_norm": 0.2341134250164032, "kl": 0.2742919921875, "learning_rate": 9.022727808346731e-06, "loss": 0.011, "reward": 1.7907367050647736, "reward_std": 0.6435003951191902, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.812500037252903, "rewards/tag_count_reward": 0.7661830708384514, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 1023.8683090209961, "epoch": 0.5813333333333334, "grad_norm": 0.4210372269153595, "kl": 0.2279052734375, "learning_rate": 8.92999551537046e-06, "loss": 0.0089, "reward": 1.7952009737491608, "reward_std": 0.6400342211127281, "rewards/accuracy_reward": 0.20982144260779023, "rewards/format_reward": 0.837053619325161, "rewards/tag_count_reward": 0.7483259364962578, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.584, "grad_norm": 0.24627907574176788, "kl": 0.1993408203125, "learning_rate": 8.837356209428428e-06, "loss": 0.008, "reward": 1.8950893729925156, "reward_std": 0.5851474218070507, "rewards/accuracy_reward": 0.23214286752045155, "rewards/format_reward": 0.8928571864962578, "rewards/tag_count_reward": 0.770089328289032, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5866666666666667, "grad_norm": 0.20180067420005798, "kl": 0.2174072265625, "learning_rate": 8.744817941191862e-06, "loss": 0.0087, "reward": 2.039620652794838, "reward_std": 0.630747739225626, "rewards/accuracy_reward": 0.3214285895228386, "rewards/format_reward": 0.9062500447034836, "rewards/tag_count_reward": 0.811941996216774, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5893333333333334, "grad_norm": 0.5031952261924744, "kl": 0.2564697265625, "learning_rate": 8.652388752551458e-06, "loss": 0.0103, "reward": 1.864397406578064, "reward_std": 0.6196173951029778, "rewards/accuracy_reward": 0.25223215483129025, "rewards/format_reward": 0.8482143208384514, "rewards/tag_count_reward": 0.7639509364962578, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.592, "grad_norm": 0.3535725176334381, "kl": 0.2275390625, "learning_rate": 8.560076675918537e-06, "loss": 0.0091, "reward": 2.0172992199659348, "reward_std": 0.5813482627272606, "rewards/accuracy_reward": 0.27232144260779023, "rewards/format_reward": 0.899553619325161, "rewards/tag_count_reward": 0.8454241454601288, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5946666666666667, "grad_norm": 0.3289671838283539, "kl": 0.258056640625, "learning_rate": 8.467889733526977e-06, "loss": 0.0103, "reward": 1.983258992433548, "reward_std": 0.6793729364871979, "rewards/accuracy_reward": 0.29241072945296764, "rewards/format_reward": 0.8459821864962578, "rewards/tag_count_reward": 0.844866119325161, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5973333333333334, "grad_norm": 1.2749642133712769, "kl": 0.409912109375, "learning_rate": 8.375835936736072e-06, "loss": 0.0164, "reward": 1.8945313394069672, "reward_std": 0.7175267487764359, "rewards/accuracy_reward": 0.29017858393490314, "rewards/format_reward": 0.7678571715950966, "rewards/tag_count_reward": 0.8364955708384514, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 1021.7678604125977, "epoch": 0.6, "grad_norm": 3.180739641189575, "kl": 0.831787109375, "learning_rate": 8.283923285334304e-06, "loss": 0.0279, "reward": 1.8989956080913544, "reward_std": 0.679141990840435, "rewards/accuracy_reward": 0.2834821492433548, "rewards/format_reward": 0.7433036044239998, "rewards/tag_count_reward": 0.8722098618745804, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6026666666666667, "grad_norm": 5.543436527252197, "kl": 0.572998046875, "learning_rate": 8.19215976684414e-06, "loss": 0.0229, "reward": 1.9581474363803864, "reward_std": 0.7154616340994835, "rewards/accuracy_reward": 0.32142858812585473, "rewards/format_reward": 0.750000037252903, "rewards/tag_count_reward": 0.8867187798023224, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 1021.7366104125977, "epoch": 0.6053333333333333, "grad_norm": 6.479448318481445, "kl": 0.625, "learning_rate": 8.100553355827897e-06, "loss": 0.019, "reward": 1.7622768431901932, "reward_std": 0.6781650111079216, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.6651786044239998, "rewards/tag_count_reward": 0.878348246216774, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.608, "grad_norm": 3.77752685546875, "kl": 0.58154296875, "learning_rate": 8.009112013194707e-06, "loss": 0.0233, "reward": 1.8616072088479996, "reward_std": 0.7435284927487373, "rewards/accuracy_reward": 0.30133930407464504, "rewards/format_reward": 0.6741071715950966, "rewards/tag_count_reward": 0.8861607536673546, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6106666666666667, "grad_norm": 0.44458553194999695, "kl": 0.579833984375, "learning_rate": 7.917843685508702e-06, "loss": 0.0232, "reward": 1.856584906578064, "reward_std": 0.7319114580750465, "rewards/accuracy_reward": 0.33482143841683865, "rewards/format_reward": 0.651785746216774, "rewards/tag_count_reward": 0.8699777126312256, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6133333333333333, "grad_norm": 0.944040060043335, "kl": 0.654296875, "learning_rate": 7.826756304298428e-06, "loss": 0.0262, "reward": 1.7912947088479996, "reward_std": 0.7824961915612221, "rewards/accuracy_reward": 0.25223215483129025, "rewards/format_reward": 0.683035746216774, "rewards/tag_count_reward": 0.856026828289032, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.616, "grad_norm": 0.4724942743778229, "kl": 0.47802734375, "learning_rate": 7.73585778536756e-06, "loss": 0.0191, "reward": 1.8510045558214188, "reward_std": 0.6905161440372467, "rewards/accuracy_reward": 0.21651786752045155, "rewards/format_reward": 0.7633928880095482, "rewards/tag_count_reward": 0.8710937947034836, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6186666666666667, "grad_norm": 0.44556644558906555, "kl": 0.3642578125, "learning_rate": 7.645156028107005e-06, "loss": 0.0146, "reward": 2.014508992433548, "reward_std": 0.6820996776223183, "rewards/accuracy_reward": 0.305803582072258, "rewards/format_reward": 0.816964328289032, "rewards/tag_count_reward": 0.8917411044239998, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 1021.8459854125977, "epoch": 0.6213333333333333, "grad_norm": 0.5146077275276184, "kl": 0.254638671875, "learning_rate": 7.554658914808404e-06, "loss": 0.0068, "reward": 2.0652903020381927, "reward_std": 0.613891314715147, "rewards/accuracy_reward": 0.28571429941803217, "rewards/format_reward": 0.883928619325161, "rewards/tag_count_reward": 0.895647369325161, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.624, "grad_norm": 0.5768357515335083, "kl": 0.338623046875, "learning_rate": 7.464374309979143e-06, "loss": 0.0135, "reward": 1.9782366752624512, "reward_std": 0.6160991229116917, "rewards/accuracy_reward": 0.24553572665899992, "rewards/format_reward": 0.8816964700818062, "rewards/tag_count_reward": 0.8510045111179352, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6266666666666667, "grad_norm": 2.275315284729004, "kl": 0.61083984375, "learning_rate": 7.3743100596589e-06, "loss": 0.0244, "reward": 1.903459906578064, "reward_std": 0.6063342429697514, "rewards/accuracy_reward": 0.22544643888249993, "rewards/format_reward": 0.883928619325161, "rewards/tag_count_reward": 0.7940848544239998, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6293333333333333, "grad_norm": 1.4082753658294678, "kl": 0.58154296875, "learning_rate": 7.284473990737795e-06, "loss": 0.0233, "reward": 1.9514510035514832, "reward_std": 0.5934183858335018, "rewards/accuracy_reward": 0.28348215972073376, "rewards/format_reward": 0.8906250298023224, "rewards/tag_count_reward": 0.777343787252903, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 1021.7500076293945, "epoch": 0.632, "grad_norm": 0.34938082098960876, "kl": 1.23388671875, "learning_rate": 7.194873910276205e-06, "loss": 0.0051, "reward": 2.1512277871370316, "reward_std": 0.5031805112957954, "rewards/accuracy_reward": 0.38169644703157246, "rewards/format_reward": 0.9397321864962578, "rewards/tag_count_reward": 0.8297991529107094, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 1021.8906402587891, "epoch": 0.6346666666666667, "grad_norm": 0.22738490998744965, "kl": 0.2371826171875, "learning_rate": 7.1055176048263085e-06, "loss": 0.0062, "reward": 2.0831474363803864, "reward_std": 0.47590644657611847, "rewards/accuracy_reward": 0.28794643841683865, "rewards/format_reward": 0.9241071715950966, "rewards/tag_count_reward": 0.871093787252903, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 1021.8191986083984, "epoch": 0.6373333333333333, "grad_norm": 0.2740417718887329, "kl": 0.2435302734375, "learning_rate": 7.0164128397554e-06, "loss": 0.0016, "reward": 2.166294753551483, "reward_std": 0.5260685943067074, "rewards/accuracy_reward": 0.3303571566939354, "rewards/format_reward": 0.9419643208384514, "rewards/tag_count_reward": 0.8939732536673546, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 1023.0602722167969, "epoch": 0.64, "grad_norm": 0.8618957996368408, "kl": 0.253662109375, "learning_rate": 6.92756735857107e-06, "loss": 0.0078, "reward": 2.203125089406967, "reward_std": 0.4360897056758404, "rewards/accuracy_reward": 0.325892873108387, "rewards/format_reward": 0.9486607611179352, "rewards/tag_count_reward": 0.9285714700818062, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6426666666666667, "grad_norm": 0.2517678141593933, "kl": 0.248779296875, "learning_rate": 6.838988882248243e-06, "loss": 0.01, "reward": 2.1992188692092896, "reward_std": 0.5083508864045143, "rewards/accuracy_reward": 0.34821430454030633, "rewards/format_reward": 0.9352678954601288, "rewards/tag_count_reward": 0.9157366380095482, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6453333333333333, "grad_norm": 0.24498316645622253, "kl": 0.2371826171875, "learning_rate": 6.750685108558221e-06, "loss": 0.0095, "reward": 2.289062589406967, "reward_std": 0.5808268934488297, "rewards/accuracy_reward": 0.408482164144516, "rewards/format_reward": 0.9397321790456772, "rewards/tag_count_reward": 0.9408482685685158, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.648, "grad_norm": 0.20510031282901764, "kl": 0.261962890625, "learning_rate": 6.662663711399705e-06, "loss": 0.0105, "reward": 2.2154018729925156, "reward_std": 0.5156445652246475, "rewards/accuracy_reward": 0.34375001955777407, "rewards/format_reward": 0.9263393208384514, "rewards/tag_count_reward": 0.945312537252903, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6506666666666666, "grad_norm": 0.18001040816307068, "kl": 0.2032470703125, "learning_rate": 6.574932340131917e-06, "loss": 0.0081, "reward": 2.252232253551483, "reward_std": 0.5333615131676197, "rewards/accuracy_reward": 0.39062501676380634, "rewards/format_reward": 0.933035746216774, "rewards/tag_count_reward": 0.9285714775323868, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6533333333333333, "grad_norm": 0.19406573474407196, "kl": 0.225341796875, "learning_rate": 6.487498618909845e-06, "loss": 0.009, "reward": 2.214843824505806, "reward_std": 0.4596148282289505, "rewards/accuracy_reward": 0.34821430314332247, "rewards/format_reward": 0.9308036044239998, "rewards/tag_count_reward": 0.9358259290456772, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.656, "grad_norm": 0.35295507311820984, "kl": 0.2305908203125, "learning_rate": 6.400370146021662e-06, "loss": 0.0092, "reward": 2.094866171479225, "reward_std": 0.5512706525623798, "rewards/accuracy_reward": 0.2790178656578064, "rewards/format_reward": 0.9017857536673546, "rewards/tag_count_reward": 0.9140625447034836, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6586666666666666, "grad_norm": 0.32935085892677307, "kl": 0.250732421875, "learning_rate": 6.313554493228431e-06, "loss": 0.01, "reward": 2.193638488650322, "reward_std": 0.5914541855454445, "rewards/accuracy_reward": 0.41071429569274187, "rewards/format_reward": 0.8928571790456772, "rewards/tag_count_reward": 0.8900670111179352, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 1021.8125076293945, "epoch": 0.6613333333333333, "grad_norm": 0.3850191533565521, "kl": 0.268310546875, "learning_rate": 6.227059205106085e-06, "loss": 0.0048, "reward": 2.1065849363803864, "reward_std": 0.5955651290714741, "rewards/accuracy_reward": 0.32142858393490314, "rewards/format_reward": 0.8973214775323868, "rewards/tag_count_reward": 0.8878348618745804, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 1023.8950958251953, "epoch": 0.664, "grad_norm": 1.332704782485962, "kl": 0.3875732421875, "learning_rate": 6.14089179838977e-06, "loss": 0.015, "reward": 2.0273438841104507, "reward_std": 0.6821491718292236, "rewards/accuracy_reward": 0.28571429778821766, "rewards/format_reward": 0.8549107685685158, "rewards/tag_count_reward": 0.8867188021540642, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6666666666666666, "grad_norm": 0.27303817868232727, "kl": 0.241943359375, "learning_rate": 6.0550597613206205e-06, "loss": 0.0097, "reward": 2.0262277722358704, "reward_std": 0.5583557672798634, "rewards/accuracy_reward": 0.2321428656578064, "rewards/format_reward": 0.881696455180645, "rewards/tag_count_reward": 0.912388451397419, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6693333333333333, "grad_norm": 0.24417054653167725, "kl": 0.2457275390625, "learning_rate": 5.969570552995014e-06, "loss": 0.0098, "reward": 2.052455425262451, "reward_std": 0.6332450993359089, "rewards/accuracy_reward": 0.29241072619333863, "rewards/format_reward": 0.8482143357396126, "rewards/tag_count_reward": 0.91183041036129, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 1021.8504486083984, "epoch": 0.672, "grad_norm": 0.33021074533462524, "kl": 0.2567138671875, "learning_rate": 5.8844316027163315e-06, "loss": 0.0026, "reward": 2.1372769325971603, "reward_std": 0.6047781556844711, "rewards/accuracy_reward": 0.3236607313156128, "rewards/format_reward": 0.8772321864962578, "rewards/tag_count_reward": 0.9363839700818062, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6746666666666666, "grad_norm": 0.26396986842155457, "kl": 0.1905517578125, "learning_rate": 5.799650309349348e-06, "loss": 0.0077, "reward": 2.209263503551483, "reward_std": 0.5806162096560001, "rewards/accuracy_reward": 0.3794643022119999, "rewards/format_reward": 0.883928619325161, "rewards/tag_count_reward": 0.9458705857396126, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6773333333333333, "grad_norm": 0.20837701857089996, "kl": 0.2464599609375, "learning_rate": 5.715234040677229e-06, "loss": 0.0099, "reward": 2.099888503551483, "reward_std": 0.6910686045885086, "rewards/accuracy_reward": 0.34375001303851604, "rewards/format_reward": 0.8459821864962578, "rewards/tag_count_reward": 0.910156287252903, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.68, "grad_norm": 0.2536359429359436, "kl": 0.19482421875, "learning_rate": 5.631190132761247e-06, "loss": 0.0078, "reward": 2.228794753551483, "reward_std": 0.4858120158314705, "rewards/accuracy_reward": 0.3504464402794838, "rewards/format_reward": 0.9151786118745804, "rewards/tag_count_reward": 0.9631696864962578, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6826666666666666, "grad_norm": 0.2243085354566574, "kl": 0.208740234375, "learning_rate": 5.547525889303265e-06, "loss": 0.0083, "reward": 2.0228795558214188, "reward_std": 0.5694718845188618, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.8839286118745804, "rewards/tag_count_reward": 0.94698666036129, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6853333333333333, "grad_norm": 0.23991478979587555, "kl": 0.2109375, "learning_rate": 5.464248581011002e-06, "loss": 0.0084, "reward": 2.0742188543081284, "reward_std": 0.5562547482550144, "rewards/accuracy_reward": 0.2522321520373225, "rewards/format_reward": 0.8883928954601288, "rewards/tag_count_reward": 0.9335938021540642, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.688, "grad_norm": 0.24084815382957458, "kl": 0.1883544921875, "learning_rate": 5.381365444966205e-06, "loss": 0.0075, "reward": 2.194754585623741, "reward_std": 0.5408528298139572, "rewards/accuracy_reward": 0.32812501303851604, "rewards/format_reward": 0.9196428805589676, "rewards/tag_count_reward": 0.94698666036129, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6906666666666667, "grad_norm": 0.2627911865711212, "kl": 0.2288818359375, "learning_rate": 5.298883683995697e-06, "loss": 0.0092, "reward": 2.180803656578064, "reward_std": 0.5501891225576401, "rewards/accuracy_reward": 0.32142858300358057, "rewards/format_reward": 0.9151786118745804, "rewards/tag_count_reward": 0.9441964700818062, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 1021.9285736083984, "epoch": 0.6933333333333334, "grad_norm": 0.29868605732917786, "kl": 0.204833984375, "learning_rate": 5.216810466045448e-06, "loss": 0.0051, "reward": 2.1222099363803864, "reward_std": 0.5263746418058872, "rewards/accuracy_reward": 0.2745535832364112, "rewards/format_reward": 0.9174107536673546, "rewards/tag_count_reward": 0.9302455708384514, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.696, "grad_norm": 0.2207689732313156, "kl": 0.232666015625, "learning_rate": 5.135152923557647e-06, "loss": 0.0093, "reward": 2.2232143878936768, "reward_std": 0.5706557966768742, "rewards/accuracy_reward": 0.35714287613518536, "rewards/format_reward": 0.9241071864962578, "rewards/tag_count_reward": 0.9419643208384514, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 1023.9397354125977, "epoch": 0.6986666666666667, "grad_norm": 0.923786997795105, "kl": 0.3634033203125, "learning_rate": 5.053918152850868e-06, "loss": 0.0145, "reward": 2.0675224363803864, "reward_std": 0.6228335537016392, "rewards/accuracy_reward": 0.27678573061712086, "rewards/format_reward": 0.8883928954601288, "rewards/tag_count_reward": 0.902343787252903, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.7013333333333334, "grad_norm": 0.22262142598628998, "kl": 0.2320556640625, "learning_rate": 4.973113213503379e-06, "loss": 0.0094, "reward": 2.079799249768257, "reward_std": 0.49952351674437523, "rewards/accuracy_reward": 0.2566964402794838, "rewards/format_reward": 0.8995536118745804, "rewards/tag_count_reward": 0.9235491529107094, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 1021.7723236083984, "epoch": 0.704, "grad_norm": 0.23496830463409424, "kl": 0.2869873046875, "learning_rate": 4.8927451277396365e-06, "loss": 0.0033, "reward": 2.0965402722358704, "reward_std": 0.5459967963397503, "rewards/accuracy_reward": 0.2834821529686451, "rewards/format_reward": 0.8950893208384514, "rewards/tag_count_reward": 0.917968787252903, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.7066666666666667, "grad_norm": 0.35722586512565613, "kl": 0.2728271484375, "learning_rate": 4.812820879820034e-06, "loss": 0.0109, "reward": 2.1143974512815475, "reward_std": 0.5960128493607044, "rewards/accuracy_reward": 0.3080357275903225, "rewards/format_reward": 0.8883929029107094, "rewards/tag_count_reward": 0.9179687947034836, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 1023.5870590209961, "epoch": 0.7093333333333334, "grad_norm": 0.27110370993614197, "kl": 0.2879638671875, "learning_rate": 4.733347415433946e-06, "loss": 0.0107, "reward": 2.1824778020381927, "reward_std": 0.5552078559994698, "rewards/accuracy_reward": 0.330357164144516, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.9414063021540642, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 1017.9531478881836, "epoch": 0.712, "grad_norm": 0.2856296896934509, "kl": 0.3702392578125, "learning_rate": 4.654331641096118e-06, "loss": -0.0014, "reward": 2.164062574505806, "reward_std": 0.5980110131204128, "rewards/accuracy_reward": 0.35267858393490314, "rewards/format_reward": 0.8816964700818062, "rewards/tag_count_reward": 0.9296875447034836, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.7146666666666667, "grad_norm": 0.503265917301178, "kl": 0.4259033203125, "learning_rate": 4.575780423546476e-06, "loss": 0.017, "reward": 2.089843824505806, "reward_std": 0.6514175869524479, "rewards/accuracy_reward": 0.31473215227015316, "rewards/format_reward": 0.863839328289032, "rewards/tag_count_reward": 0.9112723618745804, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 1021.5401840209961, "epoch": 0.7173333333333334, "grad_norm": 0.5456968545913696, "kl": 0.456298828125, "learning_rate": 4.497700589153379e-06, "loss": 0.0121, "reward": 2.1322545558214188, "reward_std": 0.5633701980113983, "rewards/accuracy_reward": 0.31696429778821766, "rewards/format_reward": 0.8772321790456772, "rewards/tag_count_reward": 0.9380580857396126, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 1019.4308166503906, "epoch": 0.72, "grad_norm": 0.5453124642372131, "kl": 0.689453125, "learning_rate": 4.420098923320378e-06, "loss": 0.0179, "reward": 2.0892858058214188, "reward_std": 0.6481143087148666, "rewards/accuracy_reward": 0.29910715855658054, "rewards/format_reward": 0.8727678954601288, "rewards/tag_count_reward": 0.9174107536673546, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 1023.0803604125977, "epoch": 0.7226666666666667, "grad_norm": 1.6248281002044678, "kl": 0.951171875, "learning_rate": 4.342982169896555e-06, "loss": 0.0359, "reward": 2.1824777722358704, "reward_std": 0.6868480890989304, "rewards/accuracy_reward": 0.3906250186264515, "rewards/format_reward": 0.8794643208384514, "rewards/tag_count_reward": 0.9123884364962578, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 1017.8303909301758, "epoch": 0.7253333333333334, "grad_norm": 2.223525285720825, "kl": 0.97705078125, "learning_rate": 4.266357030590449e-06, "loss": 0.0254, "reward": 1.9916295409202576, "reward_std": 0.6709228046238422, "rewards/accuracy_reward": 0.23883929522708058, "rewards/format_reward": 0.8459821864962578, "rewards/tag_count_reward": 0.9068080857396126, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 1020.6607284545898, "epoch": 0.728, "grad_norm": 2.5242061614990234, "kl": 0.951171875, "learning_rate": 4.1902301643876555e-06, "loss": 0.027, "reward": 2.1835938841104507, "reward_std": 0.6639501675963402, "rewards/accuracy_reward": 0.3750000186264515, "rewards/format_reward": 0.8772321864962578, "rewards/tag_count_reward": 0.9313616454601288, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 1023.4754486083984, "epoch": 0.7306666666666667, "grad_norm": 0.40351805090904236, "kl": 0.8681640625, "learning_rate": 4.114608186972143e-06, "loss": 0.034, "reward": 2.135044753551483, "reward_std": 0.6348404288291931, "rewards/accuracy_reward": 0.33705358672887087, "rewards/format_reward": 0.8705357685685158, "rewards/tag_count_reward": 0.9274553954601288, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 1018.0268020629883, "epoch": 0.7333333333333333, "grad_norm": 2.133788585662842, "kl": 1.110595703125, "learning_rate": 4.0394976701513235e-06, "loss": 0.029, "reward": 2.1356027871370316, "reward_std": 0.7512710765004158, "rewards/accuracy_reward": 0.3995535857975483, "rewards/format_reward": 0.8303571939468384, "rewards/tag_count_reward": 0.9056920185685158, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 1023.6607208251953, "epoch": 0.736, "grad_norm": 0.8914547562599182, "kl": 0.768310546875, "learning_rate": 3.96490514128494e-06, "loss": 0.0301, "reward": 2.1155135184526443, "reward_std": 0.7052669823169708, "rewards/accuracy_reward": 0.34598215855658054, "rewards/format_reward": 0.848214328289032, "rewards/tag_count_reward": 0.9213170036673546, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 1023.9151840209961, "epoch": 0.7386666666666667, "grad_norm": 0.5397745966911316, "kl": 0.6962890625, "learning_rate": 3.890837082717822e-06, "loss": 0.0277, "reward": 2.074218839406967, "reward_std": 0.6800587102770805, "rewards/accuracy_reward": 0.3125000186264515, "rewards/format_reward": 0.8437500521540642, "rewards/tag_count_reward": 0.917968787252903, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 1018.1227951049805, "epoch": 0.7413333333333333, "grad_norm": 0.45244407653808594, "kl": 0.572509765625, "learning_rate": 3.817299931216537e-06, "loss": 0.0115, "reward": 2.083705484867096, "reward_std": 0.6785896308720112, "rewards/accuracy_reward": 0.29241072945296764, "rewards/format_reward": 0.8683036267757416, "rewards/tag_count_reward": 0.9229910969734192, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 1017.2433242797852, "epoch": 0.744, "grad_norm": 0.6339854598045349, "kl": 0.649658203125, "learning_rate": 3.74430007741003e-06, "loss": 0.0105, "reward": 2.0390626043081284, "reward_std": 0.6866142302751541, "rewards/accuracy_reward": 0.27455357764847577, "rewards/format_reward": 0.8504464700818062, "rewards/tag_count_reward": 0.9140625521540642, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.7466666666666667, "grad_norm": 0.6728559136390686, "kl": 0.6767578125, "learning_rate": 3.671843865234238e-06, "loss": 0.0271, "reward": 2.2684153020381927, "reward_std": 0.7245003581047058, "rewards/accuracy_reward": 0.4933035895228386, "rewards/format_reward": 0.859375037252903, "rewards/tag_count_reward": 0.9157366454601288, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 1023.7723236083984, "epoch": 0.7493333333333333, "grad_norm": 0.7666006088256836, "kl": 0.7099609375, "learning_rate": 3.599937591380791e-06, "loss": 0.0277, "reward": 2.04241082072258, "reward_std": 0.6483294367790222, "rewards/accuracy_reward": 0.27455358672887087, "rewards/format_reward": 0.8482143357396126, "rewards/tag_count_reward": 0.9196428880095482, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 1021.8035736083984, "epoch": 0.752, "grad_norm": 0.42735132575035095, "kl": 0.5888671875, "learning_rate": 3.5285875047498075e-06, "loss": 0.0193, "reward": 2.113839328289032, "reward_std": 0.6161059066653252, "rewards/accuracy_reward": 0.31696429708972573, "rewards/format_reward": 0.859375037252903, "rewards/tag_count_reward": 0.9375000521540642, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 1023.6383972167969, "epoch": 0.7546666666666667, "grad_norm": 1.4887264966964722, "kl": 0.679443359375, "learning_rate": 3.4577998059068354e-06, "loss": 0.0267, "reward": 1.9263393580913544, "reward_std": 0.7034792378544807, "rewards/accuracy_reward": 0.23883929778821766, "rewards/format_reward": 0.785714328289032, "rewards/tag_count_reward": 0.9017857536673546, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 1021.8303604125977, "epoch": 0.7573333333333333, "grad_norm": 0.34186023473739624, "kl": 0.44091796875, "learning_rate": 3.3875806465440152e-06, "loss": 0.0153, "reward": 2.2338171005249023, "reward_std": 0.6429506540298462, "rewards/accuracy_reward": 0.4151785857975483, "rewards/format_reward": 0.8772321939468384, "rewards/tag_count_reward": 0.9414062947034836, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 1021.6919708251953, "epoch": 0.76, "grad_norm": 0.318380743265152, "kl": 0.3946533203125, "learning_rate": 3.3179361289454694e-06, "loss": 0.0086, "reward": 2.2293528020381927, "reward_std": 0.6601455509662628, "rewards/accuracy_reward": 0.42410715483129025, "rewards/format_reward": 0.8593750521540642, "rewards/tag_count_reward": 0.9458705857396126, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.7626666666666667, "grad_norm": 0.3260420560836792, "kl": 0.3426513671875, "learning_rate": 3.2488723054569905e-06, "loss": 0.0138, "reward": 2.1450893729925156, "reward_std": 0.5205648727715015, "rewards/accuracy_reward": 0.2991071534343064, "rewards/format_reward": 0.8928571864962578, "rewards/tag_count_reward": 0.953125037252903, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.7653333333333333, "grad_norm": 0.2629320025444031, "kl": 0.2696533203125, "learning_rate": 3.1803951779600774e-06, "loss": 0.0109, "reward": 2.2338170260190964, "reward_std": 0.5047199167311192, "rewards/accuracy_reward": 0.3705357350409031, "rewards/format_reward": 0.9107143133878708, "rewards/tag_count_reward": 0.9525670111179352, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.768, "grad_norm": 0.3063962161540985, "kl": 0.22705078125, "learning_rate": 3.112510697350348e-06, "loss": 0.0091, "reward": 2.2137278020381927, "reward_std": 0.5222755149006844, "rewards/accuracy_reward": 0.3281250149011612, "rewards/format_reward": 0.9196428954601288, "rewards/tag_count_reward": 0.9659598618745804, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.7706666666666667, "grad_norm": 0.31240493059158325, "kl": 0.2208251953125, "learning_rate": 3.04522476302039e-06, "loss": 0.009, "reward": 2.4347099363803864, "reward_std": 0.49093519896268845, "rewards/accuracy_reward": 0.5223214514553547, "rewards/format_reward": 0.9352678954601288, "rewards/tag_count_reward": 0.9771205708384514, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.7733333333333333, "grad_norm": 0.2734697461128235, "kl": 0.21337890625, "learning_rate": 2.978543222347076e-06, "loss": 0.0088, "reward": 2.364955484867096, "reward_std": 0.38068827986717224, "rewards/accuracy_reward": 0.4241071604192257, "rewards/format_reward": 0.9575893208384514, "rewards/tag_count_reward": 0.9832589626312256, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 1019.6495666503906, "epoch": 0.776, "grad_norm": 0.2320028394460678, "kl": 0.2481689453125, "learning_rate": 2.912471870183411e-06, "loss": -0.0009, "reward": 2.199776917695999, "reward_std": 0.5590856000781059, "rewards/accuracy_reward": 0.34598216274753213, "rewards/format_reward": 0.8973214700818062, "rewards/tag_count_reward": 0.9564732611179352, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 1021.7767944335938, "epoch": 0.7786666666666666, "grad_norm": 0.29300227761268616, "kl": 0.1875, "learning_rate": 2.847016448354948e-06, "loss": -0.0004, "reward": 2.2890625298023224, "reward_std": 0.4556136131286621, "rewards/accuracy_reward": 0.3794643022119999, "rewards/format_reward": 0.9375000298023224, "rewards/tag_count_reward": 0.9720982536673546, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1021.7745590209961, "epoch": 0.7813333333333333, "grad_norm": 0.29833829402923584, "kl": 0.212158203125, "learning_rate": 2.782182645160789e-06, "loss": 0.0019, "reward": 2.275669738650322, "reward_std": 0.42217013984918594, "rewards/accuracy_reward": 0.36160715855658054, "rewards/format_reward": 0.944196455180645, "rewards/tag_count_reward": 0.9698661267757416, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.784, "grad_norm": 0.35956284403800964, "kl": 0.2496337890625, "learning_rate": 2.71797609487926e-06, "loss": 0.0101, "reward": 2.2433036863803864, "reward_std": 0.5022850632667542, "rewards/accuracy_reward": 0.3504464514553547, "rewards/format_reward": 0.9263393208384514, "rewards/tag_count_reward": 0.9665179029107094, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 1021.7455444335938, "epoch": 0.7866666666666666, "grad_norm": 0.23198282718658447, "kl": 0.3045654296875, "learning_rate": 2.6544023772782736e-06, "loss": 0.0012, "reward": 2.301339417695999, "reward_std": 0.4813353540375829, "rewards/accuracy_reward": 0.4129464477300644, "rewards/format_reward": 0.926339328289032, "rewards/tag_count_reward": 0.9620536118745804, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 1019.7142944335938, "epoch": 0.7893333333333333, "grad_norm": 0.27904388308525085, "kl": 0.3428955078125, "learning_rate": 2.591467017130426e-06, "loss": 0.0038, "reward": 2.1456473916769028, "reward_std": 0.4807808957993984, "rewards/accuracy_reward": 0.27232144260779023, "rewards/format_reward": 0.9196428954601288, "rewards/tag_count_reward": 0.9536830857396126, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.792, "grad_norm": 0.272941917181015, "kl": 0.2530517578125, "learning_rate": 2.5291754837328787e-06, "loss": 0.0101, "reward": 2.2561384737491608, "reward_std": 0.5383404716849327, "rewards/accuracy_reward": 0.36383930407464504, "rewards/format_reward": 0.9308036044239998, "rewards/tag_count_reward": 0.9614955708384514, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 1020.1339416503906, "epoch": 0.7946666666666666, "grad_norm": 0.3166991174221039, "kl": 0.23974609375, "learning_rate": 2.4675331904320533e-06, "loss": 0.0086, "reward": 2.244977742433548, "reward_std": 0.5296464376151562, "rewards/accuracy_reward": 0.37276787869632244, "rewards/format_reward": 0.9196428954601288, "rewards/tag_count_reward": 0.9525670185685158, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 1021.9933090209961, "epoch": 0.7973333333333333, "grad_norm": 0.24607542157173157, "kl": 0.208984375, "learning_rate": 2.4065454941531963e-06, "loss": 0.0012, "reward": 2.2957590520381927, "reward_std": 0.4782888777554035, "rewards/accuracy_reward": 0.39732144959270954, "rewards/format_reward": 0.9352678880095482, "rewards/tag_count_reward": 0.9631696864962578, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 1015.8437652587891, "epoch": 0.8, "grad_norm": 0.2551475167274475, "kl": 0.306396484375, "learning_rate": 2.346217694934847e-06, "loss": 0.0037, "reward": 2.2566965222358704, "reward_std": 0.5625656880438328, "rewards/accuracy_reward": 0.388392873108387, "rewards/format_reward": 0.9174107536673546, "rewards/tag_count_reward": 0.9508928880095482, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 1014.8750305175781, "epoch": 0.8026666666666666, "grad_norm": 0.21855834126472473, "kl": 0.2073974609375, "learning_rate": 2.286555035468233e-06, "loss": 0.0062, "reward": 2.209821566939354, "reward_std": 0.39225295558571815, "rewards/accuracy_reward": 0.2901785848662257, "rewards/format_reward": 0.9508928954601288, "rewards/tag_count_reward": 0.9687500447034836, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 1019.4687652587891, "epoch": 0.8053333333333333, "grad_norm": 0.2842949628829956, "kl": 0.219970703125, "learning_rate": 2.22756270064168e-06, "loss": 0.0085, "reward": 2.2840402722358704, "reward_std": 0.5333524160087109, "rewards/accuracy_reward": 0.404017873108387, "rewards/format_reward": 0.9241071939468384, "rewards/tag_count_reward": 0.9559152200818062, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 1017.2812652587891, "epoch": 0.808, "grad_norm": 1.0460567474365234, "kl": 0.333740234375, "learning_rate": 2.16924581709002e-06, "loss": 0.0141, "reward": 2.2137277722358704, "reward_std": 0.5989870205521584, "rewards/accuracy_reward": 0.36383930034935474, "rewards/format_reward": 0.9129464626312256, "rewards/tag_count_reward": 0.936941996216774, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 1014.8884201049805, "epoch": 0.8106666666666666, "grad_norm": 0.24618743360042572, "kl": 0.252197265625, "learning_rate": 2.1116094527490594e-06, "loss": -0.001, "reward": 2.2466518729925156, "reward_std": 0.5951163619756699, "rewards/accuracy_reward": 0.3995535857975483, "rewards/format_reward": 0.906250037252903, "rewards/tag_count_reward": 0.9408482611179352, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 1020.6183166503906, "epoch": 0.8133333333333334, "grad_norm": 0.24632352590560913, "kl": 0.28173828125, "learning_rate": 2.0546586164151827e-06, "loss": 0.0065, "reward": 2.1841518878936768, "reward_std": 0.5703849159181118, "rewards/accuracy_reward": 0.3482143050059676, "rewards/format_reward": 0.8995536044239998, "rewards/tag_count_reward": 0.9363839700818062, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 1019.8459930419922, "epoch": 0.816, "grad_norm": 0.23753482103347778, "kl": 0.2523193359375, "learning_rate": 1.9983982573100413e-06, "loss": 0.0035, "reward": 2.3169643580913544, "reward_std": 0.6238032579421997, "rewards/accuracy_reward": 0.4441964477300644, "rewards/format_reward": 0.9241071864962578, "rewards/tag_count_reward": 0.9486607536673546, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 1014.0536041259766, "epoch": 0.8186666666666667, "grad_norm": 0.20797064900398254, "kl": 0.2744140625, "learning_rate": 1.94283326465047e-06, "loss": -0.0052, "reward": 2.1305804401636124, "reward_std": 0.5734776593744755, "rewards/accuracy_reward": 0.2946428656578064, "rewards/format_reward": 0.8973214775323868, "rewards/tag_count_reward": 0.9386161118745804, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 1020.1183166503906, "epoch": 0.8213333333333334, "grad_norm": 0.30176863074302673, "kl": 0.24755859375, "learning_rate": 1.887968467223591e-06, "loss": 0.0048, "reward": 2.251674234867096, "reward_std": 0.6692539118230343, "rewards/accuracy_reward": 0.4174107350409031, "rewards/format_reward": 0.899553619325161, "rewards/tag_count_reward": 0.9347098618745804, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 1012.366096496582, "epoch": 0.824, "grad_norm": 0.324207603931427, "kl": 0.220703125, "learning_rate": 1.8338086329671734e-06, "loss": 0.0124, "reward": 2.1830358058214188, "reward_std": 0.5649261251091957, "rewards/accuracy_reward": 0.32812501350417733, "rewards/format_reward": 0.9107143208384514, "rewards/tag_count_reward": 0.9441964700818062, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 1018.5535888671875, "epoch": 0.8266666666666667, "grad_norm": 0.2686956524848938, "kl": 0.2506103515625, "learning_rate": 1.7803584685552877e-06, "loss": 0.001, "reward": 2.042968839406967, "reward_std": 0.604579221457243, "rewards/accuracy_reward": 0.22991072572767735, "rewards/format_reward": 0.8839286044239998, "rewards/tag_count_reward": 0.9291295185685158, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 1018.4419784545898, "epoch": 0.8293333333333334, "grad_norm": 0.3965815007686615, "kl": 0.19384765625, "learning_rate": 1.7276226189892763e-06, "loss": 0.0043, "reward": 2.279017925262451, "reward_std": 0.5148574188351631, "rewards/accuracy_reward": 0.361607164144516, "rewards/format_reward": 0.9464286118745804, "rewards/tag_count_reward": 0.9709821864962578, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 1017.2165451049805, "epoch": 0.832, "grad_norm": 0.3476318418979645, "kl": 0.244140625, "learning_rate": 1.6756056671940902e-06, "loss": 0.0017, "reward": 2.2522322684526443, "reward_std": 0.660854198038578, "rewards/accuracy_reward": 0.4084821669384837, "rewards/format_reward": 0.9017857536673546, "rewards/tag_count_reward": 0.9419643357396126, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 1020.2276916503906, "epoch": 0.8346666666666667, "grad_norm": 0.21767204999923706, "kl": 0.2122802734375, "learning_rate": 1.624312133620013e-06, "loss": 0.0087, "reward": 2.1450893729925156, "reward_std": 0.5749500542879105, "rewards/accuracy_reward": 0.29910715855658054, "rewards/format_reward": 0.9017857536673546, "rewards/tag_count_reward": 0.9441964700818062, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 1020.4263458251953, "epoch": 0.8373333333333334, "grad_norm": 0.5620092153549194, "kl": 0.3297119140625, "learning_rate": 1.5737464758498243e-06, "loss": 0.0137, "reward": 2.0965402722358704, "reward_std": 0.6482469737529755, "rewards/accuracy_reward": 0.29910715483129025, "rewards/format_reward": 0.8772321790456772, "rewards/tag_count_reward": 0.9202009439468384, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 1019.7901916503906, "epoch": 0.84, "grad_norm": 0.2823946475982666, "kl": 0.24169921875, "learning_rate": 1.523913088211415e-06, "loss": 0.0072, "reward": 2.210937589406967, "reward_std": 0.5960576869547367, "rewards/accuracy_reward": 0.3660714440047741, "rewards/format_reward": 0.899553619325161, "rewards/tag_count_reward": 0.9453125447034836, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8426666666666667, "grad_norm": 0.30682700872421265, "kl": 0.20391845703125, "learning_rate": 1.474816301395906e-06, "loss": 0.0082, "reward": 2.2784599363803864, "reward_std": 0.5610373616218567, "rewards/accuracy_reward": 0.3906250260770321, "rewards/format_reward": 0.9263393208384514, "rewards/tag_count_reward": 0.9614955708384514, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 1006.584846496582, "epoch": 0.8453333333333334, "grad_norm": 0.23224209249019623, "kl": 0.2933349609375, "learning_rate": 1.4264603820813006e-06, "loss": 0.0081, "reward": 1.9715402871370316, "reward_std": 0.5434744767844677, "rewards/accuracy_reward": 0.1629464323632419, "rewards/format_reward": 0.8750000447034836, "rewards/tag_count_reward": 0.9335938021540642, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 1012.3281555175781, "epoch": 0.848, "grad_norm": 0.37959590554237366, "kl": 0.422119140625, "learning_rate": 1.3788495325616912e-06, "loss": -0.0014, "reward": 2.26506707072258, "reward_std": 0.6333228126168251, "rewards/accuracy_reward": 0.42633930779993534, "rewards/format_reward": 0.899553619325161, "rewards/tag_count_reward": 0.9391741529107094, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 1018.6584930419922, "epoch": 0.8506666666666667, "grad_norm": 0.2974534034729004, "kl": 0.27685546875, "learning_rate": 1.3319878903820682e-06, "loss": 0.0003, "reward": 2.3013393878936768, "reward_std": 0.5945294424891472, "rewards/accuracy_reward": 0.4308035895228386, "rewards/format_reward": 0.9107143133878708, "rewards/tag_count_reward": 0.9598214700818062, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 1020.2968902587891, "epoch": 0.8533333333333334, "grad_norm": 0.32377177476882935, "kl": 0.2545166015625, "learning_rate": 1.2858795279787517e-06, "loss": 0.0079, "reward": 2.1796876043081284, "reward_std": 0.5994452647864819, "rewards/accuracy_reward": 0.35044644214212894, "rewards/format_reward": 0.8906250447034836, "rewards/tag_count_reward": 0.938616119325161, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 1018.444206237793, "epoch": 0.856, "grad_norm": 0.2041098177433014, "kl": 0.2281494140625, "learning_rate": 1.2405284523254823e-06, "loss": 0.0085, "reward": 2.3108259737491608, "reward_std": 0.5786739625036716, "rewards/accuracy_reward": 0.4486607387661934, "rewards/format_reward": 0.9151786118745804, "rewards/tag_count_reward": 0.9469866529107094, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 1020.6071548461914, "epoch": 0.8586666666666667, "grad_norm": 0.31456390023231506, "kl": 0.22998046875, "learning_rate": 1.195938604585205e-06, "loss": 0.0069, "reward": 2.13225457072258, "reward_std": 0.562423225492239, "rewards/accuracy_reward": 0.2767857313156128, "rewards/format_reward": 0.901785746216774, "rewards/tag_count_reward": 0.9536830931901932, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 1021.0089416503906, "epoch": 0.8613333333333333, "grad_norm": 0.18485142290592194, "kl": 0.1756591796875, "learning_rate": 1.152113859767565e-06, "loss": 0.0085, "reward": 2.2059152722358704, "reward_std": 0.4480607798323035, "rewards/accuracy_reward": 0.305803582072258, "rewards/format_reward": 0.9375000298023224, "rewards/tag_count_reward": 0.9626116454601288, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 1021.8169708251953, "epoch": 0.864, "grad_norm": 0.23961246013641357, "kl": 0.21728515625, "learning_rate": 1.109058026392158e-06, "loss": 0.0044, "reward": 2.2148438543081284, "reward_std": 0.5881692916154861, "rewards/accuracy_reward": 0.3705357350409031, "rewards/format_reward": 0.9040179029107094, "rewards/tag_count_reward": 0.9402902349829674, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 1023.1741104125977, "epoch": 0.8666666666666667, "grad_norm": 0.2150668352842331, "kl": 0.197265625, "learning_rate": 1.0667748461575544e-06, "loss": 0.008, "reward": 2.2271206229925156, "reward_std": 0.5270771663635969, "rewards/accuracy_reward": 0.3660714481957257, "rewards/format_reward": 0.9129464700818062, "rewards/tag_count_reward": 0.9481027200818062, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 1020.9531326293945, "epoch": 0.8693333333333333, "grad_norm": 0.24379926919937134, "kl": 0.2073974609375, "learning_rate": 1.0252679936161392e-06, "loss": 0.0088, "reward": 2.2271206378936768, "reward_std": 0.5633045695722103, "rewards/accuracy_reward": 0.35937502048909664, "rewards/format_reward": 0.9174107387661934, "rewards/tag_count_reward": 0.9503348544239998, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.872, "grad_norm": 0.4589475691318512, "kl": 0.230224609375, "learning_rate": 9.845410758547724e-07, "loss": 0.0093, "reward": 2.1964286863803864, "reward_std": 0.5628439746797085, "rewards/accuracy_reward": 0.3370535795111209, "rewards/format_reward": 0.9129464700818062, "rewards/tag_count_reward": 0.946428619325161, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 1012.912971496582, "epoch": 0.8746666666666667, "grad_norm": 0.28130125999450684, "kl": 0.342041015625, "learning_rate": 9.445976321813277e-07, "loss": -0.009, "reward": 2.3710938543081284, "reward_std": 0.5514123477041721, "rewards/accuracy_reward": 0.47544645005837083, "rewards/format_reward": 0.933035746216774, "rewards/tag_count_reward": 0.9626116454601288, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 1017.9263534545898, "epoch": 0.8773333333333333, "grad_norm": 0.19118691980838776, "kl": 0.335693359375, "learning_rate": 9.054411338171099e-07, "loss": -0.0008, "reward": 2.2952009737491608, "reward_std": 0.46351186372339725, "rewards/accuracy_reward": 0.3883928668219596, "rewards/format_reward": 0.9419643208384514, "rewards/tag_count_reward": 0.964843787252903, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 1021.4732208251953, "epoch": 0.88, "grad_norm": 0.1962963342666626, "kl": 0.181884765625, "learning_rate": 8.670749835951964e-07, "loss": 0.0054, "reward": 2.2952009737491608, "reward_std": 0.5603885129094124, "rewards/accuracy_reward": 0.4196428805589676, "rewards/format_reward": 0.9218750298023224, "rewards/tag_count_reward": 0.953683078289032, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 1021.9799194335938, "epoch": 0.8826666666666667, "grad_norm": 0.6351999640464783, "kl": 0.2130126953125, "learning_rate": 8.29502515664723e-07, "loss": 0.0044, "reward": 2.152901843190193, "reward_std": 0.5181849822402, "rewards/accuracy_reward": 0.2879464402794838, "rewards/format_reward": 0.9174107536673546, "rewards/tag_count_reward": 0.9475446939468384, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8853333333333333, "grad_norm": 0.28189706802368164, "kl": 0.2200927734375, "learning_rate": 7.927269952011285e-07, "loss": 0.0089, "reward": 2.146205425262451, "reward_std": 0.5443706884980202, "rewards/accuracy_reward": 0.31026787124574184, "rewards/format_reward": 0.8973214700818062, "rewards/tag_count_reward": 0.938616119325161, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 1017.8549270629883, "epoch": 0.888, "grad_norm": 0.39113467931747437, "kl": 0.2760009765625, "learning_rate": 7.567516181223966e-07, "loss": 0.0007, "reward": 2.158482238650322, "reward_std": 0.5754810310900211, "rewards/accuracy_reward": 0.2968750186264515, "rewards/format_reward": 0.9174107536673546, "rewards/tag_count_reward": 0.9441964700818062, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 1017.1585006713867, "epoch": 0.8906666666666667, "grad_norm": 0.26294657588005066, "kl": 0.2698974609375, "learning_rate": 7.215795108113343e-07, "loss": 0.0052, "reward": 2.2025670409202576, "reward_std": 0.6121005043387413, "rewards/accuracy_reward": 0.37053573317825794, "rewards/format_reward": 0.895089328289032, "rewards/tag_count_reward": 0.9369420185685158, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 1018.5334930419922, "epoch": 0.8933333333333333, "grad_norm": 0.28085288405418396, "kl": 0.2161865234375, "learning_rate": 6.872137298438653e-07, "loss": 0.0096, "reward": 2.2405134588479996, "reward_std": 0.5706223845481873, "rewards/accuracy_reward": 0.3616071653086692, "rewards/format_reward": 0.9263393208384514, "rewards/tag_count_reward": 0.9525670111179352, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 1020.0290298461914, "epoch": 0.896, "grad_norm": 0.35503754019737244, "kl": 0.277099609375, "learning_rate": 6.536572617234082e-07, "loss": 0.0092, "reward": 2.2360492199659348, "reward_std": 0.5588010214269161, "rewards/accuracy_reward": 0.3660714440047741, "rewards/format_reward": 0.9218750298023224, "rewards/tag_count_reward": 0.948102705180645, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 1016.4776916503906, "epoch": 0.8986666666666666, "grad_norm": 0.4076971113681793, "kl": 0.256103515625, "learning_rate": 6.209130226213378e-07, "loss": 0.0079, "reward": 2.22209832072258, "reward_std": 0.5934063121676445, "rewards/accuracy_reward": 0.3973214477300644, "rewards/format_reward": 0.8950893208384514, "rewards/tag_count_reward": 0.9296875447034836, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 1021.7678604125977, "epoch": 0.9013333333333333, "grad_norm": 0.18329821527004242, "kl": 0.22509765625, "learning_rate": 5.889838581235641e-07, "loss": 0.0102, "reward": 2.200334906578064, "reward_std": 0.5884822010993958, "rewards/accuracy_reward": 0.36383930081501603, "rewards/format_reward": 0.9017857536673546, "rewards/tag_count_reward": 0.9347098618745804, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 1021.4642944335938, "epoch": 0.904, "grad_norm": 0.4405348598957062, "kl": 0.29541015625, "learning_rate": 5.578725429832344e-07, "loss": 0.0112, "reward": 2.169084906578064, "reward_std": 0.6066409535706043, "rewards/accuracy_reward": 0.33258930034935474, "rewards/format_reward": 0.8973214775323868, "rewards/tag_count_reward": 0.9391741529107094, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 1009.591552734375, "epoch": 0.9066666666666666, "grad_norm": 0.3651023507118225, "kl": 0.3321533203125, "learning_rate": 5.275817808796013e-07, "loss": -0.0047, "reward": 2.168526902794838, "reward_std": 0.5713673643767834, "rewards/accuracy_reward": 0.330357164144516, "rewards/format_reward": 0.8995536118745804, "rewards/tag_count_reward": 0.9386161118745804, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 1015.3594131469727, "epoch": 0.9093333333333333, "grad_norm": 0.8885743021965027, "kl": 0.2982177734375, "learning_rate": 4.981142041830645e-07, "loss": 0.0078, "reward": 2.1093751192092896, "reward_std": 0.5496832653880119, "rewards/accuracy_reward": 0.29241072991862893, "rewards/format_reward": 0.8906250447034836, "rewards/tag_count_reward": 0.9263393208384514, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 1018.2276992797852, "epoch": 0.912, "grad_norm": 0.1936761438846588, "kl": 0.231689453125, "learning_rate": 4.6947237372640954e-07, "loss": 0.0124, "reward": 2.168526917695999, "reward_std": 0.5354543067514896, "rewards/accuracy_reward": 0.30133930081501603, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.9564732536673546, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 1013.1629791259766, "epoch": 0.9146666666666666, "grad_norm": 0.5735378861427307, "kl": 0.255615234375, "learning_rate": 4.416587785822568e-07, "loss": 0.0028, "reward": 2.1160715222358704, "reward_std": 0.5677376128733158, "rewards/accuracy_reward": 0.27678572572767735, "rewards/format_reward": 0.8928571864962578, "rewards/tag_count_reward": 0.946428619325161, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 1011.4062652587891, "epoch": 0.9173333333333333, "grad_norm": 0.2437291294336319, "kl": 0.23291015625, "learning_rate": 4.1467583584676395e-07, "loss": 0.0097, "reward": 2.2477679550647736, "reward_std": 0.5661342553794384, "rewards/accuracy_reward": 0.37500001303851604, "rewards/format_reward": 0.9174107536673546, "rewards/tag_count_reward": 0.9553571790456772, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 1023.0401840209961, "epoch": 0.92, "grad_norm": 0.30266594886779785, "kl": 0.258056640625, "learning_rate": 3.885258904295575e-07, "loss": 0.0103, "reward": 2.1132813096046448, "reward_std": 0.5288172848522663, "rewards/accuracy_reward": 0.2723214365541935, "rewards/format_reward": 0.8995536118745804, "rewards/tag_count_reward": 0.9414062947034836, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 1018.3459930419922, "epoch": 0.9226666666666666, "grad_norm": 0.44057974219322205, "kl": 0.41259765625, "learning_rate": 3.6321121484996447e-07, "loss": 0.0007, "reward": 2.1322545260190964, "reward_std": 0.6624284163117409, "rewards/accuracy_reward": 0.3370535895228386, "rewards/format_reward": 0.8750000447034836, "rewards/tag_count_reward": 0.9202009290456772, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 1015.5625305175781, "epoch": 0.9253333333333333, "grad_norm": 0.5109536051750183, "kl": 0.3021240234375, "learning_rate": 3.3873400903951636e-07, "loss": 0.0071, "reward": 2.1975447237491608, "reward_std": 0.5756550095975399, "rewards/accuracy_reward": 0.36607144586741924, "rewards/format_reward": 0.8928571790456772, "rewards/tag_count_reward": 0.938616119325161, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 1015.7433242797852, "epoch": 0.928, "grad_norm": 0.2958410382270813, "kl": 0.290283203125, "learning_rate": 3.1509640015076946e-07, "loss": 0.0115, "reward": 2.1434152871370316, "reward_std": 0.680501900613308, "rewards/accuracy_reward": 0.3504464477300644, "rewards/format_reward": 0.8705357536673546, "rewards/tag_count_reward": 0.9224330857396126, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 1019.8058166503906, "epoch": 0.9306666666666666, "grad_norm": 0.4034452736377716, "kl": 0.27490234375, "learning_rate": 2.923004423724474e-07, "loss": 0.0029, "reward": 2.22600457072258, "reward_std": 0.5555046014487743, "rewards/accuracy_reward": 0.37946430407464504, "rewards/format_reward": 0.899553619325161, "rewards/tag_count_reward": 0.9469866529107094, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 1019.709831237793, "epoch": 0.9333333333333333, "grad_norm": 0.22715678811073303, "kl": 0.259033203125, "learning_rate": 2.703481167509281e-07, "loss": 0.01, "reward": 2.2159599363803864, "reward_std": 0.6509725004434586, "rewards/accuracy_reward": 0.39732144959270954, "rewards/format_reward": 0.8816964700818062, "rewards/tag_count_reward": 0.9369420036673546, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 1014.2857284545898, "epoch": 0.936, "grad_norm": 0.28359052538871765, "kl": 0.2364501953125, "learning_rate": 2.4924133101807636e-07, "loss": 0.0082, "reward": 2.3164063692092896, "reward_std": 0.5717856138944626, "rewards/accuracy_reward": 0.4397321715950966, "rewards/format_reward": 0.921875037252903, "rewards/tag_count_reward": 0.9547991454601288, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 1010.491096496582, "epoch": 0.9386666666666666, "grad_norm": 0.2850205600261688, "kl": 0.431640625, "learning_rate": 2.289819194254661e-07, "loss": -0.0118, "reward": 2.1925224363803864, "reward_std": 0.6137520037591457, "rewards/accuracy_reward": 0.3459821604192257, "rewards/format_reward": 0.9017857611179352, "rewards/tag_count_reward": 0.9447545111179352, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 1018.2455520629883, "epoch": 0.9413333333333334, "grad_norm": 0.25859472155570984, "kl": 0.2259521484375, "learning_rate": 2.0957164258497031e-07, "loss": 0.0008, "reward": 2.210937574505806, "reward_std": 0.521145723760128, "rewards/accuracy_reward": 0.33705358766019344, "rewards/format_reward": 0.9196428880095482, "rewards/tag_count_reward": 0.954241119325161, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 1019.0000152587891, "epoch": 0.944, "grad_norm": 0.24050916731357574, "kl": 0.2647705078125, "learning_rate": 1.9101218731575777e-07, "loss": 0.0093, "reward": 2.039620652794838, "reward_std": 0.5573948994278908, "rewards/accuracy_reward": 0.2187500111758709, "rewards/format_reward": 0.8861607536673546, "rewards/tag_count_reward": 0.9347098767757416, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 1005.6272659301758, "epoch": 0.9466666666666667, "grad_norm": 29.152254104614258, "kl": 15.73291015625, "learning_rate": 1.73305166497707e-07, "loss": 0.0494, "reward": 2.1562501341104507, "reward_std": 0.6062168106436729, "rewards/accuracy_reward": 0.33705358766019344, "rewards/format_reward": 0.883928619325161, "rewards/tag_count_reward": 0.9352679029107094, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 1018.8727874755859, "epoch": 0.9493333333333334, "grad_norm": 0.22955262660980225, "kl": 0.20166015625, "learning_rate": 1.5645211893123846e-07, "loss": 0.0064, "reward": 2.2371652722358704, "reward_std": 0.6186549700796604, "rewards/accuracy_reward": 0.4062500186264515, "rewards/format_reward": 0.8928571790456772, "rewards/tag_count_reward": 0.938058078289032, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 1020.5625152587891, "epoch": 0.952, "grad_norm": 0.3843916952610016, "kl": 0.2706298828125, "learning_rate": 1.4045450920358917e-07, "loss": 0.0079, "reward": 2.1729911416769028, "reward_std": 0.5495161339640617, "rewards/accuracy_reward": 0.33035715855658054, "rewards/format_reward": 0.8995536044239998, "rewards/tag_count_reward": 0.9430804029107094, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 1016.8459930419922, "epoch": 0.9546666666666667, "grad_norm": 0.20025238394737244, "kl": 0.229248046875, "learning_rate": 1.2531372756153458e-07, "loss": 0.0013, "reward": 2.2098215222358704, "reward_std": 0.5276618581265211, "rewards/accuracy_reward": 0.3258928693830967, "rewards/format_reward": 0.9263393208384514, "rewards/tag_count_reward": 0.957589328289032, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 1009.1250305175781, "epoch": 0.9573333333333334, "grad_norm": 0.4796367585659027, "kl": 0.453369140625, "learning_rate": 1.1103108979056865e-07, "loss": 0.0055, "reward": 2.159040242433548, "reward_std": 0.6154494881629944, "rewards/accuracy_reward": 0.3526785857975483, "rewards/format_reward": 0.87276791036129, "rewards/tag_count_reward": 0.9335937947034836, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 1017.2053833007812, "epoch": 0.96, "grad_norm": 0.24999898672103882, "kl": 0.257080078125, "learning_rate": 9.760783710056176e-08, "loss": 0.0102, "reward": 2.225446566939354, "reward_std": 0.49892666935920715, "rewards/accuracy_reward": 0.361607164144516, "rewards/format_reward": 0.9129464626312256, "rewards/tag_count_reward": 0.9508929029107094, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 1018.8705520629883, "epoch": 0.9626666666666667, "grad_norm": 0.24726729094982147, "kl": 0.26513671875, "learning_rate": 8.504513601789388e-08, "loss": 0.0106, "reward": 2.1194197237491608, "reward_std": 0.6447071582078934, "rewards/accuracy_reward": 0.33035716228187084, "rewards/format_reward": 0.868303619325161, "rewards/tag_count_reward": 0.9207589626312256, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 1015.9352798461914, "epoch": 0.9653333333333334, "grad_norm": 0.31679829955101013, "kl": 0.26416015625, "learning_rate": 7.334407828407885e-08, "loss": 0.009, "reward": 2.240513488650322, "reward_std": 0.5577768888324499, "rewards/accuracy_reward": 0.3816964514553547, "rewards/format_reward": 0.9062500447034836, "rewards/tag_count_reward": 0.952566996216774, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 1017.8460083007812, "epoch": 0.968, "grad_norm": 0.2769128978252411, "kl": 0.240234375, "learning_rate": 6.250568076088814e-08, "loss": 0.0072, "reward": 2.214843839406967, "reward_std": 0.4956537261605263, "rewards/accuracy_reward": 0.33258930314332247, "rewards/format_reward": 0.9241071939468384, "rewards/tag_count_reward": 0.9581473618745804, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 1014.8415374755859, "epoch": 0.9706666666666667, "grad_norm": 0.2232033759355545, "kl": 0.216552734375, "learning_rate": 5.2530885341982586e-08, "loss": 0.0007, "reward": 2.321986734867096, "reward_std": 0.5681647323071957, "rewards/accuracy_reward": 0.4352678768336773, "rewards/format_reward": 0.926339328289032, "rewards/tag_count_reward": 0.9603795111179352, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 1019.8482208251953, "epoch": 0.9733333333333334, "grad_norm": 0.2994762063026428, "kl": 0.2735595703125, "learning_rate": 4.3420558871060116e-08, "loss": 0.0128, "reward": 2.224888503551483, "reward_std": 0.5850169435143471, "rewards/accuracy_reward": 0.37723216228187084, "rewards/format_reward": 0.9017857611179352, "rewards/tag_count_reward": 0.9458705857396126, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 1021.2544784545898, "epoch": 0.976, "grad_norm": 0.21192388236522675, "kl": 0.2596435546875, "learning_rate": 3.517549306652157e-08, "loss": 0.0118, "reward": 2.1026787012815475, "reward_std": 0.5294410735368729, "rewards/accuracy_reward": 0.2544642947614193, "rewards/format_reward": 0.9040178880095482, "rewards/tag_count_reward": 0.9441964775323868, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 1018.3058242797852, "epoch": 0.9786666666666667, "grad_norm": 0.34231218695640564, "kl": 0.2655029296875, "learning_rate": 2.7796404452666847e-08, "loss": 0.0085, "reward": 2.29241082072258, "reward_std": 0.617542814463377, "rewards/accuracy_reward": 0.43526787497103214, "rewards/format_reward": 0.9017857536673546, "rewards/tag_count_reward": 0.9553571864962578, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 1011.8415374755859, "epoch": 0.9813333333333333, "grad_norm": 0.2850145101547241, "kl": 0.24853515625, "learning_rate": 2.1283934297432472e-08, "loss": 0.0066, "reward": 2.2695313692092896, "reward_std": 0.5283640064299107, "rewards/accuracy_reward": 0.38616072945296764, "rewards/format_reward": 0.9263393208384514, "rewards/tag_count_reward": 0.9570312947034836, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 1020.8393020629883, "epoch": 0.984, "grad_norm": 0.23045480251312256, "kl": 0.2613525390625, "learning_rate": 1.5638648556656198e-08, "loss": 0.0103, "reward": 2.2566965371370316, "reward_std": 0.6659301854670048, "rewards/accuracy_reward": 0.43973216880112886, "rewards/format_reward": 0.883928619325161, "rewards/tag_count_reward": 0.9330357611179352, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 1018.4531326293945, "epoch": 0.9866666666666667, "grad_norm": 0.4619045853614807, "kl": 0.31494140625, "learning_rate": 1.0861037824896337e-08, "loss": 0.0051, "reward": 2.2109376192092896, "reward_std": 0.6436006389558315, "rewards/accuracy_reward": 0.3973214514553547, "rewards/format_reward": 0.8816964775323868, "rewards/tag_count_reward": 0.9319196939468384, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 1017.1629638671875, "epoch": 0.9893333333333333, "grad_norm": 0.4388638734817505, "kl": 0.2684326171875, "learning_rate": 6.951517292800303e-09, "loss": 0.0092, "reward": 2.20814748108387, "reward_std": 0.5858336836099625, "rewards/accuracy_reward": 0.3526785932481289, "rewards/format_reward": 0.910714328289032, "rewards/tag_count_reward": 0.9447545036673546, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 1013.9419860839844, "epoch": 0.992, "grad_norm": 0.23698575794696808, "kl": 0.301513671875, "learning_rate": 3.9104267110168235e-09, "loss": 0.0068, "reward": 2.0736608058214188, "reward_std": 0.6049975231289864, "rewards/accuracy_reward": 0.310267869848758, "rewards/format_reward": 0.8526786118745804, "rewards/tag_count_reward": 0.910714328289032, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 1015.5446701049805, "epoch": 0.9946666666666667, "grad_norm": 0.1941290646791458, "kl": 0.21368408203125, "learning_rate": 1.738030360677323e-09, "loss": 0.0099, "reward": 2.3370536863803864, "reward_std": 0.5505933277308941, "rewards/accuracy_reward": 0.4620535932481289, "rewards/format_reward": 0.9241071790456772, "rewards/tag_count_reward": 0.9508928954601288, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 1015.7143173217773, "epoch": 0.9973333333333333, "grad_norm": 0.2728719115257263, "kl": 0.245849609375, "learning_rate": 4.3451703042207694e-10, "loss": -0.0058, "reward": 2.1406251341104507, "reward_std": 0.5760147906839848, "rewards/accuracy_reward": 0.29464287124574184, "rewards/format_reward": 0.8973214626312256, "rewards/tag_count_reward": 0.9486607536673546, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 1.0, "grad_norm": 0.2957329750061035, "kl": 0.3221435546875, "learning_rate": 0.0, "loss": 0.0103, "reward": 2.0814733505249023, "reward_std": 0.5692420080304146, "rewards/accuracy_reward": 0.2700892947614193, "rewards/format_reward": 0.8816964700818062, "rewards/tag_count_reward": 0.9296875447034836, "step": 375 }, { "epoch": 1.0, "step": 375, "total_flos": 0.0, "train_loss": 17.059915766330747, "train_runtime": 37131.9859, "train_samples_per_second": 0.323, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 375, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }