| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 375, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 458.96876525878906, | |
| "epoch": 0.0026666666666666666, | |
| "grad_norm": 0.46892526745796204, | |
| "kl": 0.0, | |
| "learning_rate": 5.263157894736843e-07, | |
| "loss": 0.0151, | |
| "reward": 0.27287947945296764, | |
| "reward_std": 0.4559166468679905, | |
| "rewards/accuracy_reward": 0.15178571781143546, | |
| "rewards/format_reward": 0.03571428800933063, | |
| "rewards/tag_count_reward": 0.08537946850992739, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 484.49109649658203, | |
| "epoch": 0.005333333333333333, | |
| "grad_norm": 0.3571339547634125, | |
| "kl": 0.0, | |
| "learning_rate": 1.0526315789473685e-06, | |
| "loss": 0.0171, | |
| "reward": 0.35435269586741924, | |
| "reward_std": 0.45219872146844864, | |
| "rewards/accuracy_reward": 0.23437501164153218, | |
| "rewards/format_reward": 0.03571428777649999, | |
| "rewards/tag_count_reward": 0.08426339691504836, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 485.7589530944824, | |
| "epoch": 0.008, | |
| "grad_norm": 0.40610507130622864, | |
| "kl": 0.00011050701141357422, | |
| "learning_rate": 1.5789473684210526e-06, | |
| "loss": 0.0203, | |
| "reward": 0.28962054662406445, | |
| "reward_std": 0.3996347077190876, | |
| "rewards/accuracy_reward": 0.18303572479635477, | |
| "rewards/format_reward": 0.02232142980210483, | |
| "rewards/tag_count_reward": 0.084263397147879, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 465.26341247558594, | |
| "epoch": 0.010666666666666666, | |
| "grad_norm": 0.40742260217666626, | |
| "kl": 0.00010585784912109375, | |
| "learning_rate": 2.105263157894737e-06, | |
| "loss": 0.0039, | |
| "reward": 0.2533482275903225, | |
| "reward_std": 0.40483200177550316, | |
| "rewards/accuracy_reward": 0.16517857555299997, | |
| "rewards/format_reward": 0.02232142980210483, | |
| "rewards/tag_count_reward": 0.06584821734577417, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 509.9531440734863, | |
| "epoch": 0.013333333333333334, | |
| "grad_norm": 0.39532214403152466, | |
| "kl": 0.0002484321594238281, | |
| "learning_rate": 2.631578947368421e-06, | |
| "loss": 0.022, | |
| "reward": 0.3203125111758709, | |
| "reward_std": 0.46360545977950096, | |
| "rewards/accuracy_reward": 0.16741072293370962, | |
| "rewards/format_reward": 0.046875001629814506, | |
| "rewards/tag_count_reward": 0.10602679150179029, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 471.4799270629883, | |
| "epoch": 0.016, | |
| "grad_norm": 0.4526801109313965, | |
| "kl": 0.002498626708984375, | |
| "learning_rate": 3.157894736842105e-06, | |
| "loss": 0.0443, | |
| "reward": 0.3766741268336773, | |
| "reward_std": 0.5414610058069229, | |
| "rewards/accuracy_reward": 0.12500000651925802, | |
| "rewards/format_reward": 0.07812500395812094, | |
| "rewards/tag_count_reward": 0.17354911426082253, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 464.9776954650879, | |
| "epoch": 0.018666666666666668, | |
| "grad_norm": 0.6029561758041382, | |
| "kl": 0.0282135009765625, | |
| "learning_rate": 3.6842105263157896e-06, | |
| "loss": 0.0547, | |
| "reward": 0.4609375260770321, | |
| "reward_std": 0.5835300870239735, | |
| "rewards/accuracy_reward": 0.15848214970901608, | |
| "rewards/format_reward": 0.10044643096625805, | |
| "rewards/tag_count_reward": 0.2020089365541935, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 486.8638610839844, | |
| "epoch": 0.021333333333333333, | |
| "grad_norm": 12.084760665893555, | |
| "kl": 0.3585205078125, | |
| "learning_rate": 4.210526315789474e-06, | |
| "loss": 0.0975, | |
| "reward": 0.6953125298023224, | |
| "reward_std": 0.6895988658070564, | |
| "rewards/accuracy_reward": 0.1941964365541935, | |
| "rewards/format_reward": 0.16964286379516125, | |
| "rewards/tag_count_reward": 0.3314732275903225, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 505.02457427978516, | |
| "epoch": 0.024, | |
| "grad_norm": 2.077465772628784, | |
| "kl": 0.099151611328125, | |
| "learning_rate": 4.736842105263158e-06, | |
| "loss": 0.0874, | |
| "reward": 0.6886161044239998, | |
| "reward_std": 0.7351350113749504, | |
| "rewards/accuracy_reward": 0.22991072619333863, | |
| "rewards/format_reward": 0.1830357238650322, | |
| "rewards/tag_count_reward": 0.27566965483129025, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 424.9910888671875, | |
| "epoch": 0.02666666666666667, | |
| "grad_norm": 0.5744335651397705, | |
| "kl": 0.0402679443359375, | |
| "learning_rate": 5.263157894736842e-06, | |
| "loss": 0.0758, | |
| "reward": 0.8264509364962578, | |
| "reward_std": 0.7150396555662155, | |
| "rewards/accuracy_reward": 0.341517873108387, | |
| "rewards/format_reward": 0.17187500931322575, | |
| "rewards/tag_count_reward": 0.3130580522119999, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 500.03350830078125, | |
| "epoch": 0.029333333333333333, | |
| "grad_norm": 0.587772786617279, | |
| "kl": 0.0297393798828125, | |
| "learning_rate": 5.789473684210527e-06, | |
| "loss": 0.0688, | |
| "reward": 0.651785746216774, | |
| "reward_std": 0.6237742006778717, | |
| "rewards/accuracy_reward": 0.2366071566939354, | |
| "rewards/format_reward": 0.16964286658912897, | |
| "rewards/tag_count_reward": 0.2455357275903225, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 492.5357360839844, | |
| "epoch": 0.032, | |
| "grad_norm": 0.4513722360134125, | |
| "kl": 0.020191192626953125, | |
| "learning_rate": 6.31578947368421e-06, | |
| "loss": 0.0719, | |
| "reward": 0.7382812947034836, | |
| "reward_std": 0.6591350436210632, | |
| "rewards/accuracy_reward": 0.2500000111758709, | |
| "rewards/format_reward": 0.20535715576261282, | |
| "rewards/tag_count_reward": 0.28292411752045155, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 421.35493087768555, | |
| "epoch": 0.034666666666666665, | |
| "grad_norm": 0.4251323342323303, | |
| "kl": 0.0242767333984375, | |
| "learning_rate": 6.842105263157896e-06, | |
| "loss": 0.0909, | |
| "reward": 0.8208705708384514, | |
| "reward_std": 0.655682947486639, | |
| "rewards/accuracy_reward": 0.37053573969751596, | |
| "rewards/format_reward": 0.18080357927829027, | |
| "rewards/tag_count_reward": 0.26953126303851604, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 452.12055587768555, | |
| "epoch": 0.037333333333333336, | |
| "grad_norm": 0.46205392479896545, | |
| "kl": 0.01677703857421875, | |
| "learning_rate": 7.368421052631579e-06, | |
| "loss": 0.1193, | |
| "reward": 1.0200893357396126, | |
| "reward_std": 0.7653229907155037, | |
| "rewards/accuracy_reward": 0.3325893059372902, | |
| "rewards/format_reward": 0.3102678768336773, | |
| "rewards/tag_count_reward": 0.37723215483129025, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 448.9710006713867, | |
| "epoch": 0.04, | |
| "grad_norm": 0.4762975573539734, | |
| "kl": 0.0328216552734375, | |
| "learning_rate": 7.894736842105265e-06, | |
| "loss": 0.1402, | |
| "reward": 1.0072545260190964, | |
| "reward_std": 0.7546271607279778, | |
| "rewards/accuracy_reward": 0.3125000186264515, | |
| "rewards/format_reward": 0.29464287124574184, | |
| "rewards/tag_count_reward": 0.400111623108387, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 385.267879486084, | |
| "epoch": 0.042666666666666665, | |
| "grad_norm": 0.7390420436859131, | |
| "kl": 0.057586669921875, | |
| "learning_rate": 8.421052631578948e-06, | |
| "loss": 0.1962, | |
| "reward": 1.1467634439468384, | |
| "reward_std": 0.855265200138092, | |
| "rewards/accuracy_reward": 0.1584821492433548, | |
| "rewards/format_reward": 0.4508928768336773, | |
| "rewards/tag_count_reward": 0.5373884215950966, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 313.7522487640381, | |
| "epoch": 0.04533333333333334, | |
| "grad_norm": 5.1166672706604, | |
| "kl": 0.2215576171875, | |
| "learning_rate": 8.947368421052632e-06, | |
| "loss": 0.2362, | |
| "reward": 1.5094866752624512, | |
| "reward_std": 0.7659965306520462, | |
| "rewards/accuracy_reward": 0.20758929196745157, | |
| "rewards/format_reward": 0.5825893096625805, | |
| "rewards/tag_count_reward": 0.7193080633878708, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 257.66518783569336, | |
| "epoch": 0.048, | |
| "grad_norm": 227.28211975097656, | |
| "kl": 4.34381103515625, | |
| "learning_rate": 9.473684210526315e-06, | |
| "loss": 0.1844, | |
| "reward": 1.6919643580913544, | |
| "reward_std": 0.5936451926827431, | |
| "rewards/accuracy_reward": 0.10937500605359674, | |
| "rewards/format_reward": 0.7366071790456772, | |
| "rewards/tag_count_reward": 0.8459821790456772, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 265.9732208251953, | |
| "epoch": 0.050666666666666665, | |
| "grad_norm": 1.1222331523895264, | |
| "kl": 0.14581298828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1309, | |
| "reward": 1.865513488650322, | |
| "reward_std": 0.505423042923212, | |
| "rewards/accuracy_reward": 0.08705357438884676, | |
| "rewards/format_reward": 0.8459821864962578, | |
| "rewards/tag_count_reward": 0.9324777275323868, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 284.7544746398926, | |
| "epoch": 0.05333333333333334, | |
| "grad_norm": 1.1658185720443726, | |
| "kl": 0.17352294921875, | |
| "learning_rate": 1.0526315789473684e-05, | |
| "loss": 0.0248, | |
| "reward": 1.9095983058214188, | |
| "reward_std": 0.4629954472184181, | |
| "rewards/accuracy_reward": 0.07812500209547579, | |
| "rewards/format_reward": 0.8772321939468384, | |
| "rewards/tag_count_reward": 0.954241119325161, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 362.5602798461914, | |
| "epoch": 0.056, | |
| "grad_norm": 2.576747179031372, | |
| "kl": 0.215087890625, | |
| "learning_rate": 1.105263157894737e-05, | |
| "loss": -0.0262, | |
| "reward": 1.9508929550647736, | |
| "reward_std": 0.45306090638041496, | |
| "rewards/accuracy_reward": 0.08928571990691125, | |
| "rewards/format_reward": 0.8928571939468384, | |
| "rewards/tag_count_reward": 0.9687500447034836, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 402.1875190734863, | |
| "epoch": 0.058666666666666666, | |
| "grad_norm": 0.5915634036064148, | |
| "kl": 0.10125732421875, | |
| "learning_rate": 1.1578947368421053e-05, | |
| "loss": -0.024, | |
| "reward": 1.9921875894069672, | |
| "reward_std": 0.47527335956692696, | |
| "rewards/accuracy_reward": 0.13392857555299997, | |
| "rewards/format_reward": 0.8973214775323868, | |
| "rewards/tag_count_reward": 0.9609375521540642, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 407.0379638671875, | |
| "epoch": 0.06133333333333333, | |
| "grad_norm": 1.3497282266616821, | |
| "kl": 0.19049072265625, | |
| "learning_rate": 1.2105263157894737e-05, | |
| "loss": -0.0088, | |
| "reward": 1.5842634737491608, | |
| "reward_std": 0.6860349476337433, | |
| "rewards/accuracy_reward": 0.3080357313156128, | |
| "rewards/format_reward": 0.4776786006987095, | |
| "rewards/tag_count_reward": 0.7985491454601288, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 391.67635345458984, | |
| "epoch": 0.064, | |
| "grad_norm": 0.8537726402282715, | |
| "kl": 0.16400146484375, | |
| "learning_rate": 1.263157894736842e-05, | |
| "loss": 0.0033, | |
| "reward": 1.8833706229925156, | |
| "reward_std": 0.6259407699108124, | |
| "rewards/accuracy_reward": 0.22098215599544346, | |
| "rewards/format_reward": 0.7433036118745804, | |
| "rewards/tag_count_reward": 0.9190848618745804, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 454.6205635070801, | |
| "epoch": 0.06666666666666667, | |
| "grad_norm": 1.942647099494934, | |
| "kl": 0.2576904296875, | |
| "learning_rate": 1.3157894736842108e-05, | |
| "loss": 0.0536, | |
| "reward": 1.9882813543081284, | |
| "reward_std": 0.48262083530426025, | |
| "rewards/accuracy_reward": 0.17187500931322575, | |
| "rewards/format_reward": 0.863839328289032, | |
| "rewards/tag_count_reward": 0.9525670036673546, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 542.8727912902832, | |
| "epoch": 0.06933333333333333, | |
| "grad_norm": 2.0896551609039307, | |
| "kl": 0.12127685546875, | |
| "learning_rate": 1.3684210526315791e-05, | |
| "loss": 0.0504, | |
| "reward": 1.8939732909202576, | |
| "reward_std": 0.5775122344493866, | |
| "rewards/accuracy_reward": 0.16294643306173384, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "rewards/tag_count_reward": 0.918526828289032, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 586.6004753112793, | |
| "epoch": 0.072, | |
| "grad_norm": 1.5025179386138916, | |
| "kl": 0.171630859375, | |
| "learning_rate": 1.4210526315789475e-05, | |
| "loss": 0.0595, | |
| "reward": 1.727678656578064, | |
| "reward_std": 0.6498573049902916, | |
| "rewards/accuracy_reward": 0.1250000053551048, | |
| "rewards/format_reward": 0.714285746216774, | |
| "rewards/tag_count_reward": 0.8883929029107094, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 566.8102912902832, | |
| "epoch": 0.07466666666666667, | |
| "grad_norm": 32.01606369018555, | |
| "kl": 3.818359375, | |
| "learning_rate": 1.4736842105263159e-05, | |
| "loss": 0.2776, | |
| "reward": 1.5814732760190964, | |
| "reward_std": 0.7524110227823257, | |
| "rewards/accuracy_reward": 0.19419643934816122, | |
| "rewards/format_reward": 0.589285746216774, | |
| "rewards/tag_count_reward": 0.7979911044239998, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 560.9710121154785, | |
| "epoch": 0.07733333333333334, | |
| "grad_norm": 27.419784545898438, | |
| "kl": 2.98828125, | |
| "learning_rate": 1.5263157894736846e-05, | |
| "loss": 0.3264, | |
| "reward": 1.4570313096046448, | |
| "reward_std": 0.8036252707242966, | |
| "rewards/accuracy_reward": 0.18526786705479026, | |
| "rewards/format_reward": 0.5111607350409031, | |
| "rewards/tag_count_reward": 0.7606027126312256, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 678.9933395385742, | |
| "epoch": 0.08, | |
| "grad_norm": 5.0629682540893555, | |
| "kl": 0.550537109375, | |
| "learning_rate": 1.578947368421053e-05, | |
| "loss": 0.2401, | |
| "reward": 1.18526791036129, | |
| "reward_std": 0.7161316871643066, | |
| "rewards/accuracy_reward": 0.14062500419095159, | |
| "rewards/format_reward": 0.3348214440047741, | |
| "rewards/tag_count_reward": 0.7098214626312256, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 660.4018173217773, | |
| "epoch": 0.08266666666666667, | |
| "grad_norm": 37.217041015625, | |
| "kl": 0.595703125, | |
| "learning_rate": 1.6315789473684213e-05, | |
| "loss": 0.142, | |
| "reward": 1.1584821939468384, | |
| "reward_std": 0.6607379615306854, | |
| "rewards/accuracy_reward": 0.15625000931322575, | |
| "rewards/format_reward": 0.2745535895228386, | |
| "rewards/tag_count_reward": 0.7276786118745804, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 658.4107513427734, | |
| "epoch": 0.08533333333333333, | |
| "grad_norm": 6019.00439453125, | |
| "kl": 20.4326171875, | |
| "learning_rate": 1.6842105263157896e-05, | |
| "loss": 1.3873, | |
| "reward": 0.9341518431901932, | |
| "reward_std": 0.5787044316530228, | |
| "rewards/accuracy_reward": 0.058035716181620955, | |
| "rewards/format_reward": 0.14508929196745157, | |
| "rewards/tag_count_reward": 0.7310268208384514, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 580.5580596923828, | |
| "epoch": 0.088, | |
| "grad_norm": 290.9708557128906, | |
| "kl": 8.6171875, | |
| "learning_rate": 1.736842105263158e-05, | |
| "loss": 0.4883, | |
| "reward": 0.7516741380095482, | |
| "reward_std": 0.5007706061005592, | |
| "rewards/accuracy_reward": 0.029017858440056443, | |
| "rewards/format_reward": 0.07812500349245965, | |
| "rewards/tag_count_reward": 0.644531287252903, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 583.0335083007812, | |
| "epoch": 0.09066666666666667, | |
| "grad_norm": 16.34986114501953, | |
| "kl": 1.13134765625, | |
| "learning_rate": 1.7894736842105264e-05, | |
| "loss": 0.0601, | |
| "reward": 0.7003348544239998, | |
| "reward_std": 0.5002335086464882, | |
| "rewards/accuracy_reward": 0.046875002793967724, | |
| "rewards/format_reward": 0.07589286053553224, | |
| "rewards/tag_count_reward": 0.577566996216774, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 507.6719093322754, | |
| "epoch": 0.09333333333333334, | |
| "grad_norm": 7.823866367340088, | |
| "kl": 1.912109375, | |
| "learning_rate": 1.8421052631578947e-05, | |
| "loss": -0.0674, | |
| "reward": 0.6406250298023224, | |
| "reward_std": 0.5031706914305687, | |
| "rewards/accuracy_reward": 0.02232142980210483, | |
| "rewards/format_reward": 0.08928571757860482, | |
| "rewards/tag_count_reward": 0.5290178842842579, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 355.6808223724365, | |
| "epoch": 0.096, | |
| "grad_norm": 4.270777225494385, | |
| "kl": 1.470703125, | |
| "learning_rate": 1.894736842105263e-05, | |
| "loss": -0.2041, | |
| "reward": 0.6618303954601288, | |
| "reward_std": 0.5147153101861477, | |
| "rewards/accuracy_reward": 0.026785714784637094, | |
| "rewards/format_reward": 0.10044643189758062, | |
| "rewards/tag_count_reward": 0.5345982424914837, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 317.5245666503906, | |
| "epoch": 0.09866666666666667, | |
| "grad_norm": 4.440794467926025, | |
| "kl": 1.41943359375, | |
| "learning_rate": 1.9473684210526318e-05, | |
| "loss": -0.1957, | |
| "reward": 0.758370578289032, | |
| "reward_std": 0.5318198576569557, | |
| "rewards/accuracy_reward": 0.01562500069849193, | |
| "rewards/format_reward": 0.1361607201397419, | |
| "rewards/tag_count_reward": 0.6065848544239998, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 293.32813835144043, | |
| "epoch": 0.10133333333333333, | |
| "grad_norm": 1485.8994140625, | |
| "kl": 12.12353515625, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1922, | |
| "reward": 1.1914063096046448, | |
| "reward_std": 0.6419440135359764, | |
| "rewards/accuracy_reward": 0.015625000931322575, | |
| "rewards/format_reward": 0.5312500298023224, | |
| "rewards/tag_count_reward": 0.6445312723517418, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 209.48884773254395, | |
| "epoch": 0.104, | |
| "grad_norm": 990314.5, | |
| "kl": 6982.73046875, | |
| "learning_rate": 1.999956548296958e-05, | |
| "loss": 215.6623, | |
| "reward": 0.6473214477300644, | |
| "reward_std": 0.5485238395631313, | |
| "rewards/accuracy_reward": 0.017857143888249993, | |
| "rewards/format_reward": 0.13169643469154835, | |
| "rewards/tag_count_reward": 0.4977678842842579, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 172.20759868621826, | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 9381907.0, | |
| "kl": 130954.7265625, | |
| "learning_rate": 1.9998261969639324e-05, | |
| "loss": 5173.1377, | |
| "reward": 0.4804687649011612, | |
| "reward_std": 0.39905556850135326, | |
| "rewards/accuracy_reward": 0.04464286030270159, | |
| "rewards/format_reward": 0.049107144586741924, | |
| "rewards/tag_count_reward": 0.3867187649011612, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 208.4575958251953, | |
| "epoch": 0.10933333333333334, | |
| "grad_norm": 174.25518798828125, | |
| "kl": 8.283203125, | |
| "learning_rate": 1.9996089573288985e-05, | |
| "loss": -0.2933, | |
| "reward": 0.5055803842842579, | |
| "reward_std": 0.4746779501438141, | |
| "rewards/accuracy_reward": 0.0669642889406532, | |
| "rewards/format_reward": 0.04687500232830644, | |
| "rewards/tag_count_reward": 0.3917410895228386, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 123.5022382736206, | |
| "epoch": 0.112, | |
| "grad_norm": 135551.15625, | |
| "kl": 210.92578125, | |
| "learning_rate": 1.99930484827072e-05, | |
| "loss": 11.072, | |
| "reward": 0.3387276977300644, | |
| "reward_std": 0.37313414365053177, | |
| "rewards/accuracy_reward": 0.02901785890571773, | |
| "rewards/format_reward": 0.026785715715959668, | |
| "rewards/tag_count_reward": 0.282924123108387, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 228.55134963989258, | |
| "epoch": 0.11466666666666667, | |
| "grad_norm": 366.1275939941406, | |
| "kl": 15.40625, | |
| "learning_rate": 1.9989138962175105e-05, | |
| "loss": -0.214, | |
| "reward": 0.4938616268336773, | |
| "reward_std": 0.5711234211921692, | |
| "rewards/accuracy_reward": 0.04464286006987095, | |
| "rewards/format_reward": 0.13392857741564512, | |
| "rewards/tag_count_reward": 0.3152901902794838, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 247.72992134094238, | |
| "epoch": 0.11733333333333333, | |
| "grad_norm": 53.45908737182617, | |
| "kl": 10.09765625, | |
| "learning_rate": 1.9984361351443343e-05, | |
| "loss": -0.4247, | |
| "reward": 0.5357142984867096, | |
| "reward_std": 0.6442215740680695, | |
| "rewards/accuracy_reward": 0.07142857532016933, | |
| "rewards/format_reward": 0.15625000558793545, | |
| "rewards/tag_count_reward": 0.3080357313156128, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 267.9419746398926, | |
| "epoch": 0.12, | |
| "grad_norm": 8.117420196533203, | |
| "kl": 4.306640625, | |
| "learning_rate": 1.9978716065702566e-05, | |
| "loss": -0.3767, | |
| "reward": 0.5859375260770321, | |
| "reward_std": 0.5742851197719574, | |
| "rewards/accuracy_reward": 0.04017857322469354, | |
| "rewards/format_reward": 0.1718750074505806, | |
| "rewards/tag_count_reward": 0.3738839440047741, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 327.2009162902832, | |
| "epoch": 0.12266666666666666, | |
| "grad_norm": 390668.9375, | |
| "kl": 652.412109375, | |
| "learning_rate": 1.9972203595547334e-05, | |
| "loss": 27.4051, | |
| "reward": 0.820870578289032, | |
| "reward_std": 0.7370434999465942, | |
| "rewards/accuracy_reward": 0.05357143119908869, | |
| "rewards/format_reward": 0.3504464440047741, | |
| "rewards/tag_count_reward": 0.4168526977300644, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 308.95314025878906, | |
| "epoch": 0.12533333333333332, | |
| "grad_norm": 3.3901305198669434, | |
| "kl": 1.37255859375, | |
| "learning_rate": 1.996482450693348e-05, | |
| "loss": -0.235, | |
| "reward": 0.9084821864962578, | |
| "reward_std": 0.7239489033818245, | |
| "rewards/accuracy_reward": 0.04241071594879031, | |
| "rewards/format_reward": 0.3526785932481289, | |
| "rewards/tag_count_reward": 0.5133928842842579, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 269.2343864440918, | |
| "epoch": 0.128, | |
| "grad_norm": 13.403223037719727, | |
| "kl": 0.79248046875, | |
| "learning_rate": 1.9956579441128942e-05, | |
| "loss": -0.1234, | |
| "reward": 0.627790205180645, | |
| "reward_std": 0.3243283350020647, | |
| "rewards/accuracy_reward": 0.040178572526201606, | |
| "rewards/format_reward": 0.017857144121080637, | |
| "rewards/tag_count_reward": 0.5697544887661934, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 244.00894165039062, | |
| "epoch": 0.13066666666666665, | |
| "grad_norm": 1.7707425355911255, | |
| "kl": 0.734130859375, | |
| "learning_rate": 1.994746911465802e-05, | |
| "loss": -0.0886, | |
| "reward": 0.530133955180645, | |
| "reward_std": 0.24073401279747486, | |
| "rewards/accuracy_reward": 0.01562500069849193, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "rewards/tag_count_reward": 0.5122768133878708, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 277.7120609283447, | |
| "epoch": 0.13333333333333333, | |
| "grad_norm": 1.7800672054290771, | |
| "kl": 1.8212890625, | |
| "learning_rate": 1.9937494319239112e-05, | |
| "loss": -0.1572, | |
| "reward": 0.5022321678698063, | |
| "reward_std": 0.2955008540302515, | |
| "rewards/accuracy_reward": 0.011160715017467737, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "rewards/tag_count_reward": 0.4888393059372902, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 295.51340675354004, | |
| "epoch": 0.136, | |
| "grad_norm": 10.020119667053223, | |
| "kl": 5.904296875, | |
| "learning_rate": 1.9926655921715924e-05, | |
| "loss": -0.3163, | |
| "reward": 0.3733259104192257, | |
| "reward_std": 0.2695033699274063, | |
| "rewards/accuracy_reward": 0.026785715948790312, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3465401940047741, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 441.823673248291, | |
| "epoch": 0.13866666666666666, | |
| "grad_norm": 2.06915283203125, | |
| "kl": 2.1552734375, | |
| "learning_rate": 1.9914954863982106e-05, | |
| "loss": -0.3148, | |
| "reward": 0.2756696492433548, | |
| "reward_std": 0.17585041373968124, | |
| "rewards/accuracy_reward": 0.008928571827709675, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.266741082072258, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 938.0960235595703, | |
| "epoch": 0.14133333333333334, | |
| "grad_norm": 0.31667816638946533, | |
| "kl": 0.596923828125, | |
| "learning_rate": 1.990239216289944e-05, | |
| "loss": -0.1824, | |
| "reward": 0.23493304289877415, | |
| "reward_std": 0.07539755944162607, | |
| "rewards/accuracy_reward": 0.004464285913854837, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.23046875931322575, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 930.7031631469727, | |
| "epoch": 0.144, | |
| "grad_norm": 0.257773756980896, | |
| "kl": 0.6866455078125, | |
| "learning_rate": 1.9888968910209433e-05, | |
| "loss": -0.1836, | |
| "reward": 0.22935268841683865, | |
| "reward_std": 0.062135092448443174, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.22935268841683865, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 959.5089569091797, | |
| "epoch": 0.14666666666666667, | |
| "grad_norm": 0.38278838992118835, | |
| "kl": 0.61932373046875, | |
| "learning_rate": 1.9874686272438467e-05, | |
| "loss": -0.1345, | |
| "reward": 0.23437500931322575, | |
| "reward_std": 0.04414202296175063, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.23437500931322575, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 989.2812881469727, | |
| "epoch": 0.14933333333333335, | |
| "grad_norm": 5.235660552978516, | |
| "kl": 0.56573486328125, | |
| "learning_rate": 1.9859545490796414e-05, | |
| "loss": -0.0736, | |
| "reward": 0.23772322572767735, | |
| "reward_std": 0.04692125436849892, | |
| "rewards/accuracy_reward": 0.0022321429569274187, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.235491082072258, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 984.7567443847656, | |
| "epoch": 0.152, | |
| "grad_norm": 0.7513731718063354, | |
| "kl": 1.2568359375, | |
| "learning_rate": 1.9843547881068763e-05, | |
| "loss": -0.0667, | |
| "reward": 0.23493304289877415, | |
| "reward_std": 0.0440019175875932, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.23493304289877415, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1004.8973541259766, | |
| "epoch": 0.15466666666666667, | |
| "grad_norm": 2176257.75, | |
| "kl": 24204.224578857422, | |
| "learning_rate": 1.9826694833502295e-05, | |
| "loss": 964.5377, | |
| "reward": 0.23995536752045155, | |
| "reward_std": 0.0476194906514138, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.23995536752045155, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1013.3593978881836, | |
| "epoch": 0.15733333333333333, | |
| "grad_norm": 3.9350333213806152, | |
| "kl": 6.3824462890625, | |
| "learning_rate": 1.9808987812684247e-05, | |
| "loss": -0.0163, | |
| "reward": 0.239397332072258, | |
| "reward_std": 0.04491220973432064, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.239397332072258, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1002.2120819091797, | |
| "epoch": 0.16, | |
| "grad_norm": 10.338529586791992, | |
| "kl": 2.3958740234375, | |
| "learning_rate": 1.979042835741503e-05, | |
| "loss": -0.0206, | |
| "reward": 0.2265625111758709, | |
| "reward_std": 0.07814983604475856, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2265625111758709, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1009.8348388671875, | |
| "epoch": 0.16266666666666665, | |
| "grad_norm": 0.21856586635112762, | |
| "kl": 0.22381591796875, | |
| "learning_rate": 1.9771018080574534e-05, | |
| "loss": -0.0197, | |
| "reward": 0.22767858020961285, | |
| "reward_std": 0.058380599366500974, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.22767858020961285, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1009.4732360839844, | |
| "epoch": 0.16533333333333333, | |
| "grad_norm": 0.38580521941185, | |
| "kl": 0.169189453125, | |
| "learning_rate": 1.9750758668981925e-05, | |
| "loss": -0.0383, | |
| "reward": 0.24051340483129025, | |
| "reward_std": 0.03663408872671425, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.24051340483129025, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 979.4420013427734, | |
| "epoch": 0.168, | |
| "grad_norm": 0.6577736735343933, | |
| "kl": 0.9158935546875, | |
| "learning_rate": 1.9729651883249075e-05, | |
| "loss": -0.0914, | |
| "reward": 0.2399553656578064, | |
| "reward_std": 0.03222780209034681, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2399553656578064, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1006.5223541259766, | |
| "epoch": 0.17066666666666666, | |
| "grad_norm": 10.438541412353516, | |
| "kl": 1.66058349609375, | |
| "learning_rate": 1.9707699557627554e-05, | |
| "loss": -0.005, | |
| "reward": 0.2460937574505806, | |
| "reward_std": 0.014615848893299699, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2460937574505806, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 996.0647659301758, | |
| "epoch": 0.17333333333333334, | |
| "grad_norm": 0.5891804099082947, | |
| "kl": 0.95574951171875, | |
| "learning_rate": 1.968490359984923e-05, | |
| "loss": -0.0622, | |
| "reward": 0.2421875074505806, | |
| "reward_std": 0.024542340775951743, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2421875074505806, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 996.9174499511719, | |
| "epoch": 0.176, | |
| "grad_norm": 0.47327500581741333, | |
| "kl": 0.97607421875, | |
| "learning_rate": 1.9661265990960486e-05, | |
| "loss": -0.0374, | |
| "reward": 0.24386161379516125, | |
| "reward_std": 0.021628810092806816, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.24386161379516125, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1009.5178833007812, | |
| "epoch": 0.17866666666666667, | |
| "grad_norm": 6.1433186531066895, | |
| "kl": 15.404144287109375, | |
| "learning_rate": 1.9636788785150037e-05, | |
| "loss": -0.0088, | |
| "reward": 0.24609375558793545, | |
| "reward_std": 0.013276896439492702, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.24609375558793545, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.4620666503906, | |
| "epoch": 0.18133333333333335, | |
| "grad_norm": 1.1267215013504028, | |
| "kl": 0.652923583984375, | |
| "learning_rate": 1.9611474109570446e-05, | |
| "loss": -0.007, | |
| "reward": 0.2483258955180645, | |
| "reward_std": 0.006263935239985585, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2483258955180645, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.7433166503906, | |
| "epoch": 0.184, | |
| "grad_norm": 0.2426103949546814, | |
| "kl": 0.08197021484375, | |
| "learning_rate": 1.9585324164153236e-05, | |
| "loss": -0.0126, | |
| "reward": 0.2477678582072258, | |
| "reward_std": 0.008351913653314114, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2477678582072258, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.9888534545898, | |
| "epoch": 0.18666666666666668, | |
| "grad_norm": 0.12586605548858643, | |
| "kl": 0.143280029296875, | |
| "learning_rate": 1.9558341221417744e-05, | |
| "loss": -0.0217, | |
| "reward": 0.2472098283469677, | |
| "reward_std": 0.009100939147174358, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2472098283469677, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.0803756713867, | |
| "epoch": 0.18933333333333333, | |
| "grad_norm": 0.38826289772987366, | |
| "kl": 0.079498291015625, | |
| "learning_rate": 1.9530527626273592e-05, | |
| "loss": -0.0119, | |
| "reward": 0.246651791036129, | |
| "reward_std": 0.01252787047997117, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.246651791036129, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.8348388671875, | |
| "epoch": 0.192, | |
| "grad_norm": 615.1087036132812, | |
| "kl": 32.953125, | |
| "learning_rate": 1.9501885795816937e-05, | |
| "loss": 1.3001, | |
| "reward": 0.246651791036129, | |
| "reward_std": 0.01252787047997117, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.246651791036129, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.616081237793, | |
| "epoch": 0.19466666666666665, | |
| "grad_norm": 0.04497808218002319, | |
| "kl": 0.08001708984375, | |
| "learning_rate": 1.9472418219120403e-05, | |
| "loss": -0.0054, | |
| "reward": 0.2494419664144516, | |
| "reward_std": 0.0020879784133285284, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2494419664144516, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.8951110839844, | |
| "epoch": 0.19733333333333333, | |
| "grad_norm": 2.6043214797973633, | |
| "kl": 0.47564697265625, | |
| "learning_rate": 1.9442127457016768e-05, | |
| "loss": -0.0041, | |
| "reward": 0.24720982648432255, | |
| "reward_std": 0.010439892299473286, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.24720982648432255, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1001.9643249511719, | |
| "epoch": 0.2, | |
| "grad_norm": 0.7235760688781738, | |
| "kl": 0.35595703125, | |
| "learning_rate": 1.9411016141876438e-05, | |
| "loss": -0.0072, | |
| "reward": 0.2460937574505806, | |
| "reward_std": 0.01745285326614976, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2460937574505806, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1001.7567443847656, | |
| "epoch": 0.20266666666666666, | |
| "grad_norm": 11.015027046203613, | |
| "kl": 0.2857666015625, | |
| "learning_rate": 1.9379086977378664e-05, | |
| "loss": -0.0117, | |
| "reward": 0.2466517947614193, | |
| "reward_std": 0.015364875085651875, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2466517947614193, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 980.8705825805664, | |
| "epoch": 0.20533333333333334, | |
| "grad_norm": 0.5128747224807739, | |
| "kl": 0.2799072265625, | |
| "learning_rate": 1.9346342738276593e-05, | |
| "loss": -0.0305, | |
| "reward": 0.24665179289877415, | |
| "reward_std": 0.032296012388542295, | |
| "rewards/accuracy_reward": 0.0022321429569274187, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.24441965110599995, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 942.6451416015625, | |
| "epoch": 0.208, | |
| "grad_norm": 8.115665435791016, | |
| "kl": 1.64599609375, | |
| "learning_rate": 1.9312786270156135e-05, | |
| "loss": 0.0306, | |
| "reward": 0.24497768841683865, | |
| "reward_std": 0.03677184786647558, | |
| "rewards/accuracy_reward": 0.0022321429569274187, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.24274554289877415, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 947.0089797973633, | |
| "epoch": 0.21066666666666667, | |
| "grad_norm": 86.71783447265625, | |
| "kl": 10.751220703125, | |
| "learning_rate": 1.927842048918867e-05, | |
| "loss": 0.3956, | |
| "reward": 0.2527901902794838, | |
| "reward_std": 0.041396952932700515, | |
| "rewards/accuracy_reward": 0.004464285913854837, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2483259029686451, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 976.4687881469727, | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 0.8117868304252625, | |
| "kl": 1.150146484375, | |
| "learning_rate": 1.9243248381877605e-05, | |
| "loss": 0.0132, | |
| "reward": 0.2594866156578064, | |
| "reward_std": 0.061963471584022045, | |
| "rewards/accuracy_reward": 0.011160714784637094, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2483258992433548, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 999.3281707763672, | |
| "epoch": 0.216, | |
| "grad_norm": 0.5710753798484802, | |
| "kl": 0.2978515625, | |
| "learning_rate": 1.9207273004798873e-05, | |
| "loss": -0.0116, | |
| "reward": 0.25279018841683865, | |
| "reward_std": 0.04546203720383346, | |
| "rewards/accuracy_reward": 0.004464285913854837, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.24832589738070965, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1006.6428985595703, | |
| "epoch": 0.21866666666666668, | |
| "grad_norm": 0.4451937973499298, | |
| "kl": 0.3206787109375, | |
| "learning_rate": 1.9170497484335276e-05, | |
| "loss": -0.0188, | |
| "reward": 0.2695312611758709, | |
| "reward_std": 0.09501239191740751, | |
| "rewards/accuracy_reward": 0.01785714365541935, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2516741156578064, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1012.850471496582, | |
| "epoch": 0.22133333333333333, | |
| "grad_norm": 0.6105583310127258, | |
| "kl": 0.556640625, | |
| "learning_rate": 1.9132925016404805e-05, | |
| "loss": 0.0129, | |
| "reward": 0.3136160857975483, | |
| "reward_std": 0.1892830766737461, | |
| "rewards/accuracy_reward": 0.0491071455180645, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2645089440047741, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1011.8192291259766, | |
| "epoch": 0.224, | |
| "grad_norm": 0.4263147711753845, | |
| "kl": 1.17919921875, | |
| "learning_rate": 1.9094558866182892e-05, | |
| "loss": 0.0313, | |
| "reward": 0.4051339440047741, | |
| "reward_std": 0.2580750435590744, | |
| "rewards/accuracy_reward": 0.11830357857979834, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2868303693830967, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 994.6518173217773, | |
| "epoch": 0.22666666666666666, | |
| "grad_norm": 4.379977226257324, | |
| "kl": 2.775390625, | |
| "learning_rate": 1.9055402367818673e-05, | |
| "loss": 0.0242, | |
| "reward": 0.4547991268336773, | |
| "reward_std": 0.2754308069124818, | |
| "rewards/accuracy_reward": 0.17633929336443543, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2784598357975483, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 981.888427734375, | |
| "epoch": 0.22933333333333333, | |
| "grad_norm": 0.5411613583564758, | |
| "kl": 1.14501953125, | |
| "learning_rate": 1.901545892414523e-05, | |
| "loss": 0.0135, | |
| "reward": 0.474888414144516, | |
| "reward_std": 0.298506336286664, | |
| "rewards/accuracy_reward": 0.18973215203732252, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2851562649011612, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 979.1473693847656, | |
| "epoch": 0.232, | |
| "grad_norm": 0.31463608145713806, | |
| "kl": 1.203125, | |
| "learning_rate": 1.897473200638386e-05, | |
| "loss": -0.0059, | |
| "reward": 0.4665178768336773, | |
| "reward_std": 0.2942663496360183, | |
| "rewards/accuracy_reward": 0.19866072945296764, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2678571566939354, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1000.0179061889648, | |
| "epoch": 0.23466666666666666, | |
| "grad_norm": 0.2776876986026764, | |
| "kl": 0.96484375, | |
| "learning_rate": 1.8933225153842446e-05, | |
| "loss": 0.0193, | |
| "reward": 0.4268973432481289, | |
| "reward_std": 0.25522872246801853, | |
| "rewards/accuracy_reward": 0.16517858393490314, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2617187574505806, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 996.5000305175781, | |
| "epoch": 0.23733333333333334, | |
| "grad_norm": 0.5454741716384888, | |
| "kl": 1.11767578125, | |
| "learning_rate": 1.8890941973607843e-05, | |
| "loss": 0.0167, | |
| "reward": 0.4676339514553547, | |
| "reward_std": 0.2707557659596205, | |
| "rewards/accuracy_reward": 0.22544644214212894, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.2421875074505806, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 996.5893325805664, | |
| "epoch": 0.24, | |
| "grad_norm": 0.45834293961524963, | |
| "kl": 1.054443359375, | |
| "learning_rate": 1.8847886140232438e-05, | |
| "loss": 0.0215, | |
| "reward": 0.3158482275903225, | |
| "reward_std": 0.2969865184277296, | |
| "rewards/accuracy_reward": 0.14732143469154835, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.16852679289877415, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 927.3036041259766, | |
| "epoch": 0.24266666666666667, | |
| "grad_norm": 2.235508918762207, | |
| "kl": 2.0927734375, | |
| "learning_rate": 1.8804061395414795e-05, | |
| "loss": 0.0831, | |
| "reward": 0.2265625149011612, | |
| "reward_std": 0.2394270822405815, | |
| "rewards/accuracy_reward": 0.07812500232830644, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1484375074505806, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 954.2366485595703, | |
| "epoch": 0.24533333333333332, | |
| "grad_norm": 0.3937546908855438, | |
| "kl": 1.47265625, | |
| "learning_rate": 1.875947154767452e-05, | |
| "loss": 0.0661, | |
| "reward": 0.19140626024454832, | |
| "reward_std": 0.23799890838563442, | |
| "rewards/accuracy_reward": 0.06250000419095159, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.12890625651925802, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1003.2478103637695, | |
| "epoch": 0.248, | |
| "grad_norm": 0.41460925340652466, | |
| "kl": 1.29296875, | |
| "learning_rate": 1.8714120472021252e-05, | |
| "loss": 0.0501, | |
| "reward": 0.23828126303851604, | |
| "reward_std": 0.28448878042399883, | |
| "rewards/accuracy_reward": 0.09821428917348385, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1400669701397419, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 995.8393096923828, | |
| "epoch": 0.25066666666666665, | |
| "grad_norm": 0.31394830346107483, | |
| "kl": 1.2431640625, | |
| "learning_rate": 1.8668012109617933e-05, | |
| "loss": 0.0441, | |
| "reward": 0.25558036752045155, | |
| "reward_std": 0.28702029772102833, | |
| "rewards/accuracy_reward": 0.10491071850992739, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1506696492433548, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1013.6808471679688, | |
| "epoch": 0.25333333333333335, | |
| "grad_norm": 0.26716697216033936, | |
| "kl": 1.11474609375, | |
| "learning_rate": 1.862115046743831e-05, | |
| "loss": 0.039, | |
| "reward": 0.30747769959270954, | |
| "reward_std": 0.3026493303477764, | |
| "rewards/accuracy_reward": 0.12053572107106447, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.1869419738650322, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.1518173217773, | |
| "epoch": 0.256, | |
| "grad_norm": 0.28958389163017273, | |
| "kl": 1.10400390625, | |
| "learning_rate": 1.85735396179187e-05, | |
| "loss": 0.0416, | |
| "reward": 0.4603794813156128, | |
| "reward_std": 0.3755842447280884, | |
| "rewards/accuracy_reward": 0.20312500931322575, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.25725447572767735, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1013.7857437133789, | |
| "epoch": 0.25866666666666666, | |
| "grad_norm": 0.2677743434906006, | |
| "kl": 1.009765625, | |
| "learning_rate": 1.8525183698604098e-05, | |
| "loss": 0.0371, | |
| "reward": 0.4949777014553547, | |
| "reward_std": 0.3382803946733475, | |
| "rewards/accuracy_reward": 0.18303572293370962, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3119419813156128, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1012.8259201049805, | |
| "epoch": 0.2613333333333333, | |
| "grad_norm": 0.2341116964817047, | |
| "kl": 0.853759765625, | |
| "learning_rate": 1.8476086911788588e-05, | |
| "loss": 0.0321, | |
| "reward": 0.510044664144516, | |
| "reward_std": 0.2946113534271717, | |
| "rewards/accuracy_reward": 0.1651785783469677, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3448660857975483, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1005.520133972168, | |
| "epoch": 0.264, | |
| "grad_norm": 0.4626868665218353, | |
| "kl": 1.1552734375, | |
| "learning_rate": 1.8426253524150176e-05, | |
| "loss": 0.0344, | |
| "reward": 0.5691964514553547, | |
| "reward_std": 0.34905775636434555, | |
| "rewards/accuracy_reward": 0.21205358253791928, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.357142873108387, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1007.6183471679688, | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 0.4349505603313446, | |
| "kl": 0.7802734375, | |
| "learning_rate": 1.8375687866379988e-05, | |
| "loss": 0.0271, | |
| "reward": 0.5998884215950966, | |
| "reward_std": 0.33171170204877853, | |
| "rewards/accuracy_reward": 0.22544644167646766, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "rewards/tag_count_reward": 0.3722098395228386, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1005.4777221679688, | |
| "epoch": 0.2693333333333333, | |
| "grad_norm": 0.4926176071166992, | |
| "kl": 0.667236328125, | |
| "learning_rate": 1.8324394332805913e-05, | |
| "loss": 0.0161, | |
| "reward": 0.6283482350409031, | |
| "reward_std": 0.330145962536335, | |
| "rewards/accuracy_reward": 0.24776786658912897, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3805803768336773, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1007.0223617553711, | |
| "epoch": 0.272, | |
| "grad_norm": 0.5684264898300171, | |
| "kl": 0.4881591796875, | |
| "learning_rate": 1.8272377381010726e-05, | |
| "loss": 0.0033, | |
| "reward": 0.6183036044239998, | |
| "reward_std": 0.32388063333928585, | |
| "rewards/accuracy_reward": 0.2500000111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3683035932481289, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1007.1875457763672, | |
| "epoch": 0.27466666666666667, | |
| "grad_norm": 4.337119102478027, | |
| "kl": 0.94384765625, | |
| "learning_rate": 1.8219641531444713e-05, | |
| "loss": 0.031, | |
| "reward": 0.7594866454601288, | |
| "reward_std": 0.3420899845659733, | |
| "rewards/accuracy_reward": 0.3883928768336773, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3710937723517418, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1001.1563034057617, | |
| "epoch": 0.2773333333333333, | |
| "grad_norm": 0.5998754501342773, | |
| "kl": 0.375, | |
| "learning_rate": 1.8166191367032828e-05, | |
| "loss": 0.0018, | |
| "reward": 0.6891741305589676, | |
| "reward_std": 0.38426094129681587, | |
| "rewards/accuracy_reward": 0.3147321566939354, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "rewards/tag_count_reward": 0.3722098357975483, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.3370895385742, | |
| "epoch": 0.28, | |
| "grad_norm": 0.2921030521392822, | |
| "kl": 0.419189453125, | |
| "learning_rate": 1.811203153277641e-05, | |
| "loss": 0.0097, | |
| "reward": 0.8236607536673546, | |
| "reward_std": 0.32553235441446304, | |
| "rewards/accuracy_reward": 0.4330357424914837, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3906250149011612, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 994.6964645385742, | |
| "epoch": 0.2826666666666667, | |
| "grad_norm": 3.2729032039642334, | |
| "kl": 0.99658203125, | |
| "learning_rate": 1.8057166735349533e-05, | |
| "loss": 0.0083, | |
| "reward": 0.7879464700818062, | |
| "reward_std": 0.33463743701577187, | |
| "rewards/accuracy_reward": 0.3906250149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3973214440047741, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 983.4241485595703, | |
| "epoch": 0.2853333333333333, | |
| "grad_norm": 1.0045377016067505, | |
| "kl": 0.7274169921875, | |
| "learning_rate": 1.800160174268996e-05, | |
| "loss": -0.0159, | |
| "reward": 0.6489955708384514, | |
| "reward_std": 0.3681374154984951, | |
| "rewards/accuracy_reward": 0.25446429289877415, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3945312686264515, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 986.0915603637695, | |
| "epoch": 0.288, | |
| "grad_norm": 0.5037855505943298, | |
| "kl": 0.580078125, | |
| "learning_rate": 1.7945341383584818e-05, | |
| "loss": -0.0129, | |
| "reward": 0.7650670036673546, | |
| "reward_std": 0.39130162820219994, | |
| "rewards/accuracy_reward": 0.35044644586741924, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4146205559372902, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 965.9866561889648, | |
| "epoch": 0.2906666666666667, | |
| "grad_norm": 1.3588380813598633, | |
| "kl": 1.8997802734375, | |
| "learning_rate": 1.7888390547250944e-05, | |
| "loss": 0.032, | |
| "reward": 0.7505580708384514, | |
| "reward_std": 0.3513164669275284, | |
| "rewards/accuracy_reward": 0.3325893059372902, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4179687649011612, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 946.988883972168, | |
| "epoch": 0.29333333333333333, | |
| "grad_norm": 0.42907217144966125, | |
| "kl": 1.02099609375, | |
| "learning_rate": 1.7830754182909985e-05, | |
| "loss": -0.002, | |
| "reward": 0.8191964626312256, | |
| "reward_std": 0.3892271090298891, | |
| "rewards/accuracy_reward": 0.39732144633308053, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4218750186264515, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 908.0603103637695, | |
| "epoch": 0.296, | |
| "grad_norm": 0.554644763469696, | |
| "kl": 1.03662109375, | |
| "learning_rate": 1.7772437299358324e-05, | |
| "loss": -0.0007, | |
| "reward": 0.7488839663565159, | |
| "reward_std": 0.34201290644705296, | |
| "rewards/accuracy_reward": 0.3102678705472499, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4386160932481289, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 912.4442443847656, | |
| "epoch": 0.2986666666666667, | |
| "grad_norm": 0.5132027268409729, | |
| "kl": 1.64404296875, | |
| "learning_rate": 1.771344496453177e-05, | |
| "loss": 0.0156, | |
| "reward": 0.7243304029107094, | |
| "reward_std": 0.36312241293489933, | |
| "rewards/accuracy_reward": 0.28348215762525797, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4408482313156128, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 946.7210235595703, | |
| "epoch": 0.30133333333333334, | |
| "grad_norm": 2.337268114089966, | |
| "kl": 4.0595703125, | |
| "learning_rate": 1.7653782305065158e-05, | |
| "loss": 0.0725, | |
| "reward": 0.7773437798023224, | |
| "reward_std": 0.35461460426449776, | |
| "rewards/accuracy_reward": 0.2991071557626128, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4782366268336773, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 971.0312957763672, | |
| "epoch": 0.304, | |
| "grad_norm": 0.5845390558242798, | |
| "kl": 2.72265625, | |
| "learning_rate": 1.7593454505846807e-05, | |
| "loss": 0.0799, | |
| "reward": 0.781808078289032, | |
| "reward_std": 0.3098542857915163, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4693080484867096, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 999.4821701049805, | |
| "epoch": 0.30666666666666664, | |
| "grad_norm": 0.36925044655799866, | |
| "kl": 1.072998046875, | |
| "learning_rate": 1.753246680956795e-05, | |
| "loss": 0.0012, | |
| "reward": 0.6941964626312256, | |
| "reward_std": 0.3417879194021225, | |
| "rewards/accuracy_reward": 0.27455358393490314, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4196428768336773, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 997.1049423217773, | |
| "epoch": 0.30933333333333335, | |
| "grad_norm": 0.7683995366096497, | |
| "kl": 0.99810791015625, | |
| "learning_rate": 1.7470824516267125e-05, | |
| "loss": -0.0258, | |
| "reward": 0.5870536006987095, | |
| "reward_std": 0.3048571478575468, | |
| "rewards/accuracy_reward": 0.16741072060540318, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4196428805589676, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1004.1183319091797, | |
| "epoch": 0.312, | |
| "grad_norm": 0.6834697127342224, | |
| "kl": 0.769775390625, | |
| "learning_rate": 1.7408532982869573e-05, | |
| "loss": -0.0129, | |
| "reward": 0.6238839514553547, | |
| "reward_std": 0.32993016950786114, | |
| "rewards/accuracy_reward": 0.18973215110599995, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4341518059372902, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 856.0893325805664, | |
| "epoch": 0.31466666666666665, | |
| "grad_norm": 18.79123306274414, | |
| "kl": 7.33984375, | |
| "learning_rate": 1.7345597622721727e-05, | |
| "loss": 0.1389, | |
| "reward": 0.5736607387661934, | |
| "reward_std": 0.3069217577576637, | |
| "rewards/accuracy_reward": 0.12723214901052415, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4464285895228386, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 857.9777145385742, | |
| "epoch": 0.31733333333333336, | |
| "grad_norm": 6.959547996520996, | |
| "kl": 5.00390625, | |
| "learning_rate": 1.7282023905120743e-05, | |
| "loss": 0.0227, | |
| "reward": 0.4944196678698063, | |
| "reward_std": 0.23757354356348515, | |
| "rewards/accuracy_reward": 0.05133928917348385, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4430803768336773, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 914.3504867553711, | |
| "epoch": 0.32, | |
| "grad_norm": 2.665106773376465, | |
| "kl": 0.91259765625, | |
| "learning_rate": 1.721781735483921e-05, | |
| "loss": -0.0911, | |
| "reward": 0.5279018022119999, | |
| "reward_std": 0.2325394507497549, | |
| "rewards/accuracy_reward": 0.06026785937137902, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4676339514553547, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 966.8370971679688, | |
| "epoch": 0.32266666666666666, | |
| "grad_norm": 1.2509655952453613, | |
| "kl": 0.35968017578125, | |
| "learning_rate": 1.7152983551645054e-05, | |
| "loss": -0.06, | |
| "reward": 0.5563616342842579, | |
| "reward_std": 0.25420672446489334, | |
| "rewards/accuracy_reward": 0.08705357438884676, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4693080596625805, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 981.8326263427734, | |
| "epoch": 0.3253333333333333, | |
| "grad_norm": 0.5648319125175476, | |
| "kl": 0.5328369140625, | |
| "learning_rate": 1.708752812981659e-05, | |
| "loss": -0.0362, | |
| "reward": 0.5742187760770321, | |
| "reward_std": 0.26508820056915283, | |
| "rewards/accuracy_reward": 0.10491071850992739, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4693080522119999, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 959.6272735595703, | |
| "epoch": 0.328, | |
| "grad_norm": 10.74463939666748, | |
| "kl": 1.951416015625, | |
| "learning_rate": 1.702145677765293e-05, | |
| "loss": -0.0225, | |
| "reward": 0.6300223544239998, | |
| "reward_std": 0.28988964296877384, | |
| "rewards/accuracy_reward": 0.13616071711294353, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4938616305589676, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 980.8192443847656, | |
| "epoch": 0.33066666666666666, | |
| "grad_norm": 4.069360256195068, | |
| "kl": 1.291015625, | |
| "learning_rate": 1.6954775236979616e-05, | |
| "loss": -0.0099, | |
| "reward": 0.5474330633878708, | |
| "reward_std": 0.2495187409222126, | |
| "rewards/accuracy_reward": 0.06473214668221772, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.482700914144516, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 950.1986999511719, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 8.897069931030273, | |
| "kl": 2.5947265625, | |
| "learning_rate": 1.6887489302649657e-05, | |
| "loss": 0.0202, | |
| "reward": 0.6049107387661934, | |
| "reward_std": 0.31144498474895954, | |
| "rewards/accuracy_reward": 0.12500000605359674, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4799107350409031, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 987.8058395385742, | |
| "epoch": 0.336, | |
| "grad_norm": 2.9771108627319336, | |
| "kl": 1.3916015625, | |
| "learning_rate": 1.6819604822039924e-05, | |
| "loss": 0.0017, | |
| "reward": 0.5251116268336773, | |
| "reward_std": 0.21684774663299322, | |
| "rewards/accuracy_reward": 0.04464285937137902, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4804687723517418, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 982.4085311889648, | |
| "epoch": 0.33866666666666667, | |
| "grad_norm": 1.1636186838150024, | |
| "kl": 1.030029296875, | |
| "learning_rate": 1.6751127694543012e-05, | |
| "loss": -0.02, | |
| "reward": 0.5340402089059353, | |
| "reward_std": 0.21295001544058323, | |
| "rewards/accuracy_reward": 0.04687500186264515, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.487165205180645, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 980.8795166015625, | |
| "epoch": 0.3413333333333333, | |
| "grad_norm": 1.9692587852478027, | |
| "kl": 9.68798828125, | |
| "learning_rate": 1.6682063871054534e-05, | |
| "loss": -0.0215, | |
| "reward": 0.5200893133878708, | |
| "reward_std": 0.2094257827848196, | |
| "rewards/accuracy_reward": 0.037946430733427405, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4821428768336773, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1005.0111923217773, | |
| "epoch": 0.344, | |
| "grad_norm": 1.1740566492080688, | |
| "kl": 0.3739013671875, | |
| "learning_rate": 1.661241935345599e-05, | |
| "loss": -0.0071, | |
| "reward": 0.6735491380095482, | |
| "reward_std": 0.2998756691813469, | |
| "rewards/accuracy_reward": 0.16517857555299997, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5083705559372902, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 991.6607360839844, | |
| "epoch": 0.3466666666666667, | |
| "grad_norm": 0.4999859929084778, | |
| "kl": 0.33416748046875, | |
| "learning_rate": 1.654220019409317e-05, | |
| "loss": -0.0413, | |
| "reward": 0.6049107499420643, | |
| "reward_std": 0.21885671466588974, | |
| "rewards/accuracy_reward": 0.10044643492437899, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5044643096625805, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1006.4777069091797, | |
| "epoch": 0.34933333333333333, | |
| "grad_norm": 0.31324487924575806, | |
| "kl": 0.33984375, | |
| "learning_rate": 1.6471412495250195e-05, | |
| "loss": -0.0189, | |
| "reward": 0.580915205180645, | |
| "reward_std": 0.22965830191969872, | |
| "rewards/accuracy_reward": 0.07589285913854837, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5050223432481289, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.9040222167969, | |
| "epoch": 0.352, | |
| "grad_norm": 0.3059490919113159, | |
| "kl": 0.1947021484375, | |
| "learning_rate": 1.640006240861921e-05, | |
| "loss": 0.0013, | |
| "reward": 0.6244419887661934, | |
| "reward_std": 0.23875875398516655, | |
| "rewards/accuracy_reward": 0.10937500582076609, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5150669887661934, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1007.4710083007812, | |
| "epoch": 0.3546666666666667, | |
| "grad_norm": 0.9267669916152954, | |
| "kl": 0.306396484375, | |
| "learning_rate": 1.632815613476576e-05, | |
| "loss": -0.0092, | |
| "reward": 0.6741071715950966, | |
| "reward_std": 0.3328205347061157, | |
| "rewards/accuracy_reward": 0.19419643934816122, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4799107313156128, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 746.9375305175781, | |
| "epoch": 0.35733333333333334, | |
| "grad_norm": 6.944976329803467, | |
| "kl": 1.9111328125, | |
| "learning_rate": 1.6255699922589968e-05, | |
| "loss": -0.0158, | |
| "reward": 0.5117187686264515, | |
| "reward_std": 0.33955446630716324, | |
| "rewards/accuracy_reward": 0.14732143585570157, | |
| "rewards/format_reward": 0.004464285913854837, | |
| "rewards/tag_count_reward": 0.3599330522119999, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 688.2656555175781, | |
| "epoch": 0.36, | |
| "grad_norm": 4.551051139831543, | |
| "kl": 1.47119140625, | |
| "learning_rate": 1.6182700068783463e-05, | |
| "loss": 0.074, | |
| "reward": 0.4464285969734192, | |
| "reward_std": 0.32823895290493965, | |
| "rewards/accuracy_reward": 0.10714286146685481, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "rewards/tag_count_reward": 0.3370535857975483, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 932.7768249511719, | |
| "epoch": 0.3626666666666667, | |
| "grad_norm": 0.7402526140213013, | |
| "kl": 0.3092041015625, | |
| "learning_rate": 1.610916291728218e-05, | |
| "loss": 0.015, | |
| "reward": 0.6032366417348385, | |
| "reward_std": 0.4040101356804371, | |
| "rewards/accuracy_reward": 0.19866072572767735, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4045759104192257, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 916.5134353637695, | |
| "epoch": 0.36533333333333334, | |
| "grad_norm": 0.6370754837989807, | |
| "kl": 0.3028564453125, | |
| "learning_rate": 1.6035094858715065e-05, | |
| "loss": -0.0375, | |
| "reward": 0.7421875298023224, | |
| "reward_std": 0.3760041519999504, | |
| "rewards/accuracy_reward": 0.2968750149011612, | |
| "rewards/format_reward": 0.006696428870782256, | |
| "rewards/tag_count_reward": 0.4386160969734192, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 890.5982513427734, | |
| "epoch": 0.368, | |
| "grad_norm": 0.6831709742546082, | |
| "kl": 0.2908935546875, | |
| "learning_rate": 1.5960502329848683e-05, | |
| "loss": -0.0319, | |
| "reward": 0.8917411044239998, | |
| "reward_std": 0.4364234544336796, | |
| "rewards/accuracy_reward": 0.39508930407464504, | |
| "rewards/format_reward": 0.024553572293370962, | |
| "rewards/tag_count_reward": 0.4720982387661934, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 804.1205749511719, | |
| "epoch": 0.37066666666666664, | |
| "grad_norm": 2.1319992542266846, | |
| "kl": 0.320068359375, | |
| "learning_rate": 1.588539181302786e-05, | |
| "loss": 0.0253, | |
| "reward": 0.7901785969734192, | |
| "reward_std": 0.4901719093322754, | |
| "rewards/accuracy_reward": 0.24776787287555635, | |
| "rewards/format_reward": 0.04017857415601611, | |
| "rewards/tag_count_reward": 0.5022321678698063, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 786.3839645385742, | |
| "epoch": 0.37333333333333335, | |
| "grad_norm": 2.234715700149536, | |
| "kl": 0.26708984375, | |
| "learning_rate": 1.580976983561235e-05, | |
| "loss": -0.0489, | |
| "reward": 0.8493303880095482, | |
| "reward_std": 0.43167896941304207, | |
| "rewards/accuracy_reward": 0.3013392947614193, | |
| "rewards/format_reward": 0.008928571827709675, | |
| "rewards/tag_count_reward": 0.5390625074505806, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 816.1518325805664, | |
| "epoch": 0.376, | |
| "grad_norm": 2.293884038925171, | |
| "kl": 0.3765869140625, | |
| "learning_rate": 1.5733642969409553e-05, | |
| "loss": -0.0492, | |
| "reward": 0.7857143208384514, | |
| "reward_std": 0.39178847149014473, | |
| "rewards/accuracy_reward": 0.26116072526201606, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "rewards/tag_count_reward": 0.522321455180645, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 867.8973693847656, | |
| "epoch": 0.37866666666666665, | |
| "grad_norm": 2.1033823490142822, | |
| "kl": 0.519775390625, | |
| "learning_rate": 1.5657017830103448e-05, | |
| "loss": -0.0506, | |
| "reward": 0.7667411044239998, | |
| "reward_std": 0.34294718876481056, | |
| "rewards/accuracy_reward": 0.227678582072258, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5390625223517418, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 819.8661117553711, | |
| "epoch": 0.38133333333333336, | |
| "grad_norm": 5.280317306518555, | |
| "kl": 0.734130859375, | |
| "learning_rate": 1.5579901076679625e-05, | |
| "loss": -0.0823, | |
| "reward": 0.89620541036129, | |
| "reward_std": 0.4438219405710697, | |
| "rewards/accuracy_reward": 0.39955358766019344, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4966518133878708, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 853.4085235595703, | |
| "epoch": 0.384, | |
| "grad_norm": 6.651998519897461, | |
| "kl": 1.65087890625, | |
| "learning_rate": 1.5502299410846626e-05, | |
| "loss": -0.071, | |
| "reward": 0.7762277126312256, | |
| "reward_std": 0.3538948893547058, | |
| "rewards/accuracy_reward": 0.22098215576261282, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5552455745637417, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 825.1585235595703, | |
| "epoch": 0.38666666666666666, | |
| "grad_norm": 45.774906158447266, | |
| "kl": 7.41796875, | |
| "learning_rate": 1.5424219576453526e-05, | |
| "loss": 0.0825, | |
| "reward": 0.7812500298023224, | |
| "reward_std": 0.35702329128980637, | |
| "rewards/accuracy_reward": 0.2410714402794838, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5401785969734192, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 843.4263687133789, | |
| "epoch": 0.3893333333333333, | |
| "grad_norm": 10.040907859802246, | |
| "kl": 3.3037109375, | |
| "learning_rate": 1.5345668358903886e-05, | |
| "loss": -0.0851, | |
| "reward": 0.786272369325161, | |
| "reward_std": 0.3718103840947151, | |
| "rewards/accuracy_reward": 0.2120535783469677, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5742187723517418, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 788.7656555175781, | |
| "epoch": 0.392, | |
| "grad_norm": 3.9412412643432617, | |
| "kl": 1.4208984375, | |
| "learning_rate": 1.5266652584566056e-05, | |
| "loss": -0.1977, | |
| "reward": 0.7784598544239998, | |
| "reward_std": 0.39768509939312935, | |
| "rewards/accuracy_reward": 0.23883929662406445, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5396205633878708, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 778.2053909301758, | |
| "epoch": 0.39466666666666667, | |
| "grad_norm": 2.3464877605438232, | |
| "kl": 1.646484375, | |
| "learning_rate": 1.5187179120179969e-05, | |
| "loss": -0.2273, | |
| "reward": 0.7354910895228386, | |
| "reward_std": 0.42870184034109116, | |
| "rewards/accuracy_reward": 0.18973215110599995, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.545758955180645, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 763.8794937133789, | |
| "epoch": 0.3973333333333333, | |
| "grad_norm": 1.7182663679122925, | |
| "kl": 1.0068359375, | |
| "learning_rate": 1.5107254872260366e-05, | |
| "loss": -0.2177, | |
| "reward": 0.8147321864962578, | |
| "reward_std": 0.4319792427122593, | |
| "rewards/accuracy_reward": 0.28571429662406445, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5290178842842579, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 848.1295013427734, | |
| "epoch": 0.4, | |
| "grad_norm": 1.2562867403030396, | |
| "kl": 0.520263671875, | |
| "learning_rate": 1.5026886786496624e-05, | |
| "loss": -0.1592, | |
| "reward": 0.9051339626312256, | |
| "reward_std": 0.38791827112436295, | |
| "rewards/accuracy_reward": 0.3348214477300644, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5703125223517418, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 835.6518173217773, | |
| "epoch": 0.4026666666666667, | |
| "grad_norm": 0.8011857867240906, | |
| "kl": 0.93359375, | |
| "learning_rate": 1.4946081847149134e-05, | |
| "loss": -0.171, | |
| "reward": 0.926339328289032, | |
| "reward_std": 0.43096619471907616, | |
| "rewards/accuracy_reward": 0.37276787124574184, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5535714626312256, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 866.0201263427734, | |
| "epoch": 0.4053333333333333, | |
| "grad_norm": 2.4044995307922363, | |
| "kl": 2.04150390625, | |
| "learning_rate": 1.4864847076442358e-05, | |
| "loss": -0.1295, | |
| "reward": 0.9380580633878708, | |
| "reward_std": 0.3936535269021988, | |
| "rewards/accuracy_reward": 0.35714287497103214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.580915205180645, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 850.8460159301758, | |
| "epoch": 0.408, | |
| "grad_norm": 3.49764084815979, | |
| "kl": 2.35498046875, | |
| "learning_rate": 1.4783189533954555e-05, | |
| "loss": -0.1393, | |
| "reward": 0.8465402200818062, | |
| "reward_std": 0.36915882118046284, | |
| "rewards/accuracy_reward": 0.2656250116415322, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.580915205180645, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 844.9888763427734, | |
| "epoch": 0.4106666666666667, | |
| "grad_norm": 0.9199227690696716, | |
| "kl": 0.605712890625, | |
| "learning_rate": 1.4701116316004307e-05, | |
| "loss": -0.1487, | |
| "reward": 0.847656287252903, | |
| "reward_std": 0.41640398278832436, | |
| "rewards/accuracy_reward": 0.25446430034935474, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5931919738650322, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 894.4620895385742, | |
| "epoch": 0.41333333333333333, | |
| "grad_norm": 0.9275841116905212, | |
| "kl": 0.5400390625, | |
| "learning_rate": 1.46186345550338e-05, | |
| "loss": -0.1346, | |
| "reward": 0.851562537252903, | |
| "reward_std": 0.40566281601786613, | |
| "rewards/accuracy_reward": 0.24330358393490314, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.608258955180645, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 778.1741333007812, | |
| "epoch": 0.416, | |
| "grad_norm": 1.4222042560577393, | |
| "kl": 1.9130859375, | |
| "learning_rate": 1.4535751418989e-05, | |
| "loss": -0.1921, | |
| "reward": 0.694196455180645, | |
| "reward_std": 0.39460423216223717, | |
| "rewards/accuracy_reward": 0.15401786495931447, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5401785969734192, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 691.5089569091797, | |
| "epoch": 0.4186666666666667, | |
| "grad_norm": 2.3009650707244873, | |
| "kl": 2.8173828125, | |
| "learning_rate": 1.4452474110696738e-05, | |
| "loss": -0.1628, | |
| "reward": 0.5496652014553547, | |
| "reward_std": 0.3982691466808319, | |
| "rewards/accuracy_reward": 0.10491072060540318, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4447544813156128, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 749.8437805175781, | |
| "epoch": 0.42133333333333334, | |
| "grad_norm": 1.4146252870559692, | |
| "kl": 2.4423828125, | |
| "learning_rate": 1.4368809867238754e-05, | |
| "loss": -0.139, | |
| "reward": 0.4933035932481289, | |
| "reward_std": 0.3439667113125324, | |
| "rewards/accuracy_reward": 0.07366071688011289, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.419642873108387, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 777.3147735595703, | |
| "epoch": 0.424, | |
| "grad_norm": 1.5294604301452637, | |
| "kl": 2.662109375, | |
| "learning_rate": 1.4284765959322772e-05, | |
| "loss": -0.0536, | |
| "reward": 0.4246651977300644, | |
| "reward_std": 0.28294636122882366, | |
| "rewards/accuracy_reward": 0.037946430733427405, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3867187649011612, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 840.5290603637695, | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 1.1588491201400757, | |
| "kl": 2.80078125, | |
| "learning_rate": 1.4200349690650654e-05, | |
| "loss": -0.0394, | |
| "reward": 0.4810268096625805, | |
| "reward_std": 0.33573491498827934, | |
| "rewards/accuracy_reward": 0.06919643329456449, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4118303768336773, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 892.8437881469727, | |
| "epoch": 0.42933333333333334, | |
| "grad_norm": 1.1430574655532837, | |
| "kl": 2.9453125, | |
| "learning_rate": 1.411556839728367e-05, | |
| "loss": 0.0025, | |
| "reward": 0.5619419924914837, | |
| "reward_std": 0.360417190939188, | |
| "rewards/accuracy_reward": 0.0915178598370403, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4704241268336773, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 945.9308395385742, | |
| "epoch": 0.432, | |
| "grad_norm": 2.161261796951294, | |
| "kl": 3.875, | |
| "learning_rate": 1.4030429447004992e-05, | |
| "loss": 0.0688, | |
| "reward": 0.6428571678698063, | |
| "reward_std": 0.3830233383923769, | |
| "rewards/accuracy_reward": 0.12053572200238705, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5223214514553547, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 992.6518249511719, | |
| "epoch": 0.43466666666666665, | |
| "grad_norm": 1.7473344802856445, | |
| "kl": 3.423828125, | |
| "learning_rate": 1.3944940238679384e-05, | |
| "loss": 0.0891, | |
| "reward": 0.6618303805589676, | |
| "reward_std": 0.2981398981064558, | |
| "rewards/accuracy_reward": 0.06250000395812094, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5993303805589676, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1013.8393096923828, | |
| "epoch": 0.43733333333333335, | |
| "grad_norm": 7.500110149383545, | |
| "kl": 6.62255859375, | |
| "learning_rate": 1.3859108201610236e-05, | |
| "loss": 0.254, | |
| "reward": 0.729352705180645, | |
| "reward_std": 0.35707739368081093, | |
| "rewards/accuracy_reward": 0.1294642894063145, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5998884215950966, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1014.9397506713867, | |
| "epoch": 0.44, | |
| "grad_norm": 5.974610805511475, | |
| "kl": 4.771484375, | |
| "learning_rate": 1.3772940794893916e-05, | |
| "loss": 0.1753, | |
| "reward": 0.7025669887661934, | |
| "reward_std": 0.322284035384655, | |
| "rewards/accuracy_reward": 0.09821429080329835, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6043526902794838, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.9397506713867, | |
| "epoch": 0.44266666666666665, | |
| "grad_norm": 7.7188496589660645, | |
| "kl": 3.6796875, | |
| "learning_rate": 1.368644550677157e-05, | |
| "loss": 0.1398, | |
| "reward": 0.7522321790456772, | |
| "reward_std": 0.3287883847951889, | |
| "rewards/accuracy_reward": 0.1183035746216774, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6339285969734192, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.819206237793, | |
| "epoch": 0.44533333333333336, | |
| "grad_norm": 15.867236137390137, | |
| "kl": 8.28125, | |
| "learning_rate": 1.3599629853978342e-05, | |
| "loss": 0.3249, | |
| "reward": 0.7204241380095482, | |
| "reward_std": 0.32472028210759163, | |
| "rewards/accuracy_reward": 0.10491072037257254, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6155134215950966, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.0424194335938, | |
| "epoch": 0.448, | |
| "grad_norm": 6.8363213539123535, | |
| "kl": 4.9951171875, | |
| "learning_rate": 1.3512501381090158e-05, | |
| "loss": 0.1957, | |
| "reward": 0.6914062798023224, | |
| "reward_std": 0.3072166871279478, | |
| "rewards/accuracy_reward": 0.0848214307334274, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6065848544239998, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1016.553596496582, | |
| "epoch": 0.45066666666666666, | |
| "grad_norm": 3.5362603664398193, | |
| "kl": 6.8984375, | |
| "learning_rate": 1.3425067659868084e-05, | |
| "loss": 0.2479, | |
| "reward": 0.6785714477300644, | |
| "reward_std": 0.3066155780106783, | |
| "rewards/accuracy_reward": 0.07142857532016933, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6071428805589676, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1011.7210006713867, | |
| "epoch": 0.4533333333333333, | |
| "grad_norm": 10.573477745056152, | |
| "kl": 3.390625, | |
| "learning_rate": 1.3337336288600297e-05, | |
| "loss": 0.1204, | |
| "reward": 0.7215402126312256, | |
| "reward_std": 0.33272249065339565, | |
| "rewards/accuracy_reward": 0.11160714761354029, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6099330708384514, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.7209930419922, | |
| "epoch": 0.456, | |
| "grad_norm": 8.912219047546387, | |
| "kl": 6.275390625, | |
| "learning_rate": 1.324931489144178e-05, | |
| "loss": 0.248, | |
| "reward": 0.816406287252903, | |
| "reward_std": 0.3113563619554043, | |
| "rewards/accuracy_reward": 0.16294643562287092, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6534598544239998, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.9620819091797, | |
| "epoch": 0.45866666666666667, | |
| "grad_norm": 5.703995704650879, | |
| "kl": 3.966796875, | |
| "learning_rate": 1.3161011117751756e-05, | |
| "loss": 0.1519, | |
| "reward": 0.7667411044239998, | |
| "reward_std": 0.31835605204105377, | |
| "rewards/accuracy_reward": 0.12723214970901608, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6395089477300644, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.3928680419922, | |
| "epoch": 0.4613333333333333, | |
| "grad_norm": 8.292807579040527, | |
| "kl": 8.216796875, | |
| "learning_rate": 1.3072432641428931e-05, | |
| "loss": 0.1965, | |
| "reward": 0.7946428954601288, | |
| "reward_std": 0.3237072564661503, | |
| "rewards/accuracy_reward": 0.12946429196745157, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6651785969734192, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1022.4241104125977, | |
| "epoch": 0.464, | |
| "grad_norm": 5.680503845214844, | |
| "kl": 2.45263671875, | |
| "learning_rate": 1.2983587160244602e-05, | |
| "loss": 0.0982, | |
| "reward": 0.7505580708384514, | |
| "reward_std": 0.24723401945084333, | |
| "rewards/accuracy_reward": 0.08035714528523386, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6702009215950966, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.0937576293945, | |
| "epoch": 0.4666666666666667, | |
| "grad_norm": 2.7311763763427734, | |
| "kl": 2.314453125, | |
| "learning_rate": 1.2894482395173695e-05, | |
| "loss": 0.0906, | |
| "reward": 0.8197545111179352, | |
| "reward_std": 0.3316057715564966, | |
| "rewards/accuracy_reward": 0.1450892926659435, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.674665205180645, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.8437652587891, | |
| "epoch": 0.4693333333333333, | |
| "grad_norm": 7.489380836486816, | |
| "kl": 3.44140625, | |
| "learning_rate": 1.2805126089723798e-05, | |
| "loss": 0.1294, | |
| "reward": 0.8493303954601288, | |
| "reward_std": 0.35581264086067677, | |
| "rewards/accuracy_reward": 0.1741071492433548, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6752232536673546, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.472, | |
| "grad_norm": 2.636575937271118, | |
| "kl": 2.30322265625, | |
| "learning_rate": 1.2715526009262209e-05, | |
| "loss": 0.0922, | |
| "reward": 0.7790178805589676, | |
| "reward_std": 0.2882107999175787, | |
| "rewards/accuracy_reward": 0.10714286286383867, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6718750298023224, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.0401840209961, | |
| "epoch": 0.4746666666666667, | |
| "grad_norm": 4.1205291748046875, | |
| "kl": 0.9132080078125, | |
| "learning_rate": 1.2625689940341102e-05, | |
| "loss": 0.0322, | |
| "reward": 0.8643973618745804, | |
| "reward_std": 0.3784267157316208, | |
| "rewards/accuracy_reward": 0.20535715203732252, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.659040205180645, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1022.0803756713867, | |
| "epoch": 0.47733333333333333, | |
| "grad_norm": 4.4023823738098145, | |
| "kl": 0.935791015625, | |
| "learning_rate": 1.2535625690020861e-05, | |
| "loss": 0.0334, | |
| "reward": 0.8593750521540642, | |
| "reward_std": 0.3589657451957464, | |
| "rewards/accuracy_reward": 0.20982143888249993, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6495535969734192, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.48, | |
| "grad_norm": 2.203847885131836, | |
| "kl": 1.26123046875, | |
| "learning_rate": 1.24453410851916e-05, | |
| "loss": 0.0504, | |
| "reward": 0.825334869325161, | |
| "reward_std": 0.3454053979367018, | |
| "rewards/accuracy_reward": 0.17857143841683865, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6467634066939354, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.4826666666666667, | |
| "grad_norm": 0.9205332398414612, | |
| "kl": 0.9808349609375, | |
| "learning_rate": 1.2354843971892998e-05, | |
| "loss": 0.0393, | |
| "reward": 0.8794643357396126, | |
| "reward_std": 0.3468211852014065, | |
| "rewards/accuracy_reward": 0.2142857238650322, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6651786118745804, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.48533333333333334, | |
| "grad_norm": 1.2223032712936401, | |
| "kl": 0.931640625, | |
| "learning_rate": 1.2264142214632441e-05, | |
| "loss": 0.0373, | |
| "reward": 0.8850446715950966, | |
| "reward_std": 0.36373014003038406, | |
| "rewards/accuracy_reward": 0.2142857275903225, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6707589626312256, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.488, | |
| "grad_norm": 0.24835845828056335, | |
| "kl": 0.541015625, | |
| "learning_rate": 1.2173243695701575e-05, | |
| "loss": 0.0217, | |
| "reward": 0.981026828289032, | |
| "reward_std": 0.37699259258806705, | |
| "rewards/accuracy_reward": 0.2924107275903225, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6886161044239998, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8236694335938, | |
| "epoch": 0.49066666666666664, | |
| "grad_norm": 7.40866756439209, | |
| "kl": 0.44091796875, | |
| "learning_rate": 1.2082156314491298e-05, | |
| "loss": 0.0111, | |
| "reward": 0.8722098544239998, | |
| "reward_std": 0.2936495263129473, | |
| "rewards/accuracy_reward": 0.17857143515720963, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6936384290456772, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1022.1406326293945, | |
| "epoch": 0.49333333333333335, | |
| "grad_norm": 0.5567572712898254, | |
| "kl": 0.3126220703125, | |
| "learning_rate": 1.1990887986805295e-05, | |
| "loss": 0.0092, | |
| "reward": 0.9202009439468384, | |
| "reward_std": 0.38105889968574047, | |
| "rewards/accuracy_reward": 0.22991072572767735, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6902902126312256, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8236694335938, | |
| "epoch": 0.496, | |
| "grad_norm": 0.5544495582580566, | |
| "kl": 0.29736328125, | |
| "learning_rate": 1.1899446644172106e-05, | |
| "loss": 0.0076, | |
| "reward": 0.9854911267757416, | |
| "reward_std": 0.37811920419335365, | |
| "rewards/accuracy_reward": 0.290178582072258, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6953125298023224, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.49866666666666665, | |
| "grad_norm": 0.2723735570907593, | |
| "kl": 0.351318359375, | |
| "learning_rate": 1.1807840233155863e-05, | |
| "loss": 0.0141, | |
| "reward": 0.9441964626312256, | |
| "reward_std": 0.40811292454600334, | |
| "rewards/accuracy_reward": 0.29017858766019344, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6540178880095482, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8482208251953, | |
| "epoch": 0.5013333333333333, | |
| "grad_norm": 0.5237597823143005, | |
| "kl": 0.347900390625, | |
| "learning_rate": 1.1716076714665701e-05, | |
| "loss": 0.0104, | |
| "reward": 0.832031287252903, | |
| "reward_std": 0.3854901008307934, | |
| "rewards/accuracy_reward": 0.20312500838190317, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6289062723517418, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.504, | |
| "grad_norm": 0.7326854467391968, | |
| "kl": 0.361572265625, | |
| "learning_rate": 1.1624164063263931e-05, | |
| "loss": 0.0145, | |
| "reward": 0.8487723544239998, | |
| "reward_std": 0.40539926290512085, | |
| "rewards/accuracy_reward": 0.23437501792795956, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6143973395228386, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8415222167969, | |
| "epoch": 0.5066666666666667, | |
| "grad_norm": 0.1683388352394104, | |
| "kl": 0.231689453125, | |
| "learning_rate": 1.1532110266473026e-05, | |
| "loss": 0.0062, | |
| "reward": 0.9581473767757416, | |
| "reward_std": 0.41959446854889393, | |
| "rewards/accuracy_reward": 0.28571429708972573, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6724330633878708, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5093333333333333, | |
| "grad_norm": 0.4431297481060028, | |
| "kl": 0.2105712890625, | |
| "learning_rate": 1.1439923324081465e-05, | |
| "loss": 0.0084, | |
| "reward": 0.9871652275323868, | |
| "reward_std": 0.39653555303812027, | |
| "rewards/accuracy_reward": 0.3080357275903225, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.679129496216774, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.512, | |
| "grad_norm": 0.293804407119751, | |
| "kl": 0.230224609375, | |
| "learning_rate": 1.1347611247448544e-05, | |
| "loss": 0.0092, | |
| "reward": 0.9916295036673546, | |
| "reward_std": 0.41540490463376045, | |
| "rewards/accuracy_reward": 0.34821430034935474, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.643415205180645, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5146666666666667, | |
| "grad_norm": 0.5157522559165955, | |
| "kl": 0.388671875, | |
| "learning_rate": 1.1255182058808143e-05, | |
| "loss": 0.0155, | |
| "reward": 0.8850446864962578, | |
| "reward_std": 0.3409024402499199, | |
| "rewards/accuracy_reward": 0.28125001676380634, | |
| "rewards/format_reward": 0.011160714784637094, | |
| "rewards/tag_count_reward": 0.592633955180645, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5173333333333333, | |
| "grad_norm": 2.269129514694214, | |
| "kl": 0.740234375, | |
| "learning_rate": 1.1162643790571574e-05, | |
| "loss": 0.0296, | |
| "reward": 0.934709869325161, | |
| "reward_std": 0.5415398068726063, | |
| "rewards/accuracy_reward": 0.22321429615840316, | |
| "rewards/format_reward": 0.15178571827709675, | |
| "rewards/tag_count_reward": 0.5597098469734192, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1022.5959854125977, | |
| "epoch": 0.52, | |
| "grad_norm": 2.012619972229004, | |
| "kl": 0.505859375, | |
| "learning_rate": 1.1070004484629543e-05, | |
| "loss": 0.0196, | |
| "reward": 0.9972098618745804, | |
| "reward_std": 0.5713630616664886, | |
| "rewards/accuracy_reward": 0.19196429569274187, | |
| "rewards/format_reward": 0.18080358020961285, | |
| "rewards/tag_count_reward": 0.624441996216774, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1022.0870590209961, | |
| "epoch": 0.5226666666666666, | |
| "grad_norm": 2.0236079692840576, | |
| "kl": 0.2779541015625, | |
| "learning_rate": 1.0977272191653272e-05, | |
| "loss": 0.0071, | |
| "reward": 1.0167411044239998, | |
| "reward_std": 0.4556749537587166, | |
| "rewards/accuracy_reward": 0.26339287124574184, | |
| "rewards/format_reward": 0.07366071781143546, | |
| "rewards/tag_count_reward": 0.6796875298023224, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5253333333333333, | |
| "grad_norm": 2.6833431720733643, | |
| "kl": 0.3123779296875, | |
| "learning_rate": 1.0884454970394871e-05, | |
| "loss": 0.0125, | |
| "reward": 1.047433078289032, | |
| "reward_std": 0.45435722172260284, | |
| "rewards/accuracy_reward": 0.2901785857975483, | |
| "rewards/format_reward": 0.06026785960420966, | |
| "rewards/tag_count_reward": 0.6969866380095482, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.2656478881836, | |
| "epoch": 0.528, | |
| "grad_norm": 1.6515158414840698, | |
| "kl": 0.37060546875, | |
| "learning_rate": 1.0791560886987016e-05, | |
| "loss": -0.0019, | |
| "reward": 1.0569196939468384, | |
| "reward_std": 0.4782850816845894, | |
| "rewards/accuracy_reward": 0.26116072689183056, | |
| "rewards/format_reward": 0.06919643143191934, | |
| "rewards/tag_count_reward": 0.7265625223517418, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5306666666666666, | |
| "grad_norm": 0.6246099472045898, | |
| "kl": 0.23095703125, | |
| "learning_rate": 1.069859801424196e-05, | |
| "loss": 0.0092, | |
| "reward": 1.0781250596046448, | |
| "reward_std": 0.46683337539434433, | |
| "rewards/accuracy_reward": 0.24553573061712086, | |
| "rewards/format_reward": 0.07812500419095159, | |
| "rewards/tag_count_reward": 0.7544643133878708, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 0.20519134402275085, | |
| "kl": 0.2835693359375, | |
| "learning_rate": 1.0605574430949983e-05, | |
| "loss": 0.0113, | |
| "reward": 1.1729911118745804, | |
| "reward_std": 0.5779759176075459, | |
| "rewards/accuracy_reward": 0.27678572945296764, | |
| "rewards/format_reward": 0.13392857555299997, | |
| "rewards/tag_count_reward": 0.7622768208384514, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.536, | |
| "grad_norm": 0.5384864807128906, | |
| "kl": 0.3441162109375, | |
| "learning_rate": 1.0512498221177319e-05, | |
| "loss": 0.0138, | |
| "reward": 1.323102742433548, | |
| "reward_std": 0.7179402336478233, | |
| "rewards/accuracy_reward": 0.2812500149011612, | |
| "rewards/format_reward": 0.2656250111758709, | |
| "rewards/tag_count_reward": 0.776227705180645, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.7790222167969, | |
| "epoch": 0.5386666666666666, | |
| "grad_norm": 0.9296804070472717, | |
| "kl": 0.4293212890625, | |
| "learning_rate": 1.0419377473563621e-05, | |
| "loss": 0.0124, | |
| "reward": 1.3621652275323868, | |
| "reward_std": 0.7298394441604614, | |
| "rewards/accuracy_reward": 0.20982143934816122, | |
| "rewards/format_reward": 0.3727678768336773, | |
| "rewards/tag_count_reward": 0.7795759290456772, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5413333333333333, | |
| "grad_norm": 1.218420147895813, | |
| "kl": 0.484619140625, | |
| "learning_rate": 1.0326220280619036e-05, | |
| "loss": 0.0194, | |
| "reward": 1.6858259737491608, | |
| "reward_std": 0.8232333958148956, | |
| "rewards/accuracy_reward": 0.38392858393490314, | |
| "rewards/format_reward": 0.5379464626312256, | |
| "rewards/tag_count_reward": 0.7639509364962578, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.544, | |
| "grad_norm": 0.30742985010147095, | |
| "kl": 0.3109130859375, | |
| "learning_rate": 1.0233034738020933e-05, | |
| "loss": 0.0124, | |
| "reward": 1.7661831378936768, | |
| "reward_std": 0.720308743417263, | |
| "rewards/accuracy_reward": 0.2700892984867096, | |
| "rewards/format_reward": 0.714285746216774, | |
| "rewards/tag_count_reward": 0.7818080708384514, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5466666666666666, | |
| "grad_norm": 0.2768275737762451, | |
| "kl": 0.3094482421875, | |
| "learning_rate": 1.0139828943910358e-05, | |
| "loss": 0.0124, | |
| "reward": 1.6763393431901932, | |
| "reward_std": 0.7501099109649658, | |
| "rewards/accuracy_reward": 0.26116073061712086, | |
| "rewards/format_reward": 0.698660746216774, | |
| "rewards/tag_count_reward": 0.7165178880095482, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5493333333333333, | |
| "grad_norm": 0.7091500759124756, | |
| "kl": 0.293701171875, | |
| "learning_rate": 1.004661099818829e-05, | |
| "loss": 0.0118, | |
| "reward": 1.9626117050647736, | |
| "reward_std": 0.7170767486095428, | |
| "rewards/accuracy_reward": 0.4062500111758709, | |
| "rewards/format_reward": 0.8303571864962578, | |
| "rewards/tag_count_reward": 0.7260045036673546, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.552, | |
| "grad_norm": 1.4856609106063843, | |
| "kl": 0.3824462890625, | |
| "learning_rate": 9.953389001811716e-06, | |
| "loss": 0.0153, | |
| "reward": 1.7338170409202576, | |
| "reward_std": 0.6667964346706867, | |
| "rewards/accuracy_reward": 0.2924107313156128, | |
| "rewards/format_reward": 0.785714328289032, | |
| "rewards/tag_count_reward": 0.655691996216774, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5546666666666666, | |
| "grad_norm": 0.4151708483695984, | |
| "kl": 0.361083984375, | |
| "learning_rate": 9.860171056089646e-06, | |
| "loss": 0.0145, | |
| "reward": 1.8677456378936768, | |
| "reward_std": 0.6513328105211258, | |
| "rewards/accuracy_reward": 0.3906250223517418, | |
| "rewards/format_reward": 0.828125037252903, | |
| "rewards/tag_count_reward": 0.6489955559372902, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1022.0312576293945, | |
| "epoch": 0.5573333333333333, | |
| "grad_norm": 0.8962245583534241, | |
| "kl": 0.468994140625, | |
| "learning_rate": 9.766965261979072e-06, | |
| "loss": 0.0162, | |
| "reward": 1.7784598916769028, | |
| "reward_std": 0.6881442964076996, | |
| "rewards/accuracy_reward": 0.28125000838190317, | |
| "rewards/format_reward": 0.7946429029107094, | |
| "rewards/tag_count_reward": 0.702566996216774, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.3080520629883, | |
| "epoch": 0.56, | |
| "grad_norm": 0.398674875497818, | |
| "kl": 0.547607421875, | |
| "learning_rate": 9.673779719380967e-06, | |
| "loss": 0.0146, | |
| "reward": 1.7405134588479996, | |
| "reward_std": 0.7453675791621208, | |
| "rewards/accuracy_reward": 0.23660715529695153, | |
| "rewards/format_reward": 0.7566964626312256, | |
| "rewards/tag_count_reward": 0.7472098544239998, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.8259048461914, | |
| "epoch": 0.5626666666666666, | |
| "grad_norm": 0.8340033292770386, | |
| "kl": 0.69189453125, | |
| "learning_rate": 9.580622526436382e-06, | |
| "loss": 0.0191, | |
| "reward": 1.5396205931901932, | |
| "reward_std": 0.7650129646062851, | |
| "rewards/accuracy_reward": 0.18973215157166123, | |
| "rewards/format_reward": 0.5781250223517418, | |
| "rewards/tag_count_reward": 0.7717634290456772, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5653333333333334, | |
| "grad_norm": 1.366788387298584, | |
| "kl": 0.626708984375, | |
| "learning_rate": 9.487501778822685e-06, | |
| "loss": 0.0251, | |
| "reward": 1.494977742433548, | |
| "reward_std": 0.7519652545452118, | |
| "rewards/accuracy_reward": 0.13169643515720963, | |
| "rewards/format_reward": 0.5736607387661934, | |
| "rewards/tag_count_reward": 0.7896205708384514, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.568, | |
| "grad_norm": 0.7474933862686157, | |
| "kl": 0.3994140625, | |
| "learning_rate": 9.394425569050018e-06, | |
| "loss": 0.016, | |
| "reward": 1.5239956080913544, | |
| "reward_std": 0.747960276901722, | |
| "rewards/accuracy_reward": 0.11830357694998384, | |
| "rewards/format_reward": 0.589285746216774, | |
| "rewards/tag_count_reward": 0.816406287252903, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5706666666666667, | |
| "grad_norm": 0.7150735259056091, | |
| "kl": 0.3843994140625, | |
| "learning_rate": 9.30140198575804e-06, | |
| "loss": 0.0154, | |
| "reward": 1.7003348916769028, | |
| "reward_std": 0.7763254791498184, | |
| "rewards/accuracy_reward": 0.2366071545984596, | |
| "rewards/format_reward": 0.6651786118745804, | |
| "rewards/tag_count_reward": 0.7985491454601288, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5733333333333334, | |
| "grad_norm": 0.3341580629348755, | |
| "kl": 0.252685546875, | |
| "learning_rate": 9.208439113012984e-06, | |
| "loss": 0.0101, | |
| "reward": 1.6869420260190964, | |
| "reward_std": 0.6856246665120125, | |
| "rewards/accuracy_reward": 0.15401786495931447, | |
| "rewards/format_reward": 0.7209821790456772, | |
| "rewards/tag_count_reward": 0.8119420036673546, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8214340209961, | |
| "epoch": 0.576, | |
| "grad_norm": 0.24437068402767181, | |
| "kl": 0.2630615234375, | |
| "learning_rate": 9.115545029605129e-06, | |
| "loss": 0.0061, | |
| "reward": 1.7594867050647736, | |
| "reward_std": 0.6430581621825695, | |
| "rewards/accuracy_reward": 0.16294643678702414, | |
| "rewards/format_reward": 0.781250037252903, | |
| "rewards/tag_count_reward": 0.8152902200818062, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5786666666666667, | |
| "grad_norm": 0.2341134250164032, | |
| "kl": 0.2742919921875, | |
| "learning_rate": 9.022727808346731e-06, | |
| "loss": 0.011, | |
| "reward": 1.7907367050647736, | |
| "reward_std": 0.6435003951191902, | |
| "rewards/accuracy_reward": 0.212053582072258, | |
| "rewards/format_reward": 0.812500037252903, | |
| "rewards/tag_count_reward": 0.7661830708384514, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.8683090209961, | |
| "epoch": 0.5813333333333334, | |
| "grad_norm": 0.4210372269153595, | |
| "kl": 0.2279052734375, | |
| "learning_rate": 8.92999551537046e-06, | |
| "loss": 0.0089, | |
| "reward": 1.7952009737491608, | |
| "reward_std": 0.6400342211127281, | |
| "rewards/accuracy_reward": 0.20982144260779023, | |
| "rewards/format_reward": 0.837053619325161, | |
| "rewards/tag_count_reward": 0.7483259364962578, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.584, | |
| "grad_norm": 0.24627907574176788, | |
| "kl": 0.1993408203125, | |
| "learning_rate": 8.837356209428428e-06, | |
| "loss": 0.008, | |
| "reward": 1.8950893729925156, | |
| "reward_std": 0.5851474218070507, | |
| "rewards/accuracy_reward": 0.23214286752045155, | |
| "rewards/format_reward": 0.8928571864962578, | |
| "rewards/tag_count_reward": 0.770089328289032, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5866666666666667, | |
| "grad_norm": 0.20180067420005798, | |
| "kl": 0.2174072265625, | |
| "learning_rate": 8.744817941191862e-06, | |
| "loss": 0.0087, | |
| "reward": 2.039620652794838, | |
| "reward_std": 0.630747739225626, | |
| "rewards/accuracy_reward": 0.3214285895228386, | |
| "rewards/format_reward": 0.9062500447034836, | |
| "rewards/tag_count_reward": 0.811941996216774, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5893333333333334, | |
| "grad_norm": 0.5031952261924744, | |
| "kl": 0.2564697265625, | |
| "learning_rate": 8.652388752551458e-06, | |
| "loss": 0.0103, | |
| "reward": 1.864397406578064, | |
| "reward_std": 0.6196173951029778, | |
| "rewards/accuracy_reward": 0.25223215483129025, | |
| "rewards/format_reward": 0.8482143208384514, | |
| "rewards/tag_count_reward": 0.7639509364962578, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.592, | |
| "grad_norm": 0.3535725176334381, | |
| "kl": 0.2275390625, | |
| "learning_rate": 8.560076675918537e-06, | |
| "loss": 0.0091, | |
| "reward": 2.0172992199659348, | |
| "reward_std": 0.5813482627272606, | |
| "rewards/accuracy_reward": 0.27232144260779023, | |
| "rewards/format_reward": 0.899553619325161, | |
| "rewards/tag_count_reward": 0.8454241454601288, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5946666666666667, | |
| "grad_norm": 0.3289671838283539, | |
| "kl": 0.258056640625, | |
| "learning_rate": 8.467889733526977e-06, | |
| "loss": 0.0103, | |
| "reward": 1.983258992433548, | |
| "reward_std": 0.6793729364871979, | |
| "rewards/accuracy_reward": 0.29241072945296764, | |
| "rewards/format_reward": 0.8459821864962578, | |
| "rewards/tag_count_reward": 0.844866119325161, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.5973333333333334, | |
| "grad_norm": 1.2749642133712769, | |
| "kl": 0.409912109375, | |
| "learning_rate": 8.375835936736072e-06, | |
| "loss": 0.0164, | |
| "reward": 1.8945313394069672, | |
| "reward_std": 0.7175267487764359, | |
| "rewards/accuracy_reward": 0.29017858393490314, | |
| "rewards/format_reward": 0.7678571715950966, | |
| "rewards/tag_count_reward": 0.8364955708384514, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.7678604125977, | |
| "epoch": 0.6, | |
| "grad_norm": 3.180739641189575, | |
| "kl": 0.831787109375, | |
| "learning_rate": 8.283923285334304e-06, | |
| "loss": 0.0279, | |
| "reward": 1.8989956080913544, | |
| "reward_std": 0.679141990840435, | |
| "rewards/accuracy_reward": 0.2834821492433548, | |
| "rewards/format_reward": 0.7433036044239998, | |
| "rewards/tag_count_reward": 0.8722098618745804, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6026666666666667, | |
| "grad_norm": 5.543436527252197, | |
| "kl": 0.572998046875, | |
| "learning_rate": 8.19215976684414e-06, | |
| "loss": 0.0229, | |
| "reward": 1.9581474363803864, | |
| "reward_std": 0.7154616340994835, | |
| "rewards/accuracy_reward": 0.32142858812585473, | |
| "rewards/format_reward": 0.750000037252903, | |
| "rewards/tag_count_reward": 0.8867187798023224, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.7366104125977, | |
| "epoch": 0.6053333333333333, | |
| "grad_norm": 6.479448318481445, | |
| "kl": 0.625, | |
| "learning_rate": 8.100553355827897e-06, | |
| "loss": 0.019, | |
| "reward": 1.7622768431901932, | |
| "reward_std": 0.6781650111079216, | |
| "rewards/accuracy_reward": 0.2187500074505806, | |
| "rewards/format_reward": 0.6651786044239998, | |
| "rewards/tag_count_reward": 0.878348246216774, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.608, | |
| "grad_norm": 3.77752685546875, | |
| "kl": 0.58154296875, | |
| "learning_rate": 8.009112013194707e-06, | |
| "loss": 0.0233, | |
| "reward": 1.8616072088479996, | |
| "reward_std": 0.7435284927487373, | |
| "rewards/accuracy_reward": 0.30133930407464504, | |
| "rewards/format_reward": 0.6741071715950966, | |
| "rewards/tag_count_reward": 0.8861607536673546, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6106666666666667, | |
| "grad_norm": 0.44458553194999695, | |
| "kl": 0.579833984375, | |
| "learning_rate": 7.917843685508702e-06, | |
| "loss": 0.0232, | |
| "reward": 1.856584906578064, | |
| "reward_std": 0.7319114580750465, | |
| "rewards/accuracy_reward": 0.33482143841683865, | |
| "rewards/format_reward": 0.651785746216774, | |
| "rewards/tag_count_reward": 0.8699777126312256, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6133333333333333, | |
| "grad_norm": 0.944040060043335, | |
| "kl": 0.654296875, | |
| "learning_rate": 7.826756304298428e-06, | |
| "loss": 0.0262, | |
| "reward": 1.7912947088479996, | |
| "reward_std": 0.7824961915612221, | |
| "rewards/accuracy_reward": 0.25223215483129025, | |
| "rewards/format_reward": 0.683035746216774, | |
| "rewards/tag_count_reward": 0.856026828289032, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.616, | |
| "grad_norm": 0.4724942743778229, | |
| "kl": 0.47802734375, | |
| "learning_rate": 7.73585778536756e-06, | |
| "loss": 0.0191, | |
| "reward": 1.8510045558214188, | |
| "reward_std": 0.6905161440372467, | |
| "rewards/accuracy_reward": 0.21651786752045155, | |
| "rewards/format_reward": 0.7633928880095482, | |
| "rewards/tag_count_reward": 0.8710937947034836, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6186666666666667, | |
| "grad_norm": 0.44556644558906555, | |
| "kl": 0.3642578125, | |
| "learning_rate": 7.645156028107005e-06, | |
| "loss": 0.0146, | |
| "reward": 2.014508992433548, | |
| "reward_std": 0.6820996776223183, | |
| "rewards/accuracy_reward": 0.305803582072258, | |
| "rewards/format_reward": 0.816964328289032, | |
| "rewards/tag_count_reward": 0.8917411044239998, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8459854125977, | |
| "epoch": 0.6213333333333333, | |
| "grad_norm": 0.5146077275276184, | |
| "kl": 0.254638671875, | |
| "learning_rate": 7.554658914808404e-06, | |
| "loss": 0.0068, | |
| "reward": 2.0652903020381927, | |
| "reward_std": 0.613891314715147, | |
| "rewards/accuracy_reward": 0.28571429941803217, | |
| "rewards/format_reward": 0.883928619325161, | |
| "rewards/tag_count_reward": 0.895647369325161, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.624, | |
| "grad_norm": 0.5768357515335083, | |
| "kl": 0.338623046875, | |
| "learning_rate": 7.464374309979143e-06, | |
| "loss": 0.0135, | |
| "reward": 1.9782366752624512, | |
| "reward_std": 0.6160991229116917, | |
| "rewards/accuracy_reward": 0.24553572665899992, | |
| "rewards/format_reward": 0.8816964700818062, | |
| "rewards/tag_count_reward": 0.8510045111179352, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6266666666666667, | |
| "grad_norm": 2.275315284729004, | |
| "kl": 0.61083984375, | |
| "learning_rate": 7.3743100596589e-06, | |
| "loss": 0.0244, | |
| "reward": 1.903459906578064, | |
| "reward_std": 0.6063342429697514, | |
| "rewards/accuracy_reward": 0.22544643888249993, | |
| "rewards/format_reward": 0.883928619325161, | |
| "rewards/tag_count_reward": 0.7940848544239998, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6293333333333333, | |
| "grad_norm": 1.4082753658294678, | |
| "kl": 0.58154296875, | |
| "learning_rate": 7.284473990737795e-06, | |
| "loss": 0.0233, | |
| "reward": 1.9514510035514832, | |
| "reward_std": 0.5934183858335018, | |
| "rewards/accuracy_reward": 0.28348215972073376, | |
| "rewards/format_reward": 0.8906250298023224, | |
| "rewards/tag_count_reward": 0.777343787252903, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.7500076293945, | |
| "epoch": 0.632, | |
| "grad_norm": 0.34938082098960876, | |
| "kl": 1.23388671875, | |
| "learning_rate": 7.194873910276205e-06, | |
| "loss": 0.0051, | |
| "reward": 2.1512277871370316, | |
| "reward_std": 0.5031805112957954, | |
| "rewards/accuracy_reward": 0.38169644703157246, | |
| "rewards/format_reward": 0.9397321864962578, | |
| "rewards/tag_count_reward": 0.8297991529107094, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8906402587891, | |
| "epoch": 0.6346666666666667, | |
| "grad_norm": 0.22738490998744965, | |
| "kl": 0.2371826171875, | |
| "learning_rate": 7.1055176048263085e-06, | |
| "loss": 0.0062, | |
| "reward": 2.0831474363803864, | |
| "reward_std": 0.47590644657611847, | |
| "rewards/accuracy_reward": 0.28794643841683865, | |
| "rewards/format_reward": 0.9241071715950966, | |
| "rewards/tag_count_reward": 0.871093787252903, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8191986083984, | |
| "epoch": 0.6373333333333333, | |
| "grad_norm": 0.2740417718887329, | |
| "kl": 0.2435302734375, | |
| "learning_rate": 7.0164128397554e-06, | |
| "loss": 0.0016, | |
| "reward": 2.166294753551483, | |
| "reward_std": 0.5260685943067074, | |
| "rewards/accuracy_reward": 0.3303571566939354, | |
| "rewards/format_reward": 0.9419643208384514, | |
| "rewards/tag_count_reward": 0.8939732536673546, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.0602722167969, | |
| "epoch": 0.64, | |
| "grad_norm": 0.8618957996368408, | |
| "kl": 0.253662109375, | |
| "learning_rate": 6.92756735857107e-06, | |
| "loss": 0.0078, | |
| "reward": 2.203125089406967, | |
| "reward_std": 0.4360897056758404, | |
| "rewards/accuracy_reward": 0.325892873108387, | |
| "rewards/format_reward": 0.9486607611179352, | |
| "rewards/tag_count_reward": 0.9285714700818062, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6426666666666667, | |
| "grad_norm": 0.2517678141593933, | |
| "kl": 0.248779296875, | |
| "learning_rate": 6.838988882248243e-06, | |
| "loss": 0.01, | |
| "reward": 2.1992188692092896, | |
| "reward_std": 0.5083508864045143, | |
| "rewards/accuracy_reward": 0.34821430454030633, | |
| "rewards/format_reward": 0.9352678954601288, | |
| "rewards/tag_count_reward": 0.9157366380095482, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6453333333333333, | |
| "grad_norm": 0.24498316645622253, | |
| "kl": 0.2371826171875, | |
| "learning_rate": 6.750685108558221e-06, | |
| "loss": 0.0095, | |
| "reward": 2.289062589406967, | |
| "reward_std": 0.5808268934488297, | |
| "rewards/accuracy_reward": 0.408482164144516, | |
| "rewards/format_reward": 0.9397321790456772, | |
| "rewards/tag_count_reward": 0.9408482685685158, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.648, | |
| "grad_norm": 0.20510031282901764, | |
| "kl": 0.261962890625, | |
| "learning_rate": 6.662663711399705e-06, | |
| "loss": 0.0105, | |
| "reward": 2.2154018729925156, | |
| "reward_std": 0.5156445652246475, | |
| "rewards/accuracy_reward": 0.34375001955777407, | |
| "rewards/format_reward": 0.9263393208384514, | |
| "rewards/tag_count_reward": 0.945312537252903, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6506666666666666, | |
| "grad_norm": 0.18001040816307068, | |
| "kl": 0.2032470703125, | |
| "learning_rate": 6.574932340131917e-06, | |
| "loss": 0.0081, | |
| "reward": 2.252232253551483, | |
| "reward_std": 0.5333615131676197, | |
| "rewards/accuracy_reward": 0.39062501676380634, | |
| "rewards/format_reward": 0.933035746216774, | |
| "rewards/tag_count_reward": 0.9285714775323868, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6533333333333333, | |
| "grad_norm": 0.19406573474407196, | |
| "kl": 0.225341796875, | |
| "learning_rate": 6.487498618909845e-06, | |
| "loss": 0.009, | |
| "reward": 2.214843824505806, | |
| "reward_std": 0.4596148282289505, | |
| "rewards/accuracy_reward": 0.34821430314332247, | |
| "rewards/format_reward": 0.9308036044239998, | |
| "rewards/tag_count_reward": 0.9358259290456772, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.656, | |
| "grad_norm": 0.35295507311820984, | |
| "kl": 0.2305908203125, | |
| "learning_rate": 6.400370146021662e-06, | |
| "loss": 0.0092, | |
| "reward": 2.094866171479225, | |
| "reward_std": 0.5512706525623798, | |
| "rewards/accuracy_reward": 0.2790178656578064, | |
| "rewards/format_reward": 0.9017857536673546, | |
| "rewards/tag_count_reward": 0.9140625447034836, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6586666666666666, | |
| "grad_norm": 0.32935085892677307, | |
| "kl": 0.250732421875, | |
| "learning_rate": 6.313554493228431e-06, | |
| "loss": 0.01, | |
| "reward": 2.193638488650322, | |
| "reward_std": 0.5914541855454445, | |
| "rewards/accuracy_reward": 0.41071429569274187, | |
| "rewards/format_reward": 0.8928571790456772, | |
| "rewards/tag_count_reward": 0.8900670111179352, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8125076293945, | |
| "epoch": 0.6613333333333333, | |
| "grad_norm": 0.3850191533565521, | |
| "kl": 0.268310546875, | |
| "learning_rate": 6.227059205106085e-06, | |
| "loss": 0.0048, | |
| "reward": 2.1065849363803864, | |
| "reward_std": 0.5955651290714741, | |
| "rewards/accuracy_reward": 0.32142858393490314, | |
| "rewards/format_reward": 0.8973214775323868, | |
| "rewards/tag_count_reward": 0.8878348618745804, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.8950958251953, | |
| "epoch": 0.664, | |
| "grad_norm": 1.332704782485962, | |
| "kl": 0.3875732421875, | |
| "learning_rate": 6.14089179838977e-06, | |
| "loss": 0.015, | |
| "reward": 2.0273438841104507, | |
| "reward_std": 0.6821491718292236, | |
| "rewards/accuracy_reward": 0.28571429778821766, | |
| "rewards/format_reward": 0.8549107685685158, | |
| "rewards/tag_count_reward": 0.8867188021540642, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 0.27303817868232727, | |
| "kl": 0.241943359375, | |
| "learning_rate": 6.0550597613206205e-06, | |
| "loss": 0.0097, | |
| "reward": 2.0262277722358704, | |
| "reward_std": 0.5583557672798634, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 0.881696455180645, | |
| "rewards/tag_count_reward": 0.912388451397419, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6693333333333333, | |
| "grad_norm": 0.24417054653167725, | |
| "kl": 0.2457275390625, | |
| "learning_rate": 5.969570552995014e-06, | |
| "loss": 0.0098, | |
| "reward": 2.052455425262451, | |
| "reward_std": 0.6332450993359089, | |
| "rewards/accuracy_reward": 0.29241072619333863, | |
| "rewards/format_reward": 0.8482143357396126, | |
| "rewards/tag_count_reward": 0.91183041036129, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8504486083984, | |
| "epoch": 0.672, | |
| "grad_norm": 0.33021074533462524, | |
| "kl": 0.2567138671875, | |
| "learning_rate": 5.8844316027163315e-06, | |
| "loss": 0.0026, | |
| "reward": 2.1372769325971603, | |
| "reward_std": 0.6047781556844711, | |
| "rewards/accuracy_reward": 0.3236607313156128, | |
| "rewards/format_reward": 0.8772321864962578, | |
| "rewards/tag_count_reward": 0.9363839700818062, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6746666666666666, | |
| "grad_norm": 0.26396986842155457, | |
| "kl": 0.1905517578125, | |
| "learning_rate": 5.799650309349348e-06, | |
| "loss": 0.0077, | |
| "reward": 2.209263503551483, | |
| "reward_std": 0.5806162096560001, | |
| "rewards/accuracy_reward": 0.3794643022119999, | |
| "rewards/format_reward": 0.883928619325161, | |
| "rewards/tag_count_reward": 0.9458705857396126, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6773333333333333, | |
| "grad_norm": 0.20837701857089996, | |
| "kl": 0.2464599609375, | |
| "learning_rate": 5.715234040677229e-06, | |
| "loss": 0.0099, | |
| "reward": 2.099888503551483, | |
| "reward_std": 0.6910686045885086, | |
| "rewards/accuracy_reward": 0.34375001303851604, | |
| "rewards/format_reward": 0.8459821864962578, | |
| "rewards/tag_count_reward": 0.910156287252903, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.68, | |
| "grad_norm": 0.2536359429359436, | |
| "kl": 0.19482421875, | |
| "learning_rate": 5.631190132761247e-06, | |
| "loss": 0.0078, | |
| "reward": 2.228794753551483, | |
| "reward_std": 0.4858120158314705, | |
| "rewards/accuracy_reward": 0.3504464402794838, | |
| "rewards/format_reward": 0.9151786118745804, | |
| "rewards/tag_count_reward": 0.9631696864962578, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6826666666666666, | |
| "grad_norm": 0.2243085354566574, | |
| "kl": 0.208740234375, | |
| "learning_rate": 5.547525889303265e-06, | |
| "loss": 0.0083, | |
| "reward": 2.0228795558214188, | |
| "reward_std": 0.5694718845188618, | |
| "rewards/accuracy_reward": 0.1919642947614193, | |
| "rewards/format_reward": 0.8839286118745804, | |
| "rewards/tag_count_reward": 0.94698666036129, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6853333333333333, | |
| "grad_norm": 0.23991478979587555, | |
| "kl": 0.2109375, | |
| "learning_rate": 5.464248581011002e-06, | |
| "loss": 0.0084, | |
| "reward": 2.0742188543081284, | |
| "reward_std": 0.5562547482550144, | |
| "rewards/accuracy_reward": 0.2522321520373225, | |
| "rewards/format_reward": 0.8883928954601288, | |
| "rewards/tag_count_reward": 0.9335938021540642, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.688, | |
| "grad_norm": 0.24084815382957458, | |
| "kl": 0.1883544921875, | |
| "learning_rate": 5.381365444966205e-06, | |
| "loss": 0.0075, | |
| "reward": 2.194754585623741, | |
| "reward_std": 0.5408528298139572, | |
| "rewards/accuracy_reward": 0.32812501303851604, | |
| "rewards/format_reward": 0.9196428805589676, | |
| "rewards/tag_count_reward": 0.94698666036129, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.6906666666666667, | |
| "grad_norm": 0.2627911865711212, | |
| "kl": 0.2288818359375, | |
| "learning_rate": 5.298883683995697e-06, | |
| "loss": 0.0092, | |
| "reward": 2.180803656578064, | |
| "reward_std": 0.5501891225576401, | |
| "rewards/accuracy_reward": 0.32142858300358057, | |
| "rewards/format_reward": 0.9151786118745804, | |
| "rewards/tag_count_reward": 0.9441964700818062, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.9285736083984, | |
| "epoch": 0.6933333333333334, | |
| "grad_norm": 0.29868605732917786, | |
| "kl": 0.204833984375, | |
| "learning_rate": 5.216810466045448e-06, | |
| "loss": 0.0051, | |
| "reward": 2.1222099363803864, | |
| "reward_std": 0.5263746418058872, | |
| "rewards/accuracy_reward": 0.2745535832364112, | |
| "rewards/format_reward": 0.9174107536673546, | |
| "rewards/tag_count_reward": 0.9302455708384514, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.696, | |
| "grad_norm": 0.2207689732313156, | |
| "kl": 0.232666015625, | |
| "learning_rate": 5.135152923557647e-06, | |
| "loss": 0.0093, | |
| "reward": 2.2232143878936768, | |
| "reward_std": 0.5706557966768742, | |
| "rewards/accuracy_reward": 0.35714287613518536, | |
| "rewards/format_reward": 0.9241071864962578, | |
| "rewards/tag_count_reward": 0.9419643208384514, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.9397354125977, | |
| "epoch": 0.6986666666666667, | |
| "grad_norm": 0.923786997795105, | |
| "kl": 0.3634033203125, | |
| "learning_rate": 5.053918152850868e-06, | |
| "loss": 0.0145, | |
| "reward": 2.0675224363803864, | |
| "reward_std": 0.6228335537016392, | |
| "rewards/accuracy_reward": 0.27678573061712086, | |
| "rewards/format_reward": 0.8883928954601288, | |
| "rewards/tag_count_reward": 0.902343787252903, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.7013333333333334, | |
| "grad_norm": 0.22262142598628998, | |
| "kl": 0.2320556640625, | |
| "learning_rate": 4.973113213503379e-06, | |
| "loss": 0.0094, | |
| "reward": 2.079799249768257, | |
| "reward_std": 0.49952351674437523, | |
| "rewards/accuracy_reward": 0.2566964402794838, | |
| "rewards/format_reward": 0.8995536118745804, | |
| "rewards/tag_count_reward": 0.9235491529107094, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.7723236083984, | |
| "epoch": 0.704, | |
| "grad_norm": 0.23496830463409424, | |
| "kl": 0.2869873046875, | |
| "learning_rate": 4.8927451277396365e-06, | |
| "loss": 0.0033, | |
| "reward": 2.0965402722358704, | |
| "reward_std": 0.5459967963397503, | |
| "rewards/accuracy_reward": 0.2834821529686451, | |
| "rewards/format_reward": 0.8950893208384514, | |
| "rewards/tag_count_reward": 0.917968787252903, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.7066666666666667, | |
| "grad_norm": 0.35722586512565613, | |
| "kl": 0.2728271484375, | |
| "learning_rate": 4.812820879820034e-06, | |
| "loss": 0.0109, | |
| "reward": 2.1143974512815475, | |
| "reward_std": 0.5960128493607044, | |
| "rewards/accuracy_reward": 0.3080357275903225, | |
| "rewards/format_reward": 0.8883929029107094, | |
| "rewards/tag_count_reward": 0.9179687947034836, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.5870590209961, | |
| "epoch": 0.7093333333333334, | |
| "grad_norm": 0.27110370993614197, | |
| "kl": 0.2879638671875, | |
| "learning_rate": 4.733347415433946e-06, | |
| "loss": 0.0107, | |
| "reward": 2.1824778020381927, | |
| "reward_std": 0.5552078559994698, | |
| "rewards/accuracy_reward": 0.330357164144516, | |
| "rewards/format_reward": 0.910714328289032, | |
| "rewards/tag_count_reward": 0.9414063021540642, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.9531478881836, | |
| "epoch": 0.712, | |
| "grad_norm": 0.2856296896934509, | |
| "kl": 0.3702392578125, | |
| "learning_rate": 4.654331641096118e-06, | |
| "loss": -0.0014, | |
| "reward": 2.164062574505806, | |
| "reward_std": 0.5980110131204128, | |
| "rewards/accuracy_reward": 0.35267858393490314, | |
| "rewards/format_reward": 0.8816964700818062, | |
| "rewards/tag_count_reward": 0.9296875447034836, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.7146666666666667, | |
| "grad_norm": 0.503265917301178, | |
| "kl": 0.4259033203125, | |
| "learning_rate": 4.575780423546476e-06, | |
| "loss": 0.017, | |
| "reward": 2.089843824505806, | |
| "reward_std": 0.6514175869524479, | |
| "rewards/accuracy_reward": 0.31473215227015316, | |
| "rewards/format_reward": 0.863839328289032, | |
| "rewards/tag_count_reward": 0.9112723618745804, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.5401840209961, | |
| "epoch": 0.7173333333333334, | |
| "grad_norm": 0.5456968545913696, | |
| "kl": 0.456298828125, | |
| "learning_rate": 4.497700589153379e-06, | |
| "loss": 0.0121, | |
| "reward": 2.1322545558214188, | |
| "reward_std": 0.5633701980113983, | |
| "rewards/accuracy_reward": 0.31696429778821766, | |
| "rewards/format_reward": 0.8772321790456772, | |
| "rewards/tag_count_reward": 0.9380580857396126, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.4308166503906, | |
| "epoch": 0.72, | |
| "grad_norm": 0.5453124642372131, | |
| "kl": 0.689453125, | |
| "learning_rate": 4.420098923320378e-06, | |
| "loss": 0.0179, | |
| "reward": 2.0892858058214188, | |
| "reward_std": 0.6481143087148666, | |
| "rewards/accuracy_reward": 0.29910715855658054, | |
| "rewards/format_reward": 0.8727678954601288, | |
| "rewards/tag_count_reward": 0.9174107536673546, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.0803604125977, | |
| "epoch": 0.7226666666666667, | |
| "grad_norm": 1.6248281002044678, | |
| "kl": 0.951171875, | |
| "learning_rate": 4.342982169896555e-06, | |
| "loss": 0.0359, | |
| "reward": 2.1824777722358704, | |
| "reward_std": 0.6868480890989304, | |
| "rewards/accuracy_reward": 0.3906250186264515, | |
| "rewards/format_reward": 0.8794643208384514, | |
| "rewards/tag_count_reward": 0.9123884364962578, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.8303909301758, | |
| "epoch": 0.7253333333333334, | |
| "grad_norm": 2.223525285720825, | |
| "kl": 0.97705078125, | |
| "learning_rate": 4.266357030590449e-06, | |
| "loss": 0.0254, | |
| "reward": 1.9916295409202576, | |
| "reward_std": 0.6709228046238422, | |
| "rewards/accuracy_reward": 0.23883929522708058, | |
| "rewards/format_reward": 0.8459821864962578, | |
| "rewards/tag_count_reward": 0.9068080857396126, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.6607284545898, | |
| "epoch": 0.728, | |
| "grad_norm": 2.5242061614990234, | |
| "kl": 0.951171875, | |
| "learning_rate": 4.1902301643876555e-06, | |
| "loss": 0.027, | |
| "reward": 2.1835938841104507, | |
| "reward_std": 0.6639501675963402, | |
| "rewards/accuracy_reward": 0.3750000186264515, | |
| "rewards/format_reward": 0.8772321864962578, | |
| "rewards/tag_count_reward": 0.9313616454601288, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.4754486083984, | |
| "epoch": 0.7306666666666667, | |
| "grad_norm": 0.40351805090904236, | |
| "kl": 0.8681640625, | |
| "learning_rate": 4.114608186972143e-06, | |
| "loss": 0.034, | |
| "reward": 2.135044753551483, | |
| "reward_std": 0.6348404288291931, | |
| "rewards/accuracy_reward": 0.33705358672887087, | |
| "rewards/format_reward": 0.8705357685685158, | |
| "rewards/tag_count_reward": 0.9274553954601288, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.0268020629883, | |
| "epoch": 0.7333333333333333, | |
| "grad_norm": 2.133788585662842, | |
| "kl": 1.110595703125, | |
| "learning_rate": 4.0394976701513235e-06, | |
| "loss": 0.029, | |
| "reward": 2.1356027871370316, | |
| "reward_std": 0.7512710765004158, | |
| "rewards/accuracy_reward": 0.3995535857975483, | |
| "rewards/format_reward": 0.8303571939468384, | |
| "rewards/tag_count_reward": 0.9056920185685158, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.6607208251953, | |
| "epoch": 0.736, | |
| "grad_norm": 0.8914547562599182, | |
| "kl": 0.768310546875, | |
| "learning_rate": 3.96490514128494e-06, | |
| "loss": 0.0301, | |
| "reward": 2.1155135184526443, | |
| "reward_std": 0.7052669823169708, | |
| "rewards/accuracy_reward": 0.34598215855658054, | |
| "rewards/format_reward": 0.848214328289032, | |
| "rewards/tag_count_reward": 0.9213170036673546, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.9151840209961, | |
| "epoch": 0.7386666666666667, | |
| "grad_norm": 0.5397745966911316, | |
| "kl": 0.6962890625, | |
| "learning_rate": 3.890837082717822e-06, | |
| "loss": 0.0277, | |
| "reward": 2.074218839406967, | |
| "reward_std": 0.6800587102770805, | |
| "rewards/accuracy_reward": 0.3125000186264515, | |
| "rewards/format_reward": 0.8437500521540642, | |
| "rewards/tag_count_reward": 0.917968787252903, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.1227951049805, | |
| "epoch": 0.7413333333333333, | |
| "grad_norm": 0.45244407653808594, | |
| "kl": 0.572509765625, | |
| "learning_rate": 3.817299931216537e-06, | |
| "loss": 0.0115, | |
| "reward": 2.083705484867096, | |
| "reward_std": 0.6785896308720112, | |
| "rewards/accuracy_reward": 0.29241072945296764, | |
| "rewards/format_reward": 0.8683036267757416, | |
| "rewards/tag_count_reward": 0.9229910969734192, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.2433242797852, | |
| "epoch": 0.744, | |
| "grad_norm": 0.6339854598045349, | |
| "kl": 0.649658203125, | |
| "learning_rate": 3.74430007741003e-06, | |
| "loss": 0.0105, | |
| "reward": 2.0390626043081284, | |
| "reward_std": 0.6866142302751541, | |
| "rewards/accuracy_reward": 0.27455357764847577, | |
| "rewards/format_reward": 0.8504464700818062, | |
| "rewards/tag_count_reward": 0.9140625521540642, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 0.6728559136390686, | |
| "kl": 0.6767578125, | |
| "learning_rate": 3.671843865234238e-06, | |
| "loss": 0.0271, | |
| "reward": 2.2684153020381927, | |
| "reward_std": 0.7245003581047058, | |
| "rewards/accuracy_reward": 0.4933035895228386, | |
| "rewards/format_reward": 0.859375037252903, | |
| "rewards/tag_count_reward": 0.9157366454601288, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.7723236083984, | |
| "epoch": 0.7493333333333333, | |
| "grad_norm": 0.7666006088256836, | |
| "kl": 0.7099609375, | |
| "learning_rate": 3.599937591380791e-06, | |
| "loss": 0.0277, | |
| "reward": 2.04241082072258, | |
| "reward_std": 0.6483294367790222, | |
| "rewards/accuracy_reward": 0.27455358672887087, | |
| "rewards/format_reward": 0.8482143357396126, | |
| "rewards/tag_count_reward": 0.9196428880095482, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8035736083984, | |
| "epoch": 0.752, | |
| "grad_norm": 0.42735132575035095, | |
| "kl": 0.5888671875, | |
| "learning_rate": 3.5285875047498075e-06, | |
| "loss": 0.0193, | |
| "reward": 2.113839328289032, | |
| "reward_std": 0.6161059066653252, | |
| "rewards/accuracy_reward": 0.31696429708972573, | |
| "rewards/format_reward": 0.859375037252903, | |
| "rewards/tag_count_reward": 0.9375000521540642, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.6383972167969, | |
| "epoch": 0.7546666666666667, | |
| "grad_norm": 1.4887264966964722, | |
| "kl": 0.679443359375, | |
| "learning_rate": 3.4577998059068354e-06, | |
| "loss": 0.0267, | |
| "reward": 1.9263393580913544, | |
| "reward_std": 0.7034792378544807, | |
| "rewards/accuracy_reward": 0.23883929778821766, | |
| "rewards/format_reward": 0.785714328289032, | |
| "rewards/tag_count_reward": 0.9017857536673546, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8303604125977, | |
| "epoch": 0.7573333333333333, | |
| "grad_norm": 0.34186023473739624, | |
| "kl": 0.44091796875, | |
| "learning_rate": 3.3875806465440152e-06, | |
| "loss": 0.0153, | |
| "reward": 2.2338171005249023, | |
| "reward_std": 0.6429506540298462, | |
| "rewards/accuracy_reward": 0.4151785857975483, | |
| "rewards/format_reward": 0.8772321939468384, | |
| "rewards/tag_count_reward": 0.9414062947034836, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.6919708251953, | |
| "epoch": 0.76, | |
| "grad_norm": 0.318380743265152, | |
| "kl": 0.3946533203125, | |
| "learning_rate": 3.3179361289454694e-06, | |
| "loss": 0.0086, | |
| "reward": 2.2293528020381927, | |
| "reward_std": 0.6601455509662628, | |
| "rewards/accuracy_reward": 0.42410715483129025, | |
| "rewards/format_reward": 0.8593750521540642, | |
| "rewards/tag_count_reward": 0.9458705857396126, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.7626666666666667, | |
| "grad_norm": 0.3260420560836792, | |
| "kl": 0.3426513671875, | |
| "learning_rate": 3.2488723054569905e-06, | |
| "loss": 0.0138, | |
| "reward": 2.1450893729925156, | |
| "reward_std": 0.5205648727715015, | |
| "rewards/accuracy_reward": 0.2991071534343064, | |
| "rewards/format_reward": 0.8928571864962578, | |
| "rewards/tag_count_reward": 0.953125037252903, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.7653333333333333, | |
| "grad_norm": 0.2629320025444031, | |
| "kl": 0.2696533203125, | |
| "learning_rate": 3.1803951779600774e-06, | |
| "loss": 0.0109, | |
| "reward": 2.2338170260190964, | |
| "reward_std": 0.5047199167311192, | |
| "rewards/accuracy_reward": 0.3705357350409031, | |
| "rewards/format_reward": 0.9107143133878708, | |
| "rewards/tag_count_reward": 0.9525670111179352, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.768, | |
| "grad_norm": 0.3063962161540985, | |
| "kl": 0.22705078125, | |
| "learning_rate": 3.112510697350348e-06, | |
| "loss": 0.0091, | |
| "reward": 2.2137278020381927, | |
| "reward_std": 0.5222755149006844, | |
| "rewards/accuracy_reward": 0.3281250149011612, | |
| "rewards/format_reward": 0.9196428954601288, | |
| "rewards/tag_count_reward": 0.9659598618745804, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.7706666666666667, | |
| "grad_norm": 0.31240493059158325, | |
| "kl": 0.2208251953125, | |
| "learning_rate": 3.04522476302039e-06, | |
| "loss": 0.009, | |
| "reward": 2.4347099363803864, | |
| "reward_std": 0.49093519896268845, | |
| "rewards/accuracy_reward": 0.5223214514553547, | |
| "rewards/format_reward": 0.9352678954601288, | |
| "rewards/tag_count_reward": 0.9771205708384514, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.7733333333333333, | |
| "grad_norm": 0.2734697461128235, | |
| "kl": 0.21337890625, | |
| "learning_rate": 2.978543222347076e-06, | |
| "loss": 0.0088, | |
| "reward": 2.364955484867096, | |
| "reward_std": 0.38068827986717224, | |
| "rewards/accuracy_reward": 0.4241071604192257, | |
| "rewards/format_reward": 0.9575893208384514, | |
| "rewards/tag_count_reward": 0.9832589626312256, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.6495666503906, | |
| "epoch": 0.776, | |
| "grad_norm": 0.2320028394460678, | |
| "kl": 0.2481689453125, | |
| "learning_rate": 2.912471870183411e-06, | |
| "loss": -0.0009, | |
| "reward": 2.199776917695999, | |
| "reward_std": 0.5590856000781059, | |
| "rewards/accuracy_reward": 0.34598216274753213, | |
| "rewards/format_reward": 0.8973214700818062, | |
| "rewards/tag_count_reward": 0.9564732611179352, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.7767944335938, | |
| "epoch": 0.7786666666666666, | |
| "grad_norm": 0.29300227761268616, | |
| "kl": 0.1875, | |
| "learning_rate": 2.847016448354948e-06, | |
| "loss": -0.0004, | |
| "reward": 2.2890625298023224, | |
| "reward_std": 0.4556136131286621, | |
| "rewards/accuracy_reward": 0.3794643022119999, | |
| "rewards/format_reward": 0.9375000298023224, | |
| "rewards/tag_count_reward": 0.9720982536673546, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.7745590209961, | |
| "epoch": 0.7813333333333333, | |
| "grad_norm": 0.29833829402923584, | |
| "kl": 0.212158203125, | |
| "learning_rate": 2.782182645160789e-06, | |
| "loss": 0.0019, | |
| "reward": 2.275669738650322, | |
| "reward_std": 0.42217013984918594, | |
| "rewards/accuracy_reward": 0.36160715855658054, | |
| "rewards/format_reward": 0.944196455180645, | |
| "rewards/tag_count_reward": 0.9698661267757416, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.784, | |
| "grad_norm": 0.35956284403800964, | |
| "kl": 0.2496337890625, | |
| "learning_rate": 2.71797609487926e-06, | |
| "loss": 0.0101, | |
| "reward": 2.2433036863803864, | |
| "reward_std": 0.5022850632667542, | |
| "rewards/accuracy_reward": 0.3504464514553547, | |
| "rewards/format_reward": 0.9263393208384514, | |
| "rewards/tag_count_reward": 0.9665179029107094, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.7455444335938, | |
| "epoch": 0.7866666666666666, | |
| "grad_norm": 0.23198282718658447, | |
| "kl": 0.3045654296875, | |
| "learning_rate": 2.6544023772782736e-06, | |
| "loss": 0.0012, | |
| "reward": 2.301339417695999, | |
| "reward_std": 0.4813353540375829, | |
| "rewards/accuracy_reward": 0.4129464477300644, | |
| "rewards/format_reward": 0.926339328289032, | |
| "rewards/tag_count_reward": 0.9620536118745804, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.7142944335938, | |
| "epoch": 0.7893333333333333, | |
| "grad_norm": 0.27904388308525085, | |
| "kl": 0.3428955078125, | |
| "learning_rate": 2.591467017130426e-06, | |
| "loss": 0.0038, | |
| "reward": 2.1456473916769028, | |
| "reward_std": 0.4807808957993984, | |
| "rewards/accuracy_reward": 0.27232144260779023, | |
| "rewards/format_reward": 0.9196428954601288, | |
| "rewards/tag_count_reward": 0.9536830857396126, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.792, | |
| "grad_norm": 0.272941917181015, | |
| "kl": 0.2530517578125, | |
| "learning_rate": 2.5291754837328787e-06, | |
| "loss": 0.0101, | |
| "reward": 2.2561384737491608, | |
| "reward_std": 0.5383404716849327, | |
| "rewards/accuracy_reward": 0.36383930407464504, | |
| "rewards/format_reward": 0.9308036044239998, | |
| "rewards/tag_count_reward": 0.9614955708384514, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.1339416503906, | |
| "epoch": 0.7946666666666666, | |
| "grad_norm": 0.3166991174221039, | |
| "kl": 0.23974609375, | |
| "learning_rate": 2.4675331904320533e-06, | |
| "loss": 0.0086, | |
| "reward": 2.244977742433548, | |
| "reward_std": 0.5296464376151562, | |
| "rewards/accuracy_reward": 0.37276787869632244, | |
| "rewards/format_reward": 0.9196428954601288, | |
| "rewards/tag_count_reward": 0.9525670185685158, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.9933090209961, | |
| "epoch": 0.7973333333333333, | |
| "grad_norm": 0.24607542157173157, | |
| "kl": 0.208984375, | |
| "learning_rate": 2.4065454941531963e-06, | |
| "loss": 0.0012, | |
| "reward": 2.2957590520381927, | |
| "reward_std": 0.4782888777554035, | |
| "rewards/accuracy_reward": 0.39732144959270954, | |
| "rewards/format_reward": 0.9352678880095482, | |
| "rewards/tag_count_reward": 0.9631696864962578, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.8437652587891, | |
| "epoch": 0.8, | |
| "grad_norm": 0.2551475167274475, | |
| "kl": 0.306396484375, | |
| "learning_rate": 2.346217694934847e-06, | |
| "loss": 0.0037, | |
| "reward": 2.2566965222358704, | |
| "reward_std": 0.5625656880438328, | |
| "rewards/accuracy_reward": 0.388392873108387, | |
| "rewards/format_reward": 0.9174107536673546, | |
| "rewards/tag_count_reward": 0.9508928880095482, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1014.8750305175781, | |
| "epoch": 0.8026666666666666, | |
| "grad_norm": 0.21855834126472473, | |
| "kl": 0.2073974609375, | |
| "learning_rate": 2.286555035468233e-06, | |
| "loss": 0.0062, | |
| "reward": 2.209821566939354, | |
| "reward_std": 0.39225295558571815, | |
| "rewards/accuracy_reward": 0.2901785848662257, | |
| "rewards/format_reward": 0.9508928954601288, | |
| "rewards/tag_count_reward": 0.9687500447034836, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.4687652587891, | |
| "epoch": 0.8053333333333333, | |
| "grad_norm": 0.2842949628829956, | |
| "kl": 0.219970703125, | |
| "learning_rate": 2.22756270064168e-06, | |
| "loss": 0.0085, | |
| "reward": 2.2840402722358704, | |
| "reward_std": 0.5333524160087109, | |
| "rewards/accuracy_reward": 0.404017873108387, | |
| "rewards/format_reward": 0.9241071939468384, | |
| "rewards/tag_count_reward": 0.9559152200818062, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.2812652587891, | |
| "epoch": 0.808, | |
| "grad_norm": 1.0460567474365234, | |
| "kl": 0.333740234375, | |
| "learning_rate": 2.16924581709002e-06, | |
| "loss": 0.0141, | |
| "reward": 2.2137277722358704, | |
| "reward_std": 0.5989870205521584, | |
| "rewards/accuracy_reward": 0.36383930034935474, | |
| "rewards/format_reward": 0.9129464626312256, | |
| "rewards/tag_count_reward": 0.936941996216774, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1014.8884201049805, | |
| "epoch": 0.8106666666666666, | |
| "grad_norm": 0.24618743360042572, | |
| "kl": 0.252197265625, | |
| "learning_rate": 2.1116094527490594e-06, | |
| "loss": -0.001, | |
| "reward": 2.2466518729925156, | |
| "reward_std": 0.5951163619756699, | |
| "rewards/accuracy_reward": 0.3995535857975483, | |
| "rewards/format_reward": 0.906250037252903, | |
| "rewards/tag_count_reward": 0.9408482611179352, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.6183166503906, | |
| "epoch": 0.8133333333333334, | |
| "grad_norm": 0.24632352590560913, | |
| "kl": 0.28173828125, | |
| "learning_rate": 2.0546586164151827e-06, | |
| "loss": 0.0065, | |
| "reward": 2.1841518878936768, | |
| "reward_std": 0.5703849159181118, | |
| "rewards/accuracy_reward": 0.3482143050059676, | |
| "rewards/format_reward": 0.8995536044239998, | |
| "rewards/tag_count_reward": 0.9363839700818062, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.8459930419922, | |
| "epoch": 0.816, | |
| "grad_norm": 0.23753482103347778, | |
| "kl": 0.2523193359375, | |
| "learning_rate": 1.9983982573100413e-06, | |
| "loss": 0.0035, | |
| "reward": 2.3169643580913544, | |
| "reward_std": 0.6238032579421997, | |
| "rewards/accuracy_reward": 0.4441964477300644, | |
| "rewards/format_reward": 0.9241071864962578, | |
| "rewards/tag_count_reward": 0.9486607536673546, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1014.0536041259766, | |
| "epoch": 0.8186666666666667, | |
| "grad_norm": 0.20797064900398254, | |
| "kl": 0.2744140625, | |
| "learning_rate": 1.94283326465047e-06, | |
| "loss": -0.0052, | |
| "reward": 2.1305804401636124, | |
| "reward_std": 0.5734776593744755, | |
| "rewards/accuracy_reward": 0.2946428656578064, | |
| "rewards/format_reward": 0.8973214775323868, | |
| "rewards/tag_count_reward": 0.9386161118745804, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.1183166503906, | |
| "epoch": 0.8213333333333334, | |
| "grad_norm": 0.30176863074302673, | |
| "kl": 0.24755859375, | |
| "learning_rate": 1.887968467223591e-06, | |
| "loss": 0.0048, | |
| "reward": 2.251674234867096, | |
| "reward_std": 0.6692539118230343, | |
| "rewards/accuracy_reward": 0.4174107350409031, | |
| "rewards/format_reward": 0.899553619325161, | |
| "rewards/tag_count_reward": 0.9347098618745804, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1012.366096496582, | |
| "epoch": 0.824, | |
| "grad_norm": 0.324207603931427, | |
| "kl": 0.220703125, | |
| "learning_rate": 1.8338086329671734e-06, | |
| "loss": 0.0124, | |
| "reward": 2.1830358058214188, | |
| "reward_std": 0.5649261251091957, | |
| "rewards/accuracy_reward": 0.32812501350417733, | |
| "rewards/format_reward": 0.9107143208384514, | |
| "rewards/tag_count_reward": 0.9441964700818062, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.5535888671875, | |
| "epoch": 0.8266666666666667, | |
| "grad_norm": 0.2686956524848938, | |
| "kl": 0.2506103515625, | |
| "learning_rate": 1.7803584685552877e-06, | |
| "loss": 0.001, | |
| "reward": 2.042968839406967, | |
| "reward_std": 0.604579221457243, | |
| "rewards/accuracy_reward": 0.22991072572767735, | |
| "rewards/format_reward": 0.8839286044239998, | |
| "rewards/tag_count_reward": 0.9291295185685158, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.4419784545898, | |
| "epoch": 0.8293333333333334, | |
| "grad_norm": 0.3965815007686615, | |
| "kl": 0.19384765625, | |
| "learning_rate": 1.7276226189892763e-06, | |
| "loss": 0.0043, | |
| "reward": 2.279017925262451, | |
| "reward_std": 0.5148574188351631, | |
| "rewards/accuracy_reward": 0.361607164144516, | |
| "rewards/format_reward": 0.9464286118745804, | |
| "rewards/tag_count_reward": 0.9709821864962578, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.2165451049805, | |
| "epoch": 0.832, | |
| "grad_norm": 0.3476318418979645, | |
| "kl": 0.244140625, | |
| "learning_rate": 1.6756056671940902e-06, | |
| "loss": 0.0017, | |
| "reward": 2.2522322684526443, | |
| "reward_std": 0.660854198038578, | |
| "rewards/accuracy_reward": 0.4084821669384837, | |
| "rewards/format_reward": 0.9017857536673546, | |
| "rewards/tag_count_reward": 0.9419643357396126, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.2276916503906, | |
| "epoch": 0.8346666666666667, | |
| "grad_norm": 0.21767204999923706, | |
| "kl": 0.2122802734375, | |
| "learning_rate": 1.624312133620013e-06, | |
| "loss": 0.0087, | |
| "reward": 2.1450893729925156, | |
| "reward_std": 0.5749500542879105, | |
| "rewards/accuracy_reward": 0.29910715855658054, | |
| "rewards/format_reward": 0.9017857536673546, | |
| "rewards/tag_count_reward": 0.9441964700818062, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.4263458251953, | |
| "epoch": 0.8373333333333334, | |
| "grad_norm": 0.5620092153549194, | |
| "kl": 0.3297119140625, | |
| "learning_rate": 1.5737464758498243e-06, | |
| "loss": 0.0137, | |
| "reward": 2.0965402722358704, | |
| "reward_std": 0.6482469737529755, | |
| "rewards/accuracy_reward": 0.29910715483129025, | |
| "rewards/format_reward": 0.8772321790456772, | |
| "rewards/tag_count_reward": 0.9202009439468384, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.7901916503906, | |
| "epoch": 0.84, | |
| "grad_norm": 0.2823946475982666, | |
| "kl": 0.24169921875, | |
| "learning_rate": 1.523913088211415e-06, | |
| "loss": 0.0072, | |
| "reward": 2.210937589406967, | |
| "reward_std": 0.5960576869547367, | |
| "rewards/accuracy_reward": 0.3660714440047741, | |
| "rewards/format_reward": 0.899553619325161, | |
| "rewards/tag_count_reward": 0.9453125447034836, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.8426666666666667, | |
| "grad_norm": 0.30682700872421265, | |
| "kl": 0.20391845703125, | |
| "learning_rate": 1.474816301395906e-06, | |
| "loss": 0.0082, | |
| "reward": 2.2784599363803864, | |
| "reward_std": 0.5610373616218567, | |
| "rewards/accuracy_reward": 0.3906250260770321, | |
| "rewards/format_reward": 0.9263393208384514, | |
| "rewards/tag_count_reward": 0.9614955708384514, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1006.584846496582, | |
| "epoch": 0.8453333333333334, | |
| "grad_norm": 0.23224209249019623, | |
| "kl": 0.2933349609375, | |
| "learning_rate": 1.4264603820813006e-06, | |
| "loss": 0.0081, | |
| "reward": 1.9715402871370316, | |
| "reward_std": 0.5434744767844677, | |
| "rewards/accuracy_reward": 0.1629464323632419, | |
| "rewards/format_reward": 0.8750000447034836, | |
| "rewards/tag_count_reward": 0.9335938021540642, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1012.3281555175781, | |
| "epoch": 0.848, | |
| "grad_norm": 0.37959590554237366, | |
| "kl": 0.422119140625, | |
| "learning_rate": 1.3788495325616912e-06, | |
| "loss": -0.0014, | |
| "reward": 2.26506707072258, | |
| "reward_std": 0.6333228126168251, | |
| "rewards/accuracy_reward": 0.42633930779993534, | |
| "rewards/format_reward": 0.899553619325161, | |
| "rewards/tag_count_reward": 0.9391741529107094, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.6584930419922, | |
| "epoch": 0.8506666666666667, | |
| "grad_norm": 0.2974534034729004, | |
| "kl": 0.27685546875, | |
| "learning_rate": 1.3319878903820682e-06, | |
| "loss": 0.0003, | |
| "reward": 2.3013393878936768, | |
| "reward_std": 0.5945294424891472, | |
| "rewards/accuracy_reward": 0.4308035895228386, | |
| "rewards/format_reward": 0.9107143133878708, | |
| "rewards/tag_count_reward": 0.9598214700818062, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.2968902587891, | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 0.32377177476882935, | |
| "kl": 0.2545166015625, | |
| "learning_rate": 1.2858795279787517e-06, | |
| "loss": 0.0079, | |
| "reward": 2.1796876043081284, | |
| "reward_std": 0.5994452647864819, | |
| "rewards/accuracy_reward": 0.35044644214212894, | |
| "rewards/format_reward": 0.8906250447034836, | |
| "rewards/tag_count_reward": 0.938616119325161, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.444206237793, | |
| "epoch": 0.856, | |
| "grad_norm": 0.2041098177433014, | |
| "kl": 0.2281494140625, | |
| "learning_rate": 1.2405284523254823e-06, | |
| "loss": 0.0085, | |
| "reward": 2.3108259737491608, | |
| "reward_std": 0.5786739625036716, | |
| "rewards/accuracy_reward": 0.4486607387661934, | |
| "rewards/format_reward": 0.9151786118745804, | |
| "rewards/tag_count_reward": 0.9469866529107094, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.6071548461914, | |
| "epoch": 0.8586666666666667, | |
| "grad_norm": 0.31456390023231506, | |
| "kl": 0.22998046875, | |
| "learning_rate": 1.195938604585205e-06, | |
| "loss": 0.0069, | |
| "reward": 2.13225457072258, | |
| "reward_std": 0.562423225492239, | |
| "rewards/accuracy_reward": 0.2767857313156128, | |
| "rewards/format_reward": 0.901785746216774, | |
| "rewards/tag_count_reward": 0.9536830931901932, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.0089416503906, | |
| "epoch": 0.8613333333333333, | |
| "grad_norm": 0.18485142290592194, | |
| "kl": 0.1756591796875, | |
| "learning_rate": 1.152113859767565e-06, | |
| "loss": 0.0085, | |
| "reward": 2.2059152722358704, | |
| "reward_std": 0.4480607798323035, | |
| "rewards/accuracy_reward": 0.305803582072258, | |
| "rewards/format_reward": 0.9375000298023224, | |
| "rewards/tag_count_reward": 0.9626116454601288, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.8169708251953, | |
| "epoch": 0.864, | |
| "grad_norm": 0.23961246013641357, | |
| "kl": 0.21728515625, | |
| "learning_rate": 1.109058026392158e-06, | |
| "loss": 0.0044, | |
| "reward": 2.2148438543081284, | |
| "reward_std": 0.5881692916154861, | |
| "rewards/accuracy_reward": 0.3705357350409031, | |
| "rewards/format_reward": 0.9040179029107094, | |
| "rewards/tag_count_reward": 0.9402902349829674, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.1741104125977, | |
| "epoch": 0.8666666666666667, | |
| "grad_norm": 0.2150668352842331, | |
| "kl": 0.197265625, | |
| "learning_rate": 1.0667748461575544e-06, | |
| "loss": 0.008, | |
| "reward": 2.2271206229925156, | |
| "reward_std": 0.5270771663635969, | |
| "rewards/accuracy_reward": 0.3660714481957257, | |
| "rewards/format_reward": 0.9129464700818062, | |
| "rewards/tag_count_reward": 0.9481027200818062, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.9531326293945, | |
| "epoch": 0.8693333333333333, | |
| "grad_norm": 0.24379926919937134, | |
| "kl": 0.2073974609375, | |
| "learning_rate": 1.0252679936161392e-06, | |
| "loss": 0.0088, | |
| "reward": 2.2271206378936768, | |
| "reward_std": 0.5633045695722103, | |
| "rewards/accuracy_reward": 0.35937502048909664, | |
| "rewards/format_reward": 0.9174107387661934, | |
| "rewards/tag_count_reward": 0.9503348544239998, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.872, | |
| "grad_norm": 0.4589475691318512, | |
| "kl": 0.230224609375, | |
| "learning_rate": 9.845410758547724e-07, | |
| "loss": 0.0093, | |
| "reward": 2.1964286863803864, | |
| "reward_std": 0.5628439746797085, | |
| "rewards/accuracy_reward": 0.3370535795111209, | |
| "rewards/format_reward": 0.9129464700818062, | |
| "rewards/tag_count_reward": 0.946428619325161, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1012.912971496582, | |
| "epoch": 0.8746666666666667, | |
| "grad_norm": 0.28130125999450684, | |
| "kl": 0.342041015625, | |
| "learning_rate": 9.445976321813277e-07, | |
| "loss": -0.009, | |
| "reward": 2.3710938543081284, | |
| "reward_std": 0.5514123477041721, | |
| "rewards/accuracy_reward": 0.47544645005837083, | |
| "rewards/format_reward": 0.933035746216774, | |
| "rewards/tag_count_reward": 0.9626116454601288, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.9263534545898, | |
| "epoch": 0.8773333333333333, | |
| "grad_norm": 0.19118691980838776, | |
| "kl": 0.335693359375, | |
| "learning_rate": 9.054411338171099e-07, | |
| "loss": -0.0008, | |
| "reward": 2.2952009737491608, | |
| "reward_std": 0.46351186372339725, | |
| "rewards/accuracy_reward": 0.3883928668219596, | |
| "rewards/format_reward": 0.9419643208384514, | |
| "rewards/tag_count_reward": 0.964843787252903, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.4732208251953, | |
| "epoch": 0.88, | |
| "grad_norm": 0.1962963342666626, | |
| "kl": 0.181884765625, | |
| "learning_rate": 8.670749835951964e-07, | |
| "loss": 0.0054, | |
| "reward": 2.2952009737491608, | |
| "reward_std": 0.5603885129094124, | |
| "rewards/accuracy_reward": 0.4196428805589676, | |
| "rewards/format_reward": 0.9218750298023224, | |
| "rewards/tag_count_reward": 0.953683078289032, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.9799194335938, | |
| "epoch": 0.8826666666666667, | |
| "grad_norm": 0.6351999640464783, | |
| "kl": 0.2130126953125, | |
| "learning_rate": 8.29502515664723e-07, | |
| "loss": 0.0044, | |
| "reward": 2.152901843190193, | |
| "reward_std": 0.5181849822402, | |
| "rewards/accuracy_reward": 0.2879464402794838, | |
| "rewards/format_reward": 0.9174107536673546, | |
| "rewards/tag_count_reward": 0.9475446939468384, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 0.8853333333333333, | |
| "grad_norm": 0.28189706802368164, | |
| "kl": 0.2200927734375, | |
| "learning_rate": 7.927269952011285e-07, | |
| "loss": 0.0089, | |
| "reward": 2.146205425262451, | |
| "reward_std": 0.5443706884980202, | |
| "rewards/accuracy_reward": 0.31026787124574184, | |
| "rewards/format_reward": 0.8973214700818062, | |
| "rewards/tag_count_reward": 0.938616119325161, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.8549270629883, | |
| "epoch": 0.888, | |
| "grad_norm": 0.39113467931747437, | |
| "kl": 0.2760009765625, | |
| "learning_rate": 7.567516181223966e-07, | |
| "loss": 0.0007, | |
| "reward": 2.158482238650322, | |
| "reward_std": 0.5754810310900211, | |
| "rewards/accuracy_reward": 0.2968750186264515, | |
| "rewards/format_reward": 0.9174107536673546, | |
| "rewards/tag_count_reward": 0.9441964700818062, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.1585006713867, | |
| "epoch": 0.8906666666666667, | |
| "grad_norm": 0.26294657588005066, | |
| "kl": 0.2698974609375, | |
| "learning_rate": 7.215795108113343e-07, | |
| "loss": 0.0052, | |
| "reward": 2.2025670409202576, | |
| "reward_std": 0.6121005043387413, | |
| "rewards/accuracy_reward": 0.37053573317825794, | |
| "rewards/format_reward": 0.895089328289032, | |
| "rewards/tag_count_reward": 0.9369420185685158, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.5334930419922, | |
| "epoch": 0.8933333333333333, | |
| "grad_norm": 0.28085288405418396, | |
| "kl": 0.2161865234375, | |
| "learning_rate": 6.872137298438653e-07, | |
| "loss": 0.0096, | |
| "reward": 2.2405134588479996, | |
| "reward_std": 0.5706223845481873, | |
| "rewards/accuracy_reward": 0.3616071653086692, | |
| "rewards/format_reward": 0.9263393208384514, | |
| "rewards/tag_count_reward": 0.9525670111179352, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.0290298461914, | |
| "epoch": 0.896, | |
| "grad_norm": 0.35503754019737244, | |
| "kl": 0.277099609375, | |
| "learning_rate": 6.536572617234082e-07, | |
| "loss": 0.0092, | |
| "reward": 2.2360492199659348, | |
| "reward_std": 0.5588010214269161, | |
| "rewards/accuracy_reward": 0.3660714440047741, | |
| "rewards/format_reward": 0.9218750298023224, | |
| "rewards/tag_count_reward": 0.948102705180645, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1016.4776916503906, | |
| "epoch": 0.8986666666666666, | |
| "grad_norm": 0.4076971113681793, | |
| "kl": 0.256103515625, | |
| "learning_rate": 6.209130226213378e-07, | |
| "loss": 0.0079, | |
| "reward": 2.22209832072258, | |
| "reward_std": 0.5934063121676445, | |
| "rewards/accuracy_reward": 0.3973214477300644, | |
| "rewards/format_reward": 0.8950893208384514, | |
| "rewards/tag_count_reward": 0.9296875447034836, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.7678604125977, | |
| "epoch": 0.9013333333333333, | |
| "grad_norm": 0.18329821527004242, | |
| "kl": 0.22509765625, | |
| "learning_rate": 5.889838581235641e-07, | |
| "loss": 0.0102, | |
| "reward": 2.200334906578064, | |
| "reward_std": 0.5884822010993958, | |
| "rewards/accuracy_reward": 0.36383930081501603, | |
| "rewards/format_reward": 0.9017857536673546, | |
| "rewards/tag_count_reward": 0.9347098618745804, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.4642944335938, | |
| "epoch": 0.904, | |
| "grad_norm": 0.4405348598957062, | |
| "kl": 0.29541015625, | |
| "learning_rate": 5.578725429832344e-07, | |
| "loss": 0.0112, | |
| "reward": 2.169084906578064, | |
| "reward_std": 0.6066409535706043, | |
| "rewards/accuracy_reward": 0.33258930034935474, | |
| "rewards/format_reward": 0.8973214775323868, | |
| "rewards/tag_count_reward": 0.9391741529107094, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1009.591552734375, | |
| "epoch": 0.9066666666666666, | |
| "grad_norm": 0.3651023507118225, | |
| "kl": 0.3321533203125, | |
| "learning_rate": 5.275817808796013e-07, | |
| "loss": -0.0047, | |
| "reward": 2.168526902794838, | |
| "reward_std": 0.5713673643767834, | |
| "rewards/accuracy_reward": 0.330357164144516, | |
| "rewards/format_reward": 0.8995536118745804, | |
| "rewards/tag_count_reward": 0.9386161118745804, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.3594131469727, | |
| "epoch": 0.9093333333333333, | |
| "grad_norm": 0.8885743021965027, | |
| "kl": 0.2982177734375, | |
| "learning_rate": 4.981142041830645e-07, | |
| "loss": 0.0078, | |
| "reward": 2.1093751192092896, | |
| "reward_std": 0.5496832653880119, | |
| "rewards/accuracy_reward": 0.29241072991862893, | |
| "rewards/format_reward": 0.8906250447034836, | |
| "rewards/tag_count_reward": 0.9263393208384514, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.2276992797852, | |
| "epoch": 0.912, | |
| "grad_norm": 0.1936761438846588, | |
| "kl": 0.231689453125, | |
| "learning_rate": 4.6947237372640954e-07, | |
| "loss": 0.0124, | |
| "reward": 2.168526917695999, | |
| "reward_std": 0.5354543067514896, | |
| "rewards/accuracy_reward": 0.30133930081501603, | |
| "rewards/format_reward": 0.910714328289032, | |
| "rewards/tag_count_reward": 0.9564732536673546, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1013.1629791259766, | |
| "epoch": 0.9146666666666666, | |
| "grad_norm": 0.5735378861427307, | |
| "kl": 0.255615234375, | |
| "learning_rate": 4.416587785822568e-07, | |
| "loss": 0.0028, | |
| "reward": 2.1160715222358704, | |
| "reward_std": 0.5677376128733158, | |
| "rewards/accuracy_reward": 0.27678572572767735, | |
| "rewards/format_reward": 0.8928571864962578, | |
| "rewards/tag_count_reward": 0.946428619325161, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1011.4062652587891, | |
| "epoch": 0.9173333333333333, | |
| "grad_norm": 0.2437291294336319, | |
| "kl": 0.23291015625, | |
| "learning_rate": 4.1467583584676395e-07, | |
| "loss": 0.0097, | |
| "reward": 2.2477679550647736, | |
| "reward_std": 0.5661342553794384, | |
| "rewards/accuracy_reward": 0.37500001303851604, | |
| "rewards/format_reward": 0.9174107536673546, | |
| "rewards/tag_count_reward": 0.9553571790456772, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1023.0401840209961, | |
| "epoch": 0.92, | |
| "grad_norm": 0.30266594886779785, | |
| "kl": 0.258056640625, | |
| "learning_rate": 3.885258904295575e-07, | |
| "loss": 0.0103, | |
| "reward": 2.1132813096046448, | |
| "reward_std": 0.5288172848522663, | |
| "rewards/accuracy_reward": 0.2723214365541935, | |
| "rewards/format_reward": 0.8995536118745804, | |
| "rewards/tag_count_reward": 0.9414062947034836, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.3459930419922, | |
| "epoch": 0.9226666666666666, | |
| "grad_norm": 0.44057974219322205, | |
| "kl": 0.41259765625, | |
| "learning_rate": 3.6321121484996447e-07, | |
| "loss": 0.0007, | |
| "reward": 2.1322545260190964, | |
| "reward_std": 0.6624284163117409, | |
| "rewards/accuracy_reward": 0.3370535895228386, | |
| "rewards/format_reward": 0.8750000447034836, | |
| "rewards/tag_count_reward": 0.9202009290456772, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.5625305175781, | |
| "epoch": 0.9253333333333333, | |
| "grad_norm": 0.5109536051750183, | |
| "kl": 0.3021240234375, | |
| "learning_rate": 3.3873400903951636e-07, | |
| "loss": 0.0071, | |
| "reward": 2.1975447237491608, | |
| "reward_std": 0.5756550095975399, | |
| "rewards/accuracy_reward": 0.36607144586741924, | |
| "rewards/format_reward": 0.8928571790456772, | |
| "rewards/tag_count_reward": 0.938616119325161, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.7433242797852, | |
| "epoch": 0.928, | |
| "grad_norm": 0.2958410382270813, | |
| "kl": 0.290283203125, | |
| "learning_rate": 3.1509640015076946e-07, | |
| "loss": 0.0115, | |
| "reward": 2.1434152871370316, | |
| "reward_std": 0.680501900613308, | |
| "rewards/accuracy_reward": 0.3504464477300644, | |
| "rewards/format_reward": 0.8705357536673546, | |
| "rewards/tag_count_reward": 0.9224330857396126, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.8058166503906, | |
| "epoch": 0.9306666666666666, | |
| "grad_norm": 0.4034452736377716, | |
| "kl": 0.27490234375, | |
| "learning_rate": 2.923004423724474e-07, | |
| "loss": 0.0029, | |
| "reward": 2.22600457072258, | |
| "reward_std": 0.5555046014487743, | |
| "rewards/accuracy_reward": 0.37946430407464504, | |
| "rewards/format_reward": 0.899553619325161, | |
| "rewards/tag_count_reward": 0.9469866529107094, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.709831237793, | |
| "epoch": 0.9333333333333333, | |
| "grad_norm": 0.22715678811073303, | |
| "kl": 0.259033203125, | |
| "learning_rate": 2.703481167509281e-07, | |
| "loss": 0.01, | |
| "reward": 2.2159599363803864, | |
| "reward_std": 0.6509725004434586, | |
| "rewards/accuracy_reward": 0.39732144959270954, | |
| "rewards/format_reward": 0.8816964700818062, | |
| "rewards/tag_count_reward": 0.9369420036673546, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1014.2857284545898, | |
| "epoch": 0.936, | |
| "grad_norm": 0.28359052538871765, | |
| "kl": 0.2364501953125, | |
| "learning_rate": 2.4924133101807636e-07, | |
| "loss": 0.0082, | |
| "reward": 2.3164063692092896, | |
| "reward_std": 0.5717856138944626, | |
| "rewards/accuracy_reward": 0.4397321715950966, | |
| "rewards/format_reward": 0.921875037252903, | |
| "rewards/tag_count_reward": 0.9547991454601288, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1010.491096496582, | |
| "epoch": 0.9386666666666666, | |
| "grad_norm": 0.2850205600261688, | |
| "kl": 0.431640625, | |
| "learning_rate": 2.289819194254661e-07, | |
| "loss": -0.0118, | |
| "reward": 2.1925224363803864, | |
| "reward_std": 0.6137520037591457, | |
| "rewards/accuracy_reward": 0.3459821604192257, | |
| "rewards/format_reward": 0.9017857611179352, | |
| "rewards/tag_count_reward": 0.9447545111179352, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.2455520629883, | |
| "epoch": 0.9413333333333334, | |
| "grad_norm": 0.25859472155570984, | |
| "kl": 0.2259521484375, | |
| "learning_rate": 2.0957164258497031e-07, | |
| "loss": 0.0008, | |
| "reward": 2.210937574505806, | |
| "reward_std": 0.521145723760128, | |
| "rewards/accuracy_reward": 0.33705358766019344, | |
| "rewards/format_reward": 0.9196428880095482, | |
| "rewards/tag_count_reward": 0.954241119325161, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.0000152587891, | |
| "epoch": 0.944, | |
| "grad_norm": 0.24050916731357574, | |
| "kl": 0.2647705078125, | |
| "learning_rate": 1.9101218731575777e-07, | |
| "loss": 0.0093, | |
| "reward": 2.039620652794838, | |
| "reward_std": 0.5573948994278908, | |
| "rewards/accuracy_reward": 0.2187500111758709, | |
| "rewards/format_reward": 0.8861607536673546, | |
| "rewards/tag_count_reward": 0.9347098767757416, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1005.6272659301758, | |
| "epoch": 0.9466666666666667, | |
| "grad_norm": 29.152254104614258, | |
| "kl": 15.73291015625, | |
| "learning_rate": 1.73305166497707e-07, | |
| "loss": 0.0494, | |
| "reward": 2.1562501341104507, | |
| "reward_std": 0.6062168106436729, | |
| "rewards/accuracy_reward": 0.33705358766019344, | |
| "rewards/format_reward": 0.883928619325161, | |
| "rewards/tag_count_reward": 0.9352679029107094, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.8727874755859, | |
| "epoch": 0.9493333333333334, | |
| "grad_norm": 0.22955262660980225, | |
| "kl": 0.20166015625, | |
| "learning_rate": 1.5645211893123846e-07, | |
| "loss": 0.0064, | |
| "reward": 2.2371652722358704, | |
| "reward_std": 0.6186549700796604, | |
| "rewards/accuracy_reward": 0.4062500186264515, | |
| "rewards/format_reward": 0.8928571790456772, | |
| "rewards/tag_count_reward": 0.938058078289032, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.5625152587891, | |
| "epoch": 0.952, | |
| "grad_norm": 0.3843916952610016, | |
| "kl": 0.2706298828125, | |
| "learning_rate": 1.4045450920358917e-07, | |
| "loss": 0.0079, | |
| "reward": 2.1729911416769028, | |
| "reward_std": 0.5495161339640617, | |
| "rewards/accuracy_reward": 0.33035715855658054, | |
| "rewards/format_reward": 0.8995536044239998, | |
| "rewards/tag_count_reward": 0.9430804029107094, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1016.8459930419922, | |
| "epoch": 0.9546666666666667, | |
| "grad_norm": 0.20025238394737244, | |
| "kl": 0.229248046875, | |
| "learning_rate": 1.2531372756153458e-07, | |
| "loss": 0.0013, | |
| "reward": 2.2098215222358704, | |
| "reward_std": 0.5276618581265211, | |
| "rewards/accuracy_reward": 0.3258928693830967, | |
| "rewards/format_reward": 0.9263393208384514, | |
| "rewards/tag_count_reward": 0.957589328289032, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1009.1250305175781, | |
| "epoch": 0.9573333333333334, | |
| "grad_norm": 0.4796367585659027, | |
| "kl": 0.453369140625, | |
| "learning_rate": 1.1103108979056865e-07, | |
| "loss": 0.0055, | |
| "reward": 2.159040242433548, | |
| "reward_std": 0.6154494881629944, | |
| "rewards/accuracy_reward": 0.3526785857975483, | |
| "rewards/format_reward": 0.87276791036129, | |
| "rewards/tag_count_reward": 0.9335937947034836, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.2053833007812, | |
| "epoch": 0.96, | |
| "grad_norm": 0.24999898672103882, | |
| "kl": 0.257080078125, | |
| "learning_rate": 9.760783710056176e-08, | |
| "loss": 0.0102, | |
| "reward": 2.225446566939354, | |
| "reward_std": 0.49892666935920715, | |
| "rewards/accuracy_reward": 0.361607164144516, | |
| "rewards/format_reward": 0.9129464626312256, | |
| "rewards/tag_count_reward": 0.9508929029107094, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.8705520629883, | |
| "epoch": 0.9626666666666667, | |
| "grad_norm": 0.24726729094982147, | |
| "kl": 0.26513671875, | |
| "learning_rate": 8.504513601789388e-08, | |
| "loss": 0.0106, | |
| "reward": 2.1194197237491608, | |
| "reward_std": 0.6447071582078934, | |
| "rewards/accuracy_reward": 0.33035716228187084, | |
| "rewards/format_reward": 0.868303619325161, | |
| "rewards/tag_count_reward": 0.9207589626312256, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.9352798461914, | |
| "epoch": 0.9653333333333334, | |
| "grad_norm": 0.31679829955101013, | |
| "kl": 0.26416015625, | |
| "learning_rate": 7.334407828407885e-08, | |
| "loss": 0.009, | |
| "reward": 2.240513488650322, | |
| "reward_std": 0.5577768888324499, | |
| "rewards/accuracy_reward": 0.3816964514553547, | |
| "rewards/format_reward": 0.9062500447034836, | |
| "rewards/tag_count_reward": 0.952566996216774, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.8460083007812, | |
| "epoch": 0.968, | |
| "grad_norm": 0.2769128978252411, | |
| "kl": 0.240234375, | |
| "learning_rate": 6.250568076088814e-08, | |
| "loss": 0.0072, | |
| "reward": 2.214843839406967, | |
| "reward_std": 0.4956537261605263, | |
| "rewards/accuracy_reward": 0.33258930314332247, | |
| "rewards/format_reward": 0.9241071939468384, | |
| "rewards/tag_count_reward": 0.9581473618745804, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1014.8415374755859, | |
| "epoch": 0.9706666666666667, | |
| "grad_norm": 0.2232033759355545, | |
| "kl": 0.216552734375, | |
| "learning_rate": 5.2530885341982586e-08, | |
| "loss": 0.0007, | |
| "reward": 2.321986734867096, | |
| "reward_std": 0.5681647323071957, | |
| "rewards/accuracy_reward": 0.4352678768336773, | |
| "rewards/format_reward": 0.926339328289032, | |
| "rewards/tag_count_reward": 0.9603795111179352, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1019.8482208251953, | |
| "epoch": 0.9733333333333334, | |
| "grad_norm": 0.2994762063026428, | |
| "kl": 0.2735595703125, | |
| "learning_rate": 4.3420558871060116e-08, | |
| "loss": 0.0128, | |
| "reward": 2.224888503551483, | |
| "reward_std": 0.5850169435143471, | |
| "rewards/accuracy_reward": 0.37723216228187084, | |
| "rewards/format_reward": 0.9017857611179352, | |
| "rewards/tag_count_reward": 0.9458705857396126, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1021.2544784545898, | |
| "epoch": 0.976, | |
| "grad_norm": 0.21192388236522675, | |
| "kl": 0.2596435546875, | |
| "learning_rate": 3.517549306652157e-08, | |
| "loss": 0.0118, | |
| "reward": 2.1026787012815475, | |
| "reward_std": 0.5294410735368729, | |
| "rewards/accuracy_reward": 0.2544642947614193, | |
| "rewards/format_reward": 0.9040178880095482, | |
| "rewards/tag_count_reward": 0.9441964775323868, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.3058242797852, | |
| "epoch": 0.9786666666666667, | |
| "grad_norm": 0.34231218695640564, | |
| "kl": 0.2655029296875, | |
| "learning_rate": 2.7796404452666847e-08, | |
| "loss": 0.0085, | |
| "reward": 2.29241082072258, | |
| "reward_std": 0.617542814463377, | |
| "rewards/accuracy_reward": 0.43526787497103214, | |
| "rewards/format_reward": 0.9017857536673546, | |
| "rewards/tag_count_reward": 0.9553571864962578, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1011.8415374755859, | |
| "epoch": 0.9813333333333333, | |
| "grad_norm": 0.2850145101547241, | |
| "kl": 0.24853515625, | |
| "learning_rate": 2.1283934297432472e-08, | |
| "loss": 0.0066, | |
| "reward": 2.2695313692092896, | |
| "reward_std": 0.5283640064299107, | |
| "rewards/accuracy_reward": 0.38616072945296764, | |
| "rewards/format_reward": 0.9263393208384514, | |
| "rewards/tag_count_reward": 0.9570312947034836, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1020.8393020629883, | |
| "epoch": 0.984, | |
| "grad_norm": 0.23045480251312256, | |
| "kl": 0.2613525390625, | |
| "learning_rate": 1.5638648556656198e-08, | |
| "loss": 0.0103, | |
| "reward": 2.2566965371370316, | |
| "reward_std": 0.6659301854670048, | |
| "rewards/accuracy_reward": 0.43973216880112886, | |
| "rewards/format_reward": 0.883928619325161, | |
| "rewards/tag_count_reward": 0.9330357611179352, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1018.4531326293945, | |
| "epoch": 0.9866666666666667, | |
| "grad_norm": 0.4619045853614807, | |
| "kl": 0.31494140625, | |
| "learning_rate": 1.0861037824896337e-08, | |
| "loss": 0.0051, | |
| "reward": 2.2109376192092896, | |
| "reward_std": 0.6436006389558315, | |
| "rewards/accuracy_reward": 0.3973214514553547, | |
| "rewards/format_reward": 0.8816964775323868, | |
| "rewards/tag_count_reward": 0.9319196939468384, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1017.1629638671875, | |
| "epoch": 0.9893333333333333, | |
| "grad_norm": 0.4388638734817505, | |
| "kl": 0.2684326171875, | |
| "learning_rate": 6.951517292800303e-09, | |
| "loss": 0.0092, | |
| "reward": 2.20814748108387, | |
| "reward_std": 0.5858336836099625, | |
| "rewards/accuracy_reward": 0.3526785932481289, | |
| "rewards/format_reward": 0.910714328289032, | |
| "rewards/tag_count_reward": 0.9447545036673546, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1013.9419860839844, | |
| "epoch": 0.992, | |
| "grad_norm": 0.23698575794696808, | |
| "kl": 0.301513671875, | |
| "learning_rate": 3.9104267110168235e-09, | |
| "loss": 0.0068, | |
| "reward": 2.0736608058214188, | |
| "reward_std": 0.6049975231289864, | |
| "rewards/accuracy_reward": 0.310267869848758, | |
| "rewards/format_reward": 0.8526786118745804, | |
| "rewards/tag_count_reward": 0.910714328289032, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.5446701049805, | |
| "epoch": 0.9946666666666667, | |
| "grad_norm": 0.1941290646791458, | |
| "kl": 0.21368408203125, | |
| "learning_rate": 1.738030360677323e-09, | |
| "loss": 0.0099, | |
| "reward": 2.3370536863803864, | |
| "reward_std": 0.5505933277308941, | |
| "rewards/accuracy_reward": 0.4620535932481289, | |
| "rewards/format_reward": 0.9241071790456772, | |
| "rewards/tag_count_reward": 0.9508928954601288, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1015.7143173217773, | |
| "epoch": 0.9973333333333333, | |
| "grad_norm": 0.2728719115257263, | |
| "kl": 0.245849609375, | |
| "learning_rate": 4.3451703042207694e-10, | |
| "loss": -0.0058, | |
| "reward": 2.1406251341104507, | |
| "reward_std": 0.5760147906839848, | |
| "rewards/accuracy_reward": 0.29464287124574184, | |
| "rewards/format_reward": 0.8973214626312256, | |
| "rewards/tag_count_reward": 0.9486607536673546, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1024.0, | |
| "epoch": 1.0, | |
| "grad_norm": 0.2957329750061035, | |
| "kl": 0.3221435546875, | |
| "learning_rate": 0.0, | |
| "loss": 0.0103, | |
| "reward": 2.0814733505249023, | |
| "reward_std": 0.5692420080304146, | |
| "rewards/accuracy_reward": 0.2700892947614193, | |
| "rewards/format_reward": 0.8816964700818062, | |
| "rewards/tag_count_reward": 0.9296875447034836, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 375, | |
| "total_flos": 0.0, | |
| "train_loss": 17.059915766330747, | |
| "train_runtime": 37131.9859, | |
| "train_samples_per_second": 0.323, | |
| "train_steps_per_second": 0.01 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 375, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |