{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 674.7265625, "epoch": 0.0004, "grad_norm": 0.5231496039061674, "kl": 0.0, "learning_rate": 8e-08, "loss": -0.0103, "reward": 0.19921875, "reward_std": 0.3890402242541313, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0234375, "rewards/tag_count_reward": 0.14453125, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 731.328125, "epoch": 0.0008, "grad_norm": 0.47906263076570954, "kl": 0.0, "learning_rate": 1.6e-07, "loss": 0.0066, "reward": 0.21484375, "reward_std": 0.38167842477560043, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.046875, "rewards/tag_count_reward": 0.13671875, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 699.1484375, "epoch": 0.0012, "grad_norm": 0.4036705129381692, "kl": 0.00012040138244628906, "learning_rate": 2.4000000000000003e-07, "loss": 0.0211, "reward": 0.130859375, "reward_std": 0.2753865271806717, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.060546875, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 657.953125, "epoch": 0.0016, "grad_norm": 0.5919995398375534, "kl": 0.00014638900756835938, "learning_rate": 3.2e-07, "loss": 0.0331, "reward": 0.20703125, "reward_std": 0.43431489169597626, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0546875, "rewards/tag_count_reward": 0.15234375, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 669.609375, "epoch": 0.002, "grad_norm": 0.47977995251181765, "kl": 0.00011086463928222656, "learning_rate": 4.0000000000000003e-07, "loss": 0.0471, "reward": 0.216796875, "reward_std": 0.39057739078998566, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.107421875, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 659.75, "epoch": 0.0024, "grad_norm": 0.49905663124721933, "kl": 0.00013566017150878906, "learning_rate": 4.800000000000001e-07, "loss": 0.0004, "reward": 0.125, "reward_std": 0.2786017134785652, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.0859375, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 727.859375, "epoch": 0.0028, "grad_norm": 0.3691749077600384, "kl": 0.00014638900756835938, "learning_rate": 5.6e-07, "loss": 0.0421, "reward": 0.1015625, "reward_std": 0.21188171207904816, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.0625, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 627.6796875, "epoch": 0.0032, "grad_norm": 0.36625551665454514, "kl": 0.0001442432403564453, "learning_rate": 6.4e-07, "loss": 0.0515, "reward": 0.140625, "reward_std": 0.19210299476981163, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0859375, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 661.9921875, "epoch": 0.0036, "grad_norm": 0.5776602130299453, "kl": 0.00013053417205810547, "learning_rate": 7.2e-07, "loss": -0.0351, "reward": 0.08984375, "reward_std": 0.21971597895026207, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.05859375, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 677.1328125, "epoch": 0.004, "grad_norm": 0.4610714150762134, "kl": 0.00013625621795654297, "learning_rate": 8.000000000000001e-07, "loss": 0.0553, "reward": 0.21875, "reward_std": 0.388346541672945, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.0234375, "rewards/tag_count_reward": 0.125, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 557.0625, "epoch": 0.0044, "grad_norm": 0.7776853945889578, "kl": 0.00025153160095214844, "learning_rate": 8.8e-07, "loss": 0.0177, "reward": 0.466796875, "reward_std": 0.3718581274151802, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0390625, "rewards/tag_count_reward": 0.146484375, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 759.7109375, "epoch": 0.0048, "grad_norm": 0.43697818030344526, "kl": 0.00023674964904785156, "learning_rate": 9.600000000000001e-07, "loss": 0.061, "reward": 0.220703125, "reward_std": 0.4034613408148289, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0546875, "rewards/tag_count_reward": 0.126953125, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 715.8515625, "epoch": 0.0052, "grad_norm": 0.5331627296122858, "kl": 0.0006008148193359375, "learning_rate": 1.04e-06, "loss": 0.0808, "reward": 0.36328125, "reward_std": 0.43628619983792305, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.046875, "rewards/tag_count_reward": 0.19140625, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 679.1953125, "epoch": 0.0056, "grad_norm": 0.6115763138688094, "kl": 0.0012006759643554688, "learning_rate": 1.12e-06, "loss": 0.0404, "reward": 0.25390625, "reward_std": 0.470514141023159, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.046875, "rewards/tag_count_reward": 0.18359375, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 779.2578125, "epoch": 0.006, "grad_norm": 0.4156476334378649, "kl": 0.0013103485107421875, "learning_rate": 1.2000000000000002e-06, "loss": 0.0306, "reward": 0.21484375, "reward_std": 0.42150191962718964, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0625, "rewards/tag_count_reward": 0.11328125, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 687.3515625, "epoch": 0.0064, "grad_norm": 0.48086229296394656, "kl": 0.0073223114013671875, "learning_rate": 1.28e-06, "loss": -0.001, "reward": 0.337890625, "reward_std": 0.5084059983491898, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.078125, "rewards/tag_count_reward": 0.212890625, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 629.234375, "epoch": 0.0068, "grad_norm": 1.725193892660429, "kl": 0.04974365234375, "learning_rate": 1.3600000000000001e-06, "loss": 0.0256, "reward": 0.236328125, "reward_std": 0.41634494811296463, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0390625, "rewards/tag_count_reward": 0.197265625, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 612.75, "epoch": 0.0072, "grad_norm": 2.5262200137825257, "kl": 0.0819091796875, "learning_rate": 1.44e-06, "loss": 0.0381, "reward": 0.453125, "reward_std": 0.585490383207798, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.109375, "rewards/tag_count_reward": 0.3203125, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 652.59375, "epoch": 0.0076, "grad_norm": 1.7982864058848516, "kl": 0.05999755859375, "learning_rate": 1.52e-06, "loss": 0.0804, "reward": 0.376953125, "reward_std": 0.5275212079286575, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0703125, "rewards/tag_count_reward": 0.212890625, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 681.2890625, "epoch": 0.008, "grad_norm": 1.0504975931803886, "kl": 0.03924560546875, "learning_rate": 1.6000000000000001e-06, "loss": 0.0271, "reward": 0.330078125, "reward_std": 0.4561749994754791, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.078125, "rewards/tag_count_reward": 0.212890625, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 645.828125, "epoch": 0.0084, "grad_norm": 0.4071174231272071, "kl": 0.014404296875, "learning_rate": 1.6800000000000002e-06, "loss": 0.049, "reward": 0.294921875, "reward_std": 0.5098106935620308, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0625, "rewards/tag_count_reward": 0.185546875, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 697.3828125, "epoch": 0.0088, "grad_norm": 0.46486820588366157, "kl": 0.00738525390625, "learning_rate": 1.76e-06, "loss": 0.0404, "reward": 0.55078125, "reward_std": 0.5924507901072502, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.109375, "rewards/tag_count_reward": 0.26953125, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 556.1015625, "epoch": 0.0092, "grad_norm": 0.5897787589491361, "kl": 0.0091705322265625, "learning_rate": 1.8400000000000002e-06, "loss": 0.0653, "reward": 0.478515625, "reward_std": 0.5884353518486023, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.109375, "rewards/tag_count_reward": 0.244140625, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 585.4921875, "epoch": 0.0096, "grad_norm": 0.5128198378235748, "kl": 0.005218505859375, "learning_rate": 1.9200000000000003e-06, "loss": 0.048, "reward": 0.15625, "reward_std": 0.3811819367110729, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.0234375, "rewards/tag_count_reward": 0.109375, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 721.6875, "epoch": 0.01, "grad_norm": 0.42876292794791904, "kl": 0.00359344482421875, "learning_rate": 2.0000000000000003e-06, "loss": 0.0509, "reward": 0.208984375, "reward_std": 0.4613487794995308, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0546875, "rewards/tag_count_reward": 0.146484375, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 717.171875, "epoch": 0.0104, "grad_norm": 0.3283343589118685, "kl": 0.0043659210205078125, "learning_rate": 2.08e-06, "loss": 0.0207, "reward": 0.166015625, "reward_std": 0.2856166884303093, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.03125, "rewards/tag_count_reward": 0.087890625, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 708.9453125, "epoch": 0.0108, "grad_norm": 0.4833999718623918, "kl": 0.005542755126953125, "learning_rate": 2.16e-06, "loss": 0.027, "reward": 0.232421875, "reward_std": 0.42304350435733795, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0234375, "rewards/tag_count_reward": 0.099609375, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 598.8671875, "epoch": 0.0112, "grad_norm": 0.7078264401774912, "kl": 0.01612091064453125, "learning_rate": 2.24e-06, "loss": 0.1233, "reward": 0.4453125, "reward_std": 0.6048080921173096, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.125, "rewards/tag_count_reward": 0.3046875, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 634.828125, "epoch": 0.0116, "grad_norm": 0.5295919981233603, "kl": 0.012447357177734375, "learning_rate": 2.3200000000000002e-06, "loss": 0.0789, "reward": 0.375, "reward_std": 0.5588233880698681, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.109375, "rewards/tag_count_reward": 0.2578125, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 584.8046875, "epoch": 0.012, "grad_norm": 0.4860986788708241, "kl": 0.01303863525390625, "learning_rate": 2.4000000000000003e-06, "loss": 0.083, "reward": 0.4375, "reward_std": 0.6996572017669678, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.140625, "rewards/tag_count_reward": 0.2734375, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 647.2890625, "epoch": 0.0124, "grad_norm": 0.5404325110996678, "kl": 0.01079559326171875, "learning_rate": 2.4800000000000004e-06, "loss": 0.0377, "reward": 0.447265625, "reward_std": 0.5448299273848534, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.1015625, "rewards/tag_count_reward": 0.330078125, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 744.015625, "epoch": 0.0128, "grad_norm": 0.44086188077663874, "kl": 0.0094146728515625, "learning_rate": 2.56e-06, "loss": 0.0297, "reward": 0.447265625, "reward_std": 0.5865104496479034, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.1015625, "rewards/tag_count_reward": 0.322265625, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 605.3125, "epoch": 0.0132, "grad_norm": 0.5486261292142703, "kl": 0.0171966552734375, "learning_rate": 2.64e-06, "loss": 0.0301, "reward": 0.37109375, "reward_std": 0.5996888875961304, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.1015625, "rewards/tag_count_reward": 0.26171875, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 738.484375, "epoch": 0.0136, "grad_norm": 0.45004216170061107, "kl": 0.0157623291015625, "learning_rate": 2.7200000000000002e-06, "loss": 0.0429, "reward": 0.509765625, "reward_std": 0.5436728000640869, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.0625, "rewards/tag_count_reward": 0.306640625, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 535.46875, "epoch": 0.014, "grad_norm": 0.6948198455919858, "kl": 0.0369873046875, "learning_rate": 2.8000000000000003e-06, "loss": 0.1135, "reward": 0.53125, "reward_std": 0.6856220811605453, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.15625, "rewards/tag_count_reward": 0.3515625, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 501.53125, "epoch": 0.0144, "grad_norm": 0.7454874855133018, "kl": 0.036407470703125, "learning_rate": 2.88e-06, "loss": 0.088, "reward": 0.876953125, "reward_std": 0.7454017400741577, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.21875, "rewards/tag_count_reward": 0.509765625, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 596.03125, "epoch": 0.0148, "grad_norm": 8.79111969894152, "kl": 0.05908203125, "learning_rate": 2.96e-06, "loss": 0.1055, "reward": 0.6640625, "reward_std": 0.7139723747968674, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.1484375, "rewards/tag_count_reward": 0.359375, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 636.0390625, "epoch": 0.0152, "grad_norm": 1.077209933568181, "kl": 0.07049560546875, "learning_rate": 3.04e-06, "loss": 0.0195, "reward": 0.658203125, "reward_std": 0.7542373687028885, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.1953125, "rewards/tag_count_reward": 0.455078125, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 630.828125, "epoch": 0.0156, "grad_norm": 0.7639825445580255, "kl": 0.0924072265625, "learning_rate": 3.12e-06, "loss": 0.0945, "reward": 0.828125, "reward_std": 0.7309327721595764, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.234375, "rewards/tag_count_reward": 0.578125, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 492.90625, "epoch": 0.016, "grad_norm": 4.53418059950176, "kl": 0.251220703125, "learning_rate": 3.2000000000000003e-06, "loss": 0.0725, "reward": 0.962890625, "reward_std": 0.6842206120491028, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.3046875, "rewards/tag_count_reward": 0.634765625, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 582.3203125, "epoch": 0.0164, "grad_norm": 1.882647820271138, "kl": 0.218505859375, "learning_rate": 3.2800000000000004e-06, "loss": 0.0067, "reward": 0.7421875, "reward_std": 0.6279640719294548, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.1796875, "rewards/tag_count_reward": 0.4453125, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 557.8359375, "epoch": 0.0168, "grad_norm": 2.90345024527637, "kl": 0.247802734375, "learning_rate": 3.3600000000000004e-06, "loss": -0.0605, "reward": 0.728515625, "reward_std": 0.6447467356920242, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.1484375, "rewards/tag_count_reward": 0.556640625, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 476.765625, "epoch": 0.0172, "grad_norm": 0.7415325716664228, "kl": 0.2052001953125, "learning_rate": 3.44e-06, "loss": 0.0097, "reward": 0.85546875, "reward_std": 0.6563659012317657, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.203125, "rewards/tag_count_reward": 0.55859375, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 452.015625, "epoch": 0.0176, "grad_norm": 0.8874639422368854, "kl": 0.09130859375, "learning_rate": 3.52e-06, "loss": 0.0101, "reward": 0.72265625, "reward_std": 0.5616491511464119, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.09375, "rewards/tag_count_reward": 0.47265625, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 520.1875, "epoch": 0.018, "grad_norm": 1.4130621790937623, "kl": 0.1541748046875, "learning_rate": 3.6000000000000003e-06, "loss": -0.0693, "reward": 0.59375, "reward_std": 0.5132487565279007, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0703125, "rewards/tag_count_reward": 0.4921875, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 556.9140625, "epoch": 0.0184, "grad_norm": 5.707138120266389, "kl": 0.9417724609375, "learning_rate": 3.6800000000000003e-06, "loss": -0.0618, "reward": 0.98046875, "reward_std": 0.4248930290341377, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.03125, "rewards/tag_count_reward": 0.55859375, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 504.984375, "epoch": 0.0188, "grad_norm": 0.9299697400953494, "kl": 0.08148193359375, "learning_rate": 3.7600000000000004e-06, "loss": -0.1385, "reward": 0.619140625, "reward_std": 0.5043524503707886, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.03125, "rewards/tag_count_reward": 0.431640625, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 511.0625, "epoch": 0.0192, "grad_norm": 1.1383820596308956, "kl": 0.0758056640625, "learning_rate": 3.8400000000000005e-06, "loss": -0.0465, "reward": 0.716796875, "reward_std": 0.5158518701791763, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.0390625, "rewards/tag_count_reward": 0.607421875, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 656.9375, "epoch": 0.0196, "grad_norm": 0.7278406356658751, "kl": 0.0694580078125, "learning_rate": 3.920000000000001e-06, "loss": -0.0674, "reward": 0.626953125, "reward_std": 0.40301191806793213, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0234375, "rewards/tag_count_reward": 0.478515625, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 720.265625, "epoch": 0.02, "grad_norm": 0.5394220095136213, "kl": 0.0675048828125, "learning_rate": 4.000000000000001e-06, "loss": -0.0436, "reward": 0.55078125, "reward_std": 0.3636101186275482, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.50390625, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 566.5, "epoch": 0.0204, "grad_norm": 1.0368598928530224, "kl": 0.1026611328125, "learning_rate": 4.08e-06, "loss": -0.0425, "reward": 0.619140625, "reward_std": 0.32474544644355774, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.619140625, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 512.9296875, "epoch": 0.0208, "grad_norm": 1.621990828606246, "kl": 0.2642822265625, "learning_rate": 4.16e-06, "loss": -0.0281, "reward": 0.822265625, "reward_std": 0.41170164197683334, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.0703125, "rewards/tag_count_reward": 0.650390625, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 607.7734375, "epoch": 0.0212, "grad_norm": 0.9677869214480879, "kl": 0.1837158203125, "learning_rate": 4.24e-06, "loss": -0.0288, "reward": 0.697265625, "reward_std": 0.38212594389915466, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.611328125, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 622.6796875, "epoch": 0.0216, "grad_norm": 0.8567034214344293, "kl": 0.1361083984375, "learning_rate": 4.32e-06, "loss": -0.0124, "reward": 0.708984375, "reward_std": 0.35128459334373474, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.646484375, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 630.484375, "epoch": 0.022, "grad_norm": 0.902463690508282, "kl": 0.208984375, "learning_rate": 4.4e-06, "loss": -0.0002, "reward": 0.642578125, "reward_std": 0.320267628878355, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.580078125, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 624.03125, "epoch": 0.0224, "grad_norm": 1.2060958484282007, "kl": 0.1492919921875, "learning_rate": 4.48e-06, "loss": -0.0135, "reward": 0.646484375, "reward_std": 0.27161574363708496, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.607421875, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 648.484375, "epoch": 0.0228, "grad_norm": 14.339719819265241, "kl": 0.4093017578125, "learning_rate": 4.56e-06, "loss": -0.0282, "reward": 0.607421875, "reward_std": 0.22253268957138062, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.599609375, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 773.1796875, "epoch": 0.0232, "grad_norm": 9.722171745482036, "kl": 0.2969970703125, "learning_rate": 4.6400000000000005e-06, "loss": 0.0583, "reward": 0.544921875, "reward_std": 0.21164903789758682, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.544921875, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 692.9609375, "epoch": 0.0236, "grad_norm": 0.8465129582084994, "kl": 0.139404296875, "learning_rate": 4.7200000000000005e-06, "loss": 0.122, "reward": 0.568359375, "reward_std": 0.24419524148106575, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.552734375, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 692.390625, "epoch": 0.024, "grad_norm": 1.091533552080865, "kl": 0.150146484375, "learning_rate": 4.800000000000001e-06, "loss": 0.0804, "reward": 0.705078125, "reward_std": 0.19725558161735535, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580078125, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 674.546875, "epoch": 0.0244, "grad_norm": 4.607125590341557, "kl": 0.208984375, "learning_rate": 4.880000000000001e-06, "loss": -0.0119, "reward": 0.732421875, "reward_std": 0.3030276969075203, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.544921875, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 729.0, "epoch": 0.0248, "grad_norm": 0.5854045749309208, "kl": 0.07073974609375, "learning_rate": 4.960000000000001e-06, "loss": 0.0854, "reward": 0.59765625, "reward_std": 0.3086724951863289, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.55078125, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 675.5703125, "epoch": 0.0252, "grad_norm": 1.6019565166054728, "kl": 0.08111572265625, "learning_rate": 5.04e-06, "loss": 0.0963, "reward": 0.671875, "reward_std": 0.24871882051229477, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.640625, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 620.2578125, "epoch": 0.0256, "grad_norm": 0.7871298530729296, "kl": 0.115234375, "learning_rate": 5.12e-06, "loss": 0.0678, "reward": 0.6953125, "reward_std": 0.2564793489873409, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 558.078125, "epoch": 0.026, "grad_norm": 1.0642460864334267, "kl": 0.1134033203125, "learning_rate": 5.2e-06, "loss": -0.004, "reward": 0.7109375, "reward_std": 0.229571133852005, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.6796875, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 579.46875, "epoch": 0.0264, "grad_norm": 1.5731682269371667, "kl": 0.17962646484375, "learning_rate": 5.28e-06, "loss": -0.0119, "reward": 0.736328125, "reward_std": 0.10824790596961975, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.728515625, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 576.90625, "epoch": 0.0268, "grad_norm": 1.9265429020830558, "kl": 0.20849609375, "learning_rate": 5.36e-06, "loss": 0.0152, "reward": 0.697265625, "reward_std": 0.11995540745556355, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.697265625, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 559.4375, "epoch": 0.0272, "grad_norm": 1.7430510832596575, "kl": 0.6669921875, "learning_rate": 5.4400000000000004e-06, "loss": -0.0015, "reward": 0.744140625, "reward_std": 0.15409080311655998, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.712890625, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 569.7421875, "epoch": 0.0276, "grad_norm": 4.036067956128506, "kl": 3.29833984375, "learning_rate": 5.5200000000000005e-06, "loss": 0.0239, "reward": 0.84765625, "reward_std": 0.10187487304210663, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.72265625, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 530.7578125, "epoch": 0.028, "grad_norm": 1.5239655689552454, "kl": 0.9453125, "learning_rate": 5.600000000000001e-06, "loss": -0.0323, "reward": 0.853515625, "reward_std": 0.06243937276303768, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.728515625, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 591.0390625, "epoch": 0.0284, "grad_norm": 1.042277261186756, "kl": 0.45703125, "learning_rate": 5.68e-06, "loss": -0.0161, "reward": 0.708984375, "reward_std": 0.1284320019185543, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.701171875, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 654.875, "epoch": 0.0288, "grad_norm": 4.371618986497174, "kl": 1.953125, "learning_rate": 5.76e-06, "loss": 0.009, "reward": 0.75, "reward_std": 0.23136527091264725, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 605.359375, "epoch": 0.0292, "grad_norm": 5.846524794382842, "kl": 0.532958984375, "learning_rate": 5.84e-06, "loss": 0.0763, "reward": 0.72265625, "reward_std": 0.13826189562678337, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.70703125, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 545.84375, "epoch": 0.0296, "grad_norm": 5.681614664513492, "kl": 0.477783203125, "learning_rate": 5.92e-06, "loss": 0.1122, "reward": 0.763671875, "reward_std": 0.20316895470023155, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.708984375, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 610.09375, "epoch": 0.03, "grad_norm": 5.199397222965969, "kl": 0.749267578125, "learning_rate": 6e-06, "loss": 0.0859, "reward": 0.720703125, "reward_std": 0.08599158376455307, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720703125, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 692.578125, "epoch": 0.0304, "grad_norm": 54.19900241201727, "kl": 2.27685546875, "learning_rate": 6.08e-06, "loss": 0.1615, "reward": 0.751953125, "reward_std": 0.15809813141822815, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720703125, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 565.71875, "epoch": 0.0308, "grad_norm": 34.55958166207911, "kl": 1.308349609375, "learning_rate": 6.16e-06, "loss": 0.0907, "reward": 0.89453125, "reward_std": 0.1889151707291603, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 566.90625, "epoch": 0.0312, "grad_norm": 6.356299443532911, "kl": 0.722412109375, "learning_rate": 6.24e-06, "loss": 0.0615, "reward": 0.865234375, "reward_std": 0.14606688544154167, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716796875, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 699.46875, "epoch": 0.0316, "grad_norm": 432.0873870772095, "kl": 27.5625, "learning_rate": 6.3200000000000005e-06, "loss": 1.1713, "reward": 0.6484375, "reward_std": 0.19376936182379723, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.6328125, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 634.3828125, "epoch": 0.032, "grad_norm": 111237.71349737271, "kl": 531.625, "learning_rate": 6.4000000000000006e-06, "loss": 31.526, "reward": 0.814453125, "reward_std": 0.21638617292046547, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.650390625, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 731.3671875, "epoch": 0.0324, "grad_norm": 20633.34824799034, "kl": 159.703125, "learning_rate": 6.480000000000001e-06, "loss": 7.8791, "reward": 0.6875, "reward_std": 0.3233315870165825, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.0234375, "rewards/tag_count_reward": 0.5, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 770.546875, "epoch": 0.0328, "grad_norm": 276.4773090350322, "kl": 9.0546875, "learning_rate": 6.560000000000001e-06, "loss": 0.4148, "reward": 0.37109375, "reward_std": 0.269182775169611, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.34765625, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 772.5703125, "epoch": 0.0332, "grad_norm": 10.204809969901708, "kl": 3.73046875, "learning_rate": 6.640000000000001e-06, "loss": 0.1547, "reward": 0.33203125, "reward_std": 0.2076178900897503, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.33203125, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 726.328125, "epoch": 0.0336, "grad_norm": 16.846294810753147, "kl": 2.58984375, "learning_rate": 6.720000000000001e-06, "loss": -0.0053, "reward": 0.3359375, "reward_std": 0.2300342656672001, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3359375, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 841.3671875, "epoch": 0.034, "grad_norm": 267.7937440896597, "kl": 1.931640625, "learning_rate": 6.800000000000001e-06, "loss": 0.1208, "reward": 0.33203125, "reward_std": 0.22728992998600006, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.33203125, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 662.03125, "epoch": 0.0344, "grad_norm": 35.07525657534382, "kl": 1.376953125, "learning_rate": 6.88e-06, "loss": 0.1865, "reward": 0.49609375, "reward_std": 0.2581078112125397, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.36328125, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 707.4609375, "epoch": 0.0348, "grad_norm": 11.957668775015751, "kl": 0.8671875, "learning_rate": 6.96e-06, "loss": 0.114, "reward": 0.41015625, "reward_std": 0.25333084911108017, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.40234375, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 650.5390625, "epoch": 0.0352, "grad_norm": 24.357901321036028, "kl": 1.357421875, "learning_rate": 7.04e-06, "loss": 0.0698, "reward": 0.388671875, "reward_std": 0.2365647740662098, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.380859375, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 700.3671875, "epoch": 0.0356, "grad_norm": 167.9944214400145, "kl": 2.36328125, "learning_rate": 7.1200000000000004e-06, "loss": 0.1576, "reward": 0.65234375, "reward_std": 0.2375967726111412, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.38671875, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 696.7578125, "epoch": 0.036, "grad_norm": 13.399532704809337, "kl": 0.7607421875, "learning_rate": 7.2000000000000005e-06, "loss": 0.0573, "reward": 0.357421875, "reward_std": 0.2505216412246227, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.349609375, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 702.7578125, "epoch": 0.0364, "grad_norm": 14.62273036879086, "kl": 1.0380859375, "learning_rate": 7.280000000000001e-06, "loss": 0.1265, "reward": 0.490234375, "reward_std": 0.20841382443904877, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.365234375, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 612.515625, "epoch": 0.0368, "grad_norm": 13.460945902001148, "kl": 1.1142578125, "learning_rate": 7.360000000000001e-06, "loss": 0.115, "reward": 0.392578125, "reward_std": 0.22453027218580246, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392578125, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 612.6875, "epoch": 0.0372, "grad_norm": 1450979.8942930019, "kl": 2005.9765625, "learning_rate": 7.440000000000001e-06, "loss": 113.9845, "reward": 0.400390625, "reward_std": 0.2322065234184265, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.400390625, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 490.53125, "epoch": 0.0376, "grad_norm": 11.501905488854812, "kl": 1.064453125, "learning_rate": 7.520000000000001e-06, "loss": 0.2342, "reward": 0.416015625, "reward_std": 0.25477826967835426, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.400390625, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 626.6015625, "epoch": 0.038, "grad_norm": 12.972322471778707, "kl": 0.8359375, "learning_rate": 7.600000000000001e-06, "loss": 0.1, "reward": 0.525390625, "reward_std": 0.21956948935985565, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.400390625, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 628.3359375, "epoch": 0.0384, "grad_norm": 13.94164550562132, "kl": 0.9951171875, "learning_rate": 7.680000000000001e-06, "loss": 0.1009, "reward": 0.365234375, "reward_std": 0.22382405772805214, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.365234375, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 691.1796875, "epoch": 0.0388, "grad_norm": 7.119539565510979, "kl": 1.33984375, "learning_rate": 7.76e-06, "loss": 0.0489, "reward": 0.517578125, "reward_std": 0.24044308811426163, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392578125, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 672.5703125, "epoch": 0.0392, "grad_norm": 9.980227865913736, "kl": 0.7109375, "learning_rate": 7.840000000000001e-06, "loss": 0.0278, "reward": 0.40234375, "reward_std": 0.2733629271388054, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.38671875, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 658.421875, "epoch": 0.0396, "grad_norm": 6.077335568501444, "kl": 0.9228515625, "learning_rate": 7.92e-06, "loss": 0.1578, "reward": 0.560546875, "reward_std": 0.23913879320025444, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.435546875, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 583.2109375, "epoch": 0.04, "grad_norm": 4.271039327343523, "kl": 1.0380859375, "learning_rate": 8.000000000000001e-06, "loss": 0.1161, "reward": 0.400390625, "reward_std": 0.25792963430285454, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.400390625, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 558.671875, "epoch": 0.0404, "grad_norm": 7.194062498146512, "kl": 0.751953125, "learning_rate": 8.08e-06, "loss": 0.1133, "reward": 0.453125, "reward_std": 0.23212339356541634, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4453125, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 409.0078125, "epoch": 0.0408, "grad_norm": 19.09437568733509, "kl": 0.73828125, "learning_rate": 8.16e-06, "loss": 0.1586, "reward": 0.46484375, "reward_std": 0.2899658977985382, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.44140625, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 492.7421875, "epoch": 0.0412, "grad_norm": 33.26401969903997, "kl": 2.40478515625, "learning_rate": 8.24e-06, "loss": 0.2538, "reward": 0.529296875, "reward_std": 0.3060803487896919, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 412.3828125, "epoch": 0.0416, "grad_norm": 20.502492749196513, "kl": 1.435546875, "learning_rate": 8.32e-06, "loss": 0.0996, "reward": 0.513671875, "reward_std": 0.2512445002794266, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.505859375, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 448.6796875, "epoch": 0.042, "grad_norm": 6.817441870395461, "kl": 0.7880859375, "learning_rate": 8.400000000000001e-06, "loss": 0.0544, "reward": 0.615234375, "reward_std": 0.25487102195620537, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.482421875, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 487.390625, "epoch": 0.0424, "grad_norm": 16.37715191577169, "kl": 0.5458984375, "learning_rate": 8.48e-06, "loss": 0.0964, "reward": 0.541015625, "reward_std": 0.2986927516758442, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 402.2578125, "epoch": 0.0428, "grad_norm": 3.5872707107264907, "kl": 0.67529296875, "learning_rate": 8.560000000000001e-06, "loss": -0.1415, "reward": 0.6953125, "reward_std": 0.2266143225133419, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 490.1328125, "epoch": 0.0432, "grad_norm": 5.866068219297919, "kl": 0.3447265625, "learning_rate": 8.64e-06, "loss": -0.0684, "reward": 0.599609375, "reward_std": 0.35185717046260834, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.544921875, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 527.3984375, "epoch": 0.0436, "grad_norm": 7.392038528558454, "kl": 0.50439453125, "learning_rate": 8.720000000000001e-06, "loss": -0.0606, "reward": 0.587890625, "reward_std": 0.26846133172512054, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580078125, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 481.421875, "epoch": 0.044, "grad_norm": 32.90303213285144, "kl": 1.395263671875, "learning_rate": 8.8e-06, "loss": -0.0671, "reward": 0.58984375, "reward_std": 0.3163483962416649, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.54296875, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 443.1796875, "epoch": 0.0444, "grad_norm": 1.7383638781816932, "kl": 0.38671875, "learning_rate": 8.880000000000001e-06, "loss": -0.2494, "reward": 0.591796875, "reward_std": 0.3358397036790848, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.544921875, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 547.046875, "epoch": 0.0448, "grad_norm": 1.082139590288404, "kl": 0.3203125, "learning_rate": 8.96e-06, "loss": -0.2546, "reward": 0.5546875, "reward_std": 0.29279758036136627, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 480.9921875, "epoch": 0.0452, "grad_norm": 1.009747039279418, "kl": 0.4599609375, "learning_rate": 9.040000000000002e-06, "loss": -0.2913, "reward": 0.53515625, "reward_std": 0.27896567434072495, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.52734375, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 401.4140625, "epoch": 0.0456, "grad_norm": 1.3549411137064022, "kl": 1.0673828125, "learning_rate": 9.12e-06, "loss": -0.4163, "reward": 0.6484375, "reward_std": 0.28427136689424515, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 395.6171875, "epoch": 0.046, "grad_norm": 2.6995566050398407, "kl": 2.716796875, "learning_rate": 9.200000000000002e-06, "loss": -0.3906, "reward": 0.705078125, "reward_std": 0.3330542892217636, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.548828125, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 294.765625, "epoch": 0.0464, "grad_norm": 11.192302189977733, "kl": 12.46875, "learning_rate": 9.280000000000001e-06, "loss": -0.6292, "reward": 0.541015625, "reward_std": 0.35923682153224945, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.416015625, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 360.0625, "epoch": 0.0468, "grad_norm": 16.205538882228158, "kl": 23.4140625, "learning_rate": 9.360000000000002e-06, "loss": -0.5913, "reward": 0.451171875, "reward_std": 0.34164977073669434, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.451171875, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 406.8515625, "epoch": 0.0472, "grad_norm": 2.352652561071061, "kl": 5.5546875, "learning_rate": 9.440000000000001e-06, "loss": -0.7252, "reward": 0.443359375, "reward_std": 0.36970148235559464, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.435546875, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 364.7421875, "epoch": 0.0476, "grad_norm": 6.178975142117124, "kl": 2.578125, "learning_rate": 9.52e-06, "loss": -0.5354, "reward": 0.5390625, "reward_std": 0.33910442888736725, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53125, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 418.7421875, "epoch": 0.048, "grad_norm": 2.0133985202026117, "kl": 1.7216796875, "learning_rate": 9.600000000000001e-06, "loss": -0.3808, "reward": 0.60546875, "reward_std": 0.30514464527368546, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59765625, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 303.1484375, "epoch": 0.0484, "grad_norm": 2.292335360066931, "kl": 2.9404296875, "learning_rate": 9.68e-06, "loss": -0.3916, "reward": 0.662109375, "reward_std": 0.4050801545381546, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0625, "rewards/tag_count_reward": 0.591796875, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 549.421875, "epoch": 0.0488, "grad_norm": 0.6646235208884022, "kl": 0.99560546875, "learning_rate": 9.760000000000001e-06, "loss": -0.2138, "reward": 0.791015625, "reward_std": 0.3274690583348274, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595703125, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 590.390625, "epoch": 0.0492, "grad_norm": 1.5256230556580617, "kl": 1.2626953125, "learning_rate": 9.84e-06, "loss": -0.0894, "reward": 0.771484375, "reward_std": 0.3768974803388119, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.0390625, "rewards/tag_count_reward": 0.662109375, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 428.3046875, "epoch": 0.0496, "grad_norm": 1023.9195110872043, "kl": 53.7861328125, "learning_rate": 9.920000000000002e-06, "loss": 3.4749, "reward": 0.654296875, "reward_std": 0.20912879705429077, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.646484375, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 355.6015625, "epoch": 0.05, "grad_norm": 18.93388412708911, "kl": 1.54296875, "learning_rate": 1e-05, "loss": 0.0348, "reward": 0.798828125, "reward_std": 0.329216904938221, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.046875, "rewards/tag_count_reward": 0.658203125, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 425.9921875, "epoch": 0.0504, "grad_norm": 15.738904969269706, "kl": 1.5625, "learning_rate": 1.008e-05, "loss": -0.0753, "reward": 0.7421875, "reward_std": 0.35137344896793365, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.078125, "rewards/tag_count_reward": 0.6640625, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 421.6171875, "epoch": 0.0508, "grad_norm": 1.675192719552423, "kl": 3.31591796875, "learning_rate": 1.0160000000000001e-05, "loss": -0.0767, "reward": 0.716796875, "reward_std": 0.23538538813591003, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.03125, "rewards/tag_count_reward": 0.685546875, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 497.9140625, "epoch": 0.0512, "grad_norm": 0.5644682692919389, "kl": 0.4398193359375, "learning_rate": 1.024e-05, "loss": -0.0597, "reward": 0.783203125, "reward_std": 0.34025509282946587, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.078125, "rewards/tag_count_reward": 0.705078125, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 515.09375, "epoch": 0.0516, "grad_norm": 1.601842283595738, "kl": 3.77734375, "learning_rate": 1.0320000000000001e-05, "loss": -0.0467, "reward": 1.216796875, "reward_std": 0.6119532287120819, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.34375, "rewards/tag_count_reward": 0.740234375, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 488.5546875, "epoch": 0.052, "grad_norm": 0.6799926754077729, "kl": 0.22119140625, "learning_rate": 1.04e-05, "loss": -0.0531, "reward": 1.80078125, "reward_std": 0.669259250164032, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.71875, "rewards/tag_count_reward": 0.85546875, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 385.2734375, "epoch": 0.0524, "grad_norm": 0.6576096269912888, "kl": 0.3134765625, "learning_rate": 1.0480000000000001e-05, "loss": -0.1155, "reward": 1.896484375, "reward_std": 0.4434613138437271, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.826171875, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 327.5390625, "epoch": 0.0528, "grad_norm": 0.8657910361938828, "kl": 0.319091796875, "learning_rate": 1.056e-05, "loss": 0.0016, "reward": 1.880859375, "reward_std": 0.35563142225146294, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.802734375, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 341.625, "epoch": 0.0532, "grad_norm": 1.9638323762149943, "kl": 0.718017578125, "learning_rate": 1.0640000000000001e-05, "loss": 0.0508, "reward": 1.75, "reward_std": 0.3982636295258999, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.8125, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 277.15625, "epoch": 0.0536, "grad_norm": 2.143952983691835, "kl": 0.58056640625, "learning_rate": 1.072e-05, "loss": 0.0062, "reward": 1.93359375, "reward_std": 0.3538047317415476, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.93359375, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 209.9375, "epoch": 0.054, "grad_norm": 1.553557789546441, "kl": 1.4365234375, "learning_rate": 1.0800000000000002e-05, "loss": -0.0763, "reward": 1.640625, "reward_std": 0.6348458081483841, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.8828125, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 226.21875, "epoch": 0.0544, "grad_norm": 1.1566011138342784, "kl": 2.33203125, "learning_rate": 1.0880000000000001e-05, "loss": -0.109, "reward": 1.560546875, "reward_std": 0.6160432696342468, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.880859375, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 208.9453125, "epoch": 0.0548, "grad_norm": 2.148692426287572, "kl": 1.115234375, "learning_rate": 1.0960000000000002e-05, "loss": -0.0973, "reward": 1.447265625, "reward_std": 0.6912604719400406, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.578125, "rewards/tag_count_reward": 0.845703125, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 245.28125, "epoch": 0.0552, "grad_norm": 3.287049487617892, "kl": 1.674072265625, "learning_rate": 1.1040000000000001e-05, "loss": -0.201, "reward": 1.55078125, "reward_std": 0.6164726689457893, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.87109375, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 176.765625, "epoch": 0.0556, "grad_norm": 1.7763853749024336, "kl": 0.78662109375, "learning_rate": 1.1120000000000002e-05, "loss": -0.0876, "reward": 1.5546875, "reward_std": 0.64179827272892, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.875, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 211.1640625, "epoch": 0.056, "grad_norm": 1.0314169933510107, "kl": 0.435546875, "learning_rate": 1.1200000000000001e-05, "loss": -0.0878, "reward": 1.736328125, "reward_std": 0.5674032047390938, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.916015625, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 181.0, "epoch": 0.0564, "grad_norm": 1.9006624026869217, "kl": 1.18603515625, "learning_rate": 1.128e-05, "loss": -0.2571, "reward": 1.69921875, "reward_std": 0.5478394776582718, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.88671875, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 219.1640625, "epoch": 0.0568, "grad_norm": 0.9137212985131372, "kl": 0.349609375, "learning_rate": 1.136e-05, "loss": -0.0949, "reward": 2.09375, "reward_std": 0.31974536553025246, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.953125, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 204.15625, "epoch": 0.0572, "grad_norm": 1.643124977516558, "kl": 1.44091796875, "learning_rate": 1.144e-05, "loss": -0.0946, "reward": 2.087890625, "reward_std": 0.343445360660553, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 242.6484375, "epoch": 0.0576, "grad_norm": 2.715242824287215, "kl": 1.08984375, "learning_rate": 1.152e-05, "loss": -0.028, "reward": 2.06640625, "reward_std": 0.234375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.98046875, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 228.8671875, "epoch": 0.058, "grad_norm": 0.9621091075065581, "kl": 0.377197265625, "learning_rate": 1.16e-05, "loss": 0.0059, "reward": 1.966796875, "reward_std": 0.1666145622730255, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 243.78125, "epoch": 0.0584, "grad_norm": 72.52018796477122, "kl": 9.13427734375, "learning_rate": 1.168e-05, "loss": 0.2789, "reward": 2.033203125, "reward_std": 0.2817709818482399, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.955078125, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 288.734375, "epoch": 0.0588, "grad_norm": 92.99629907555808, "kl": 2.0634765625, "learning_rate": 1.1760000000000001e-05, "loss": 0.0384, "reward": 1.99609375, "reward_std": 0.2356489971280098, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.86328125, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 239.1171875, "epoch": 0.0592, "grad_norm": 5.615276794505801, "kl": 0.61865234375, "learning_rate": 1.184e-05, "loss": 0.0605, "reward": 2.0546875, "reward_std": 0.23874729126691818, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 214.8984375, "epoch": 0.0596, "grad_norm": 1.9396251006392768, "kl": 0.3193359375, "learning_rate": 1.1920000000000001e-05, "loss": -0.0011, "reward": 2.017578125, "reward_std": 0.30113694071769714, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 252.78125, "epoch": 0.06, "grad_norm": 1.4091115469179178, "kl": 0.576171875, "learning_rate": 1.2e-05, "loss": -0.0636, "reward": 2.044921875, "reward_std": 0.25612766295671463, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.974609375, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 242.0390625, "epoch": 0.0604, "grad_norm": 6.0193484921822265, "kl": 1.23583984375, "learning_rate": 1.2080000000000001e-05, "loss": 0.0731, "reward": 2.009765625, "reward_std": 0.29157526046037674, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.970703125, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 234.3671875, "epoch": 0.0608, "grad_norm": 675.5557738898508, "kl": 16.875, "learning_rate": 1.216e-05, "loss": 0.9658, "reward": 1.4296875, "reward_std": 0.7547721266746521, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.640625, "rewards/tag_count_reward": 0.7734375, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 152.8203125, "epoch": 0.0612, "grad_norm": 65.24410950408148, "kl": 10.828125, "learning_rate": 1.2240000000000001e-05, "loss": 0.5494, "reward": 1.517578125, "reward_std": 0.7220761626958847, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.3984375, "rewards/tag_count_reward": 0.744140625, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 142.453125, "epoch": 0.0616, "grad_norm": 19.10636798591105, "kl": 5.609375, "learning_rate": 1.232e-05, "loss": 0.5172, "reward": 0.86328125, "reward_std": 0.7119172066450119, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.2578125, "rewards/tag_count_reward": 0.60546875, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 98.671875, "epoch": 0.062, "grad_norm": 19.071041134679355, "kl": 2.7890625, "learning_rate": 1.2400000000000002e-05, "loss": 0.3933, "reward": 1.244140625, "reward_std": 0.8108646273612976, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.4296875, "rewards/tag_count_reward": 0.736328125, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 109.296875, "epoch": 0.0624, "grad_norm": 770104358243.8707, "kl": 2470834176.609375, "learning_rate": 1.248e-05, "loss": 43537836.0, "reward": 1.28515625, "reward_std": 0.7307612746953964, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.4140625, "rewards/tag_count_reward": 0.74609375, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 111.09375, "epoch": 0.0628, "grad_norm": 32779661034124.848, "kl": 163208757249.77344, "learning_rate": 1.2560000000000002e-05, "loss": 6424815104.0, "reward": 1.224609375, "reward_std": 0.7272335737943649, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.4453125, "rewards/tag_count_reward": 0.771484375, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 99.9296875, "epoch": 0.0632, "grad_norm": 342637338.4598142, "kl": 395473.171875, "learning_rate": 1.2640000000000001e-05, "loss": 12539.1377, "reward": 0.978515625, "reward_std": 0.667610839009285, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.28125, "rewards/tag_count_reward": 0.697265625, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 86.7578125, "epoch": 0.0636, "grad_norm": 635.8985981676134, "kl": 4.619140625, "learning_rate": 1.2720000000000002e-05, "loss": 0.2975, "reward": 0.650390625, "reward_std": 0.2548310048878193, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.517578125, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 94.34375, "epoch": 0.064, "grad_norm": 459835606.0337409, "kl": 4489219.3828125, "learning_rate": 1.2800000000000001e-05, "loss": 186994.8125, "reward": 0.60546875, "reward_std": 0.3384179472923279, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0234375, "rewards/tag_count_reward": 0.53515625, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 131.3125, "epoch": 0.0644, "grad_norm": 4292432039.183377, "kl": 6094858.5859375, "learning_rate": 1.2880000000000002e-05, "loss": 223627.3594, "reward": 0.708984375, "reward_std": 0.43561235070228577, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0859375, "rewards/tag_count_reward": 0.623046875, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 130.84375, "epoch": 0.0648, "grad_norm": 116.64433566484445, "kl": 4.833984375, "learning_rate": 1.2960000000000001e-05, "loss": 0.8697, "reward": 0.75390625, "reward_std": 0.2579278349876404, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.60546875, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 115.6171875, "epoch": 0.0652, "grad_norm": 2002369.530229485, "kl": 2993.439453125, "learning_rate": 1.3040000000000002e-05, "loss": 215.9718, "reward": 0.595703125, "reward_std": 0.2516294829547405, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0234375, "rewards/tag_count_reward": 0.572265625, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 146.4453125, "epoch": 0.0656, "grad_norm": 5.181887489359951, "kl": 1.98828125, "learning_rate": 1.3120000000000001e-05, "loss": 0.3657, "reward": 0.669921875, "reward_std": 0.1762286238372326, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.544921875, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 115.625, "epoch": 0.066, "grad_norm": 44.73300845053845, "kl": 3.3046875, "learning_rate": 1.3200000000000002e-05, "loss": 0.4451, "reward": 0.603515625, "reward_std": 0.22323543205857277, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.572265625, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 148.1640625, "epoch": 0.0664, "grad_norm": 114791858922.35358, "kl": 352456705.13671875, "learning_rate": 1.3280000000000002e-05, "loss": 10851641.0, "reward": 0.591796875, "reward_std": 0.20064478367567062, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.583984375, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 217.8515625, "epoch": 0.0668, "grad_norm": 6.920283163966772, "kl": 2.09375, "learning_rate": 1.3360000000000003e-05, "loss": 0.5148, "reward": 0.603515625, "reward_std": 0.1674797534942627, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455078125, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 132.484375, "epoch": 0.0672, "grad_norm": 5.98223111695251, "kl": 1.779296875, "learning_rate": 1.3440000000000002e-05, "loss": 0.2318, "reward": 0.322265625, "reward_std": 0.1508117001503706, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.314453125, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 149.4375, "epoch": 0.0676, "grad_norm": 35.57096784981761, "kl": 3.33984375, "learning_rate": 1.3520000000000003e-05, "loss": 0.3447, "reward": 0.291015625, "reward_std": 0.12642239034175873, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.291015625, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 77.625, "epoch": 0.068, "grad_norm": 9.163955645084414, "kl": 1.91796875, "learning_rate": 1.3600000000000002e-05, "loss": 0.229, "reward": 0.3125, "reward_std": 0.1296292506158352, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 64.25, "epoch": 0.0684, "grad_norm": 7.257207216238163, "kl": 2.064453125, "learning_rate": 1.3680000000000003e-05, "loss": 0.2183, "reward": 0.4453125, "reward_std": 0.13211318291723728, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3203125, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 58.4140625, "epoch": 0.0688, "grad_norm": 23.073901699112152, "kl": 1.970703125, "learning_rate": 1.376e-05, "loss": 0.1304, "reward": 0.650390625, "reward_std": 0.1298168394714594, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.400390625, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 61.4296875, "epoch": 0.0692, "grad_norm": 26.989762540994025, "kl": 7.552734375, "learning_rate": 1.384e-05, "loss": 0.1026, "reward": 0.462890625, "reward_std": 0.09830783493816853, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462890625, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 65.3203125, "epoch": 0.0696, "grad_norm": 24.244484041432866, "kl": 6.3671875, "learning_rate": 1.392e-05, "loss": -0.0283, "reward": 0.583984375, "reward_std": 0.09549626894295216, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.458984375, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 66.578125, "epoch": 0.07, "grad_norm": 214.6579220791913, "kl": 14.625, "learning_rate": 1.4e-05, "loss": 0.1173, "reward": 0.552734375, "reward_std": 0.11566018685698509, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.427734375, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 66.1640625, "epoch": 0.0704, "grad_norm": 5.87025949650913, "kl": 2.037109375, "learning_rate": 1.408e-05, "loss": -0.0871, "reward": 0.44140625, "reward_std": 0.10923500545322895, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 68.125, "epoch": 0.0708, "grad_norm": 2.861016596599971, "kl": 2.060546875, "learning_rate": 1.416e-05, "loss": -0.0953, "reward": 0.447265625, "reward_std": 0.12198552303016186, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.447265625, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 61.765625, "epoch": 0.0712, "grad_norm": 8.153490058143856, "kl": 2.30859375, "learning_rate": 1.4240000000000001e-05, "loss": 0.0007, "reward": 0.474609375, "reward_std": 0.07219875976443291, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 61.1015625, "epoch": 0.0716, "grad_norm": 11.117810961867841, "kl": 5.53125, "learning_rate": 1.432e-05, "loss": 0.1083, "reward": 0.6171875, "reward_std": 0.03726658131927252, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 58.53125, "epoch": 0.072, "grad_norm": 9.7841712106801, "kl": 2.0859375, "learning_rate": 1.4400000000000001e-05, "loss": 0.0646, "reward": 0.603515625, "reward_std": 0.06024399772286415, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 60.53125, "epoch": 0.0724, "grad_norm": 51.36410988004689, "kl": 5.177734375, "learning_rate": 1.448e-05, "loss": 0.1236, "reward": 0.61328125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 96.46875, "epoch": 0.0728, "grad_norm": 12.92268729152049, "kl": 7.25, "learning_rate": 1.4560000000000001e-05, "loss": 0.2427, "reward": 0.5, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 167.7265625, "epoch": 0.0732, "grad_norm": 15.929746224728156, "kl": 6.4140625, "learning_rate": 1.464e-05, "loss": 0.0607, "reward": 0.48828125, "reward_std": 0.04192390665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 163.1015625, "epoch": 0.0736, "grad_norm": 13.85497904296788, "kl": 4.6953125, "learning_rate": 1.4720000000000001e-05, "loss": 0.0725, "reward": 0.46875, "reward_std": 0.07663769461214542, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 178.4921875, "epoch": 0.074, "grad_norm": 2.1343696962785166, "kl": 5.5, "learning_rate": 1.48e-05, "loss": 0.2009, "reward": 0.486328125, "reward_std": 0.0695006437599659, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 187.421875, "epoch": 0.0744, "grad_norm": 2.743703912634402, "kl": 6.1796875, "learning_rate": 1.4880000000000002e-05, "loss": 0.1832, "reward": 0.486328125, "reward_std": 0.049755752086639404, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 222.3984375, "epoch": 0.0748, "grad_norm": 1.8698356937700655, "kl": 5.609375, "learning_rate": 1.496e-05, "loss": 0.2069, "reward": 0.6171875, "reward_std": 0.024809550493955612, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 191.328125, "epoch": 0.0752, "grad_norm": 87.98445370860689, "kl": 253.7109375, "learning_rate": 1.5040000000000002e-05, "loss": 0.5762, "reward": 0.49609375, "reward_std": 0.051483154296875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 171.1640625, "epoch": 0.0756, "grad_norm": 6.8469258760385205, "kl": 5.78125, "learning_rate": 1.5120000000000001e-05, "loss": 0.1648, "reward": 0.619140625, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 204.0, "epoch": 0.076, "grad_norm": 2.719000124002643, "kl": 4.8359375, "learning_rate": 1.5200000000000002e-05, "loss": 0.1934, "reward": 0.873046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 241.625, "epoch": 0.0764, "grad_norm": 3.1974351470144438, "kl": 5.6015625, "learning_rate": 1.5280000000000003e-05, "loss": 0.2245, "reward": 0.501953125, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.501953125, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 233.0625, "epoch": 0.0768, "grad_norm": 3.6809385474645278, "kl": 5.03125, "learning_rate": 1.5360000000000002e-05, "loss": 0.1705, "reward": 0.49609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 256.9765625, "epoch": 0.0772, "grad_norm": 1.7714768163088241, "kl": 5.078125, "learning_rate": 1.544e-05, "loss": 0.1805, "reward": 0.49609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 227.578125, "epoch": 0.0776, "grad_norm": 0.3225522722550592, "kl": 5.28125, "learning_rate": 1.552e-05, "loss": 0.2001, "reward": 0.498046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 298.8125, "epoch": 0.078, "grad_norm": 0.31840309740608264, "kl": 4.9609375, "learning_rate": 1.5600000000000003e-05, "loss": 0.1993, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 257.9375, "epoch": 0.0784, "grad_norm": 0.5635544040028513, "kl": 4.9765625, "learning_rate": 1.5680000000000002e-05, "loss": 0.2011, "reward": 0.498046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 316.0234375, "epoch": 0.0788, "grad_norm": 0.9596133616741815, "kl": 5.015625, "learning_rate": 1.576e-05, "loss": 0.1606, "reward": 0.619140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 360.25, "epoch": 0.0792, "grad_norm": 1.4056633295065286, "kl": 4.1484375, "learning_rate": 1.584e-05, "loss": 0.1679, "reward": 0.494140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 449.1171875, "epoch": 0.0796, "grad_norm": 1.7873854525977844, "kl": 4.0390625, "learning_rate": 1.5920000000000003e-05, "loss": 0.1364, "reward": 0.49609375, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 427.8125, "epoch": 0.08, "grad_norm": 5.27172835527012, "kl": 3.890625, "learning_rate": 1.6000000000000003e-05, "loss": 0.0611, "reward": 0.615234375, "reward_std": 0.08018150180578232, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 477.9296875, "epoch": 0.0804, "grad_norm": 0.6581602794874679, "kl": 4.53515625, "learning_rate": 1.6080000000000002e-05, "loss": 0.1266, "reward": 0.623046875, "reward_std": 0.03411140665411949, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 411.3125, "epoch": 0.0808, "grad_norm": 2.0207990576365256, "kl": 4.71875, "learning_rate": 1.616e-05, "loss": 0.1942, "reward": 0.52734375, "reward_std": 0.11519249156117439, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.50390625, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 616.90625, "epoch": 0.0812, "grad_norm": 15.950317252836898, "kl": 4.2265625, "learning_rate": 1.6240000000000004e-05, "loss": 0.1636, "reward": 0.494140625, "reward_std": 0.08064346760511398, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 543.578125, "epoch": 0.0816, "grad_norm": 48.95503899227148, "kl": 2.97265625, "learning_rate": 1.632e-05, "loss": -0.0569, "reward": 0.392578125, "reward_std": 0.20807704329490662, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392578125, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 603.546875, "epoch": 0.082, "grad_norm": 9.281803944319174, "kl": 3.3515625, "learning_rate": 1.64e-05, "loss": 0.0482, "reward": 0.51171875, "reward_std": 0.20600676536560059, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.37890625, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 616.359375, "epoch": 0.0824, "grad_norm": 1.3014405563086613, "kl": 3.45703125, "learning_rate": 1.648e-05, "loss": 0.0615, "reward": 0.345703125, "reward_std": 0.19121000543236732, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.345703125, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 535.4140625, "epoch": 0.0828, "grad_norm": 5.35552538139384, "kl": 3.8125, "learning_rate": 1.656e-05, "loss": 0.1144, "reward": 0.375, "reward_std": 0.19517892599105835, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 517.3203125, "epoch": 0.0832, "grad_norm": 8117.990870394696, "kl": 777.28515625, "learning_rate": 1.664e-05, "loss": 39.9554, "reward": 0.55078125, "reward_std": 0.20804854854941368, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.42578125, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 447.09375, "epoch": 0.0836, "grad_norm": 60.16205911307453, "kl": 8.234375, "learning_rate": 1.672e-05, "loss": 0.1895, "reward": 0.4765625, "reward_std": 0.2599855735898018, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 346.109375, "epoch": 0.084, "grad_norm": 23.210504235850316, "kl": 4.8671875, "learning_rate": 1.6800000000000002e-05, "loss": 0.0918, "reward": 0.525390625, "reward_std": 0.23194240033626556, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.517578125, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 312.7109375, "epoch": 0.0844, "grad_norm": 7.887455841225596, "kl": 5.375, "learning_rate": 1.688e-05, "loss": 0.0766, "reward": 0.55859375, "reward_std": 0.24759945273399353, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.54296875, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 265.3828125, "epoch": 0.0848, "grad_norm": 95.20625027298713, "kl": 8.53125, "learning_rate": 1.696e-05, "loss": 0.2066, "reward": 0.46875, "reward_std": 0.25187384337186813, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 286.9296875, "epoch": 0.0852, "grad_norm": 31.143820705388375, "kl": 3.6484375, "learning_rate": 1.704e-05, "loss": -0.0417, "reward": 0.5234375, "reward_std": 0.27524447441101074, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 331.1484375, "epoch": 0.0856, "grad_norm": 10.899199742429852, "kl": 5.265625, "learning_rate": 1.7120000000000002e-05, "loss": 0.0797, "reward": 0.71484375, "reward_std": 0.2692321576178074, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 361.0625, "epoch": 0.086, "grad_norm": 8.94736722712284, "kl": 4.234375, "learning_rate": 1.72e-05, "loss": -0.0808, "reward": 0.50390625, "reward_std": 0.30029940605163574, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 260.390625, "epoch": 0.0864, "grad_norm": 2.3106708957328497, "kl": 4.34375, "learning_rate": 1.728e-05, "loss": -0.1535, "reward": 0.5, "reward_std": 0.2853132337331772, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 367.75, "epoch": 0.0868, "grad_norm": 2.0705378355035933, "kl": 4.3046875, "learning_rate": 1.736e-05, "loss": -0.1025, "reward": 0.52734375, "reward_std": 0.2740003392100334, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 398.1328125, "epoch": 0.0872, "grad_norm": 0.8680709017614424, "kl": 4.859375, "learning_rate": 1.7440000000000002e-05, "loss": -0.1251, "reward": 0.505859375, "reward_std": 0.3358744829893112, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 560.7109375, "epoch": 0.0876, "grad_norm": 21.658487060702488, "kl": 8.609375, "learning_rate": 1.752e-05, "loss": 0.1172, "reward": 0.5703125, "reward_std": 0.2614353522658348, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 667.4296875, "epoch": 0.088, "grad_norm": 1.675688940447388, "kl": 11.953125, "learning_rate": 1.76e-05, "loss": -0.0009, "reward": 0.564453125, "reward_std": 0.22647975385189056, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564453125, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 738.296875, "epoch": 0.0884, "grad_norm": 3821.428332230881, "kl": 320.4921875, "learning_rate": 1.768e-05, "loss": 5.8642, "reward": 0.54296875, "reward_std": 0.22393786162137985, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53515625, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 552.1796875, "epoch": 0.0888, "grad_norm": 1.0090012697013004, "kl": 5.03125, "learning_rate": 1.7760000000000003e-05, "loss": 0.1788, "reward": 0.69921875, "reward_std": 0.16108210757374763, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69140625, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 493.6796875, "epoch": 0.0892, "grad_norm": 4.738605346179903, "kl": 5.4609375, "learning_rate": 1.7840000000000002e-05, "loss": 0.179, "reward": 0.728515625, "reward_std": 0.10984568670392036, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720703125, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 351.7265625, "epoch": 0.0896, "grad_norm": 56248.32681560889, "kl": 1740.15625, "learning_rate": 1.792e-05, "loss": 69.3533, "reward": 0.71875, "reward_std": 0.09435540437698364, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71875, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 288.75, "epoch": 0.09, "grad_norm": 36.94891053284688, "kl": 16.953125, "learning_rate": 1.8e-05, "loss": 0.4005, "reward": 0.794921875, "reward_std": 0.2139635644853115, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708984375, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 226.0625, "epoch": 0.0904, "grad_norm": 2.450465507474508, "kl": 5.953125, "learning_rate": 1.8080000000000003e-05, "loss": 0.2092, "reward": 0.720703125, "reward_std": 0.07212666235864162, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720703125, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 184.0, "epoch": 0.0908, "grad_norm": 0.651458602304512, "kl": 5.28125, "learning_rate": 1.8160000000000002e-05, "loss": 0.2117, "reward": 0.873046875, "reward_std": 0.059012047946453094, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.740234375, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 186.859375, "epoch": 0.0912, "grad_norm": 0.9028381937769181, "kl": 4.8125, "learning_rate": 1.824e-05, "loss": 0.1834, "reward": 0.744140625, "reward_std": 0.13853386230766773, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.712890625, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 229.1796875, "epoch": 0.0916, "grad_norm": 0.2904641474114234, "kl": 5.71875, "learning_rate": 1.832e-05, "loss": 0.2023, "reward": 0.7421875, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 198.25, "epoch": 0.092, "grad_norm": 0.37359024123889234, "kl": 5.3828125, "learning_rate": 1.8400000000000003e-05, "loss": 0.2154, "reward": 0.787109375, "reward_std": 0.06765169650316238, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.748046875, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 209.5, "epoch": 0.0924, "grad_norm": 0.28682701918403203, "kl": 5.28125, "learning_rate": 1.8480000000000003e-05, "loss": 0.2106, "reward": 0.748046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.748046875, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 235.625, "epoch": 0.0928, "grad_norm": 0.3938566705043815, "kl": 5.0546875, "learning_rate": 1.8560000000000002e-05, "loss": 0.2026, "reward": 0.748046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.748046875, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 227.125, "epoch": 0.0932, "grad_norm": 0.11723173345942463, "kl": 4.859375, "learning_rate": 1.864e-05, "loss": 0.1955, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.75, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 342.75, "epoch": 0.0936, "grad_norm": 0.4189433060856508, "kl": 3.9453125, "learning_rate": 1.8720000000000004e-05, "loss": 0.1642, "reward": 0.763671875, "reward_std": 0.050508126616477966, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.748046875, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 427.8359375, "epoch": 0.094, "grad_norm": 75.12663264646102, "kl": 1.9921875, "learning_rate": 1.88e-05, "loss": 0.1863, "reward": 0.65625, "reward_std": 0.274354737251997, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 331.6171875, "epoch": 0.0944, "grad_norm": 0.21750796384932247, "kl": 4.390625, "learning_rate": 1.8880000000000002e-05, "loss": 0.1539, "reward": 0.744140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.744140625, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 334.625, "epoch": 0.0948, "grad_norm": 0.14308904480341308, "kl": 4.7265625, "learning_rate": 1.896e-05, "loss": 0.1885, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.75, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 353.0, "epoch": 0.0952, "grad_norm": 0.43183631359688424, "kl": 4.21875, "learning_rate": 1.904e-05, "loss": 0.1688, "reward": 0.75390625, "reward_std": 0.08321061357855797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.73828125, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 360.0625, "epoch": 0.0956, "grad_norm": 1.993691841416631, "kl": 4.2734375, "learning_rate": 1.912e-05, "loss": 0.1716, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.75, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 435.5, "epoch": 0.096, "grad_norm": 15.616852329210676, "kl": 4.984375, "learning_rate": 1.9200000000000003e-05, "loss": 0.1991, "reward": 0.908203125, "reward_std": 0.1041487492620945, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.744140625, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 451.0, "epoch": 0.0964, "grad_norm": 0.37907381832067183, "kl": 4.40625, "learning_rate": 1.9280000000000002e-05, "loss": 0.1761, "reward": 0.744140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.744140625, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 366.75, "epoch": 0.0968, "grad_norm": 0.21774445193496844, "kl": 4.765625, "learning_rate": 1.936e-05, "loss": 0.1911, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.75, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 296.5546875, "epoch": 0.0972, "grad_norm": 0.6718595961391379, "kl": 4.6015625, "learning_rate": 1.944e-05, "loss": 0.1795, "reward": 0.73828125, "reward_std": 0.04192390665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.73828125, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 343.0, "epoch": 0.0976, "grad_norm": 0.6847851645164061, "kl": 4.90625, "learning_rate": 1.9520000000000003e-05, "loss": 0.1965, "reward": 0.75, "reward_std": 0.04473293572664261, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 356.1015625, "epoch": 0.098, "grad_norm": 0.6747673128947689, "kl": 4.1171875, "learning_rate": 1.9600000000000002e-05, "loss": 0.149, "reward": 0.732421875, "reward_std": 0.04957009106874466, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.732421875, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 551.875, "epoch": 0.0984, "grad_norm": 0.6307027778082693, "kl": 4.75, "learning_rate": 1.968e-05, "loss": 0.1903, "reward": 0.734375, "reward_std": 0.05605955049395561, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.734375, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 590.078125, "epoch": 0.0988, "grad_norm": 0.3961676170815672, "kl": 3.8046875, "learning_rate": 1.976e-05, "loss": 0.1432, "reward": 0.74609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.74609375, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 655.125, "epoch": 0.0992, "grad_norm": 0.5981740174143622, "kl": 3.5390625, "learning_rate": 1.9840000000000003e-05, "loss": 0.1412, "reward": 0.740234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.740234375, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 992.90625, "epoch": 0.0996, "grad_norm": 5.3310210149535635, "kl": 2.8828125, "learning_rate": 1.9920000000000002e-05, "loss": 0.1109, "reward": 0.671875, "reward_std": 0.15900594741106033, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.671875, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 992.0859375, "epoch": 0.1, "grad_norm": 4.3003455511611195, "kl": 2.89453125, "learning_rate": 2e-05, "loss": 0.1215, "reward": 0.75, "reward_std": 0.18868374079465866, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.625, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 877.8203125, "epoch": 0.1004, "grad_norm": 3.0452322162444467, "kl": 4.7734375, "learning_rate": 1.9999990252244153e-05, "loss": 0.1794, "reward": 0.681640625, "reward_std": 0.15274582616984844, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.681640625, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 798.625, "epoch": 0.1008, "grad_norm": 3512.2000664247007, "kl": 118.125, "learning_rate": 1.9999961008995607e-05, "loss": 6.0783, "reward": 0.685546875, "reward_std": 0.46106529980897903, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.1640625, "rewards/tag_count_reward": 0.521484375, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 938.5703125, "epoch": 0.1012, "grad_norm": 4.985251905932465, "kl": 4.8359375, "learning_rate": 1.9999912270311376e-05, "loss": 0.1947, "reward": 0.783203125, "reward_std": 0.2389550618827343, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.03125, "rewards/tag_count_reward": 0.626953125, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 565.5234375, "epoch": 0.1016, "grad_norm": 1.0623398377902065, "kl": 5.1328125, "learning_rate": 1.9999844036286483e-05, "loss": 0.192, "reward": 0.71484375, "reward_std": 0.09671888872981071, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71484375, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 448.140625, "epoch": 0.102, "grad_norm": 0.7880113486480822, "kl": 4.7265625, "learning_rate": 1.9999756307053947e-05, "loss": 0.1748, "reward": 0.7265625, "reward_std": 0.09154300764203072, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 753.0, "epoch": 0.1024, "grad_norm": 0.6357600598551962, "kl": 6.2421875, "learning_rate": 1.9999649082784807e-05, "loss": 0.2495, "reward": 0.689453125, "reward_std": 0.1522677205502987, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.681640625, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 862.0, "epoch": 0.1028, "grad_norm": 0.5496425714119663, "kl": 6.3984375, "learning_rate": 1.99995223636881e-05, "loss": 0.2559, "reward": 0.82421875, "reward_std": 0.13478534296154976, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 925.25, "epoch": 0.1032, "grad_norm": 0.7783298939897098, "kl": 6.8125, "learning_rate": 1.9999376150010868e-05, "loss": 0.2729, "reward": 0.671875, "reward_std": 0.20774900540709496, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6640625, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 536.75, "epoch": 0.1036, "grad_norm": 0.745264846597934, "kl": 5.96875, "learning_rate": 1.9999210442038164e-05, "loss": 0.2387, "reward": 0.720703125, "reward_std": 0.11014364659786224, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720703125, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 723.875, "epoch": 0.104, "grad_norm": 1.679758766735698, "kl": 6.109375, "learning_rate": 1.9999025240093045e-05, "loss": 0.2442, "reward": 0.70703125, "reward_std": 0.17010526731610298, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 938.0, "epoch": 0.1044, "grad_norm": 7.208792916880008, "kl": 5.765625, "learning_rate": 1.999882054453657e-05, "loss": 0.2305, "reward": 0.58203125, "reward_std": 0.2352989800274372, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58203125, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1048, "grad_norm": 6.437244531467509, "kl": 3.25390625, "learning_rate": 1.9998596355767805e-05, "loss": 0.1299, "reward": 0.201171875, "reward_std": 0.1878456026315689, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.201171875, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1052, "grad_norm": 8.614940662162416, "kl": 1.693359375, "learning_rate": 1.9998352674223816e-05, "loss": 0.0676, "reward": 0.126953125, "reward_std": 0.152433879673481, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.126953125, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1056, "grad_norm": 1.3184626920574034, "kl": 0.892578125, "learning_rate": 1.999808950037968e-05, "loss": 0.0357, "reward": 0.080078125, "reward_std": 0.12743165343999863, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.080078125, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.106, "grad_norm": 182.97660173765507, "kl": 18.3330078125, "learning_rate": 1.9997806834748455e-05, "loss": 0.7306, "reward": 0.13671875, "reward_std": 0.17135479301214218, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.13671875, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1064, "grad_norm": 2.480859682328366, "kl": 2.453125, "learning_rate": 1.9997504677881224e-05, "loss": 0.0982, "reward": 0.181640625, "reward_std": 0.21267466992139816, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.181640625, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1068, "grad_norm": 16.287598048724302, "kl": 5.8828125, "learning_rate": 1.999718303036705e-05, "loss": 0.235, "reward": 0.267578125, "reward_std": 0.23735878989100456, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.267578125, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1072, "grad_norm": 2.3258021889486535, "kl": 4.9375, "learning_rate": 1.9996841892833e-05, "loss": 0.1974, "reward": 0.34765625, "reward_std": 0.2726123072206974, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.34765625, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1076, "grad_norm": 13.027876281778072, "kl": 5.984375, "learning_rate": 1.9996481265944146e-05, "loss": 0.2391, "reward": 0.619140625, "reward_std": 0.3107273578643799, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.108, "grad_norm": 12.638455685198364, "kl": 6.3203125, "learning_rate": 1.9996101150403543e-05, "loss": 0.2531, "reward": 0.64453125, "reward_std": 0.28342460840940475, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1084, "grad_norm": 6.251438271851435, "kl": 8.796875, "learning_rate": 1.9995701546952252e-05, "loss": 0.3512, "reward": 0.533203125, "reward_std": 0.3189627230167389, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.525390625, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1088, "grad_norm": 4.610708373848375, "kl": 7.4765625, "learning_rate": 1.9995282456369313e-05, "loss": 0.2988, "reward": 0.650390625, "reward_std": 0.2773875296115875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.525390625, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1092, "grad_norm": 5.1837558910303905, "kl": 6.703125, "learning_rate": 1.999484387947177e-05, "loss": 0.2684, "reward": 0.6015625, "reward_std": 0.31162694096565247, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1096, "grad_norm": 33.16834890262621, "kl": 10.5, "learning_rate": 1.9994385817114644e-05, "loss": 0.4193, "reward": 0.44140625, "reward_std": 0.3054959252476692, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.11, "grad_norm": 12.337259675042455, "kl": 5.671875, "learning_rate": 1.999390827019096e-05, "loss": 0.2272, "reward": 0.46875, "reward_std": 0.32186882197856903, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.34375, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 1017.8125, "epoch": 0.1104, "grad_norm": 6.01151341104219, "kl": 5.4453125, "learning_rate": 1.9993411239631713e-05, "loss": 0.2025, "reward": 0.384765625, "reward_std": 0.3162810280919075, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.376953125, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 1018.4453125, "epoch": 0.1108, "grad_norm": 10.8074151155841, "kl": 5.390625, "learning_rate": 1.9992894726405894e-05, "loss": 0.1775, "reward": 0.197265625, "reward_std": 0.3083246946334839, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.189453125, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1112, "grad_norm": 15.07642969910668, "kl": 3.6953125, "learning_rate": 1.999235873152047e-05, "loss": 0.1478, "reward": 0.1875, "reward_std": 0.26677989959716797, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1116, "grad_norm": 3.4129063811709894, "kl": 3.4296875, "learning_rate": 1.9991803256020393e-05, "loss": 0.1372, "reward": 0.263671875, "reward_std": 0.21447299420833588, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.138671875, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 1019.375, "epoch": 0.112, "grad_norm": 7.202946559968058, "kl": 3.51953125, "learning_rate": 1.9991228300988586e-05, "loss": 0.1503, "reward": 0.177734375, "reward_std": 0.2700237035751343, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.154296875, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 1008.6875, "epoch": 0.1124, "grad_norm": 5.139185973037531, "kl": 2.869140625, "learning_rate": 1.9990633867545956e-05, "loss": 0.1091, "reward": 0.26171875, "reward_std": 0.2248064372688532, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.12890625, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 991.7265625, "epoch": 0.1128, "grad_norm": 5.19065296656593, "kl": 1.47265625, "learning_rate": 1.9990019956851384e-05, "loss": 0.046, "reward": 0.0703125, "reward_std": 0.14940080046653748, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0703125, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 992.6796875, "epoch": 0.1132, "grad_norm": 4.5601182182476405, "kl": 1.701171875, "learning_rate": 1.9989386570101716e-05, "loss": 0.0629, "reward": 0.123046875, "reward_std": 0.17688597366213799, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.123046875, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 989.234375, "epoch": 0.1136, "grad_norm": 7.143813537074301, "kl": 9.39453125, "learning_rate": 1.9988733708531772e-05, "loss": 0.0478, "reward": 0.337890625, "reward_std": 0.19492597877979279, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.080078125, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 912.3828125, "epoch": 0.114, "grad_norm": 16.688138434002187, "kl": 1.2158203125, "learning_rate": 1.9988061373414342e-05, "loss": 0.0964, "reward": 0.23046875, "reward_std": 0.16247815266251564, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.10546875, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 940.2265625, "epoch": 0.1144, "grad_norm": 12.10333899642153, "kl": 1.59765625, "learning_rate": 1.998736956606018e-05, "loss": 0.0535, "reward": 0.216796875, "reward_std": 0.1904570460319519, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.083984375, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 880.8359375, "epoch": 0.1148, "grad_norm": 6.051319657791192, "kl": 1.2060546875, "learning_rate": 1.998665828781799e-05, "loss": 0.0708, "reward": 0.076171875, "reward_std": 0.12684792466461658, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.076171875, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 785.546875, "epoch": 0.1152, "grad_norm": 1.4044951096826195, "kl": 0.833984375, "learning_rate": 1.9985927540074453e-05, "loss": 0.028, "reward": 0.068359375, "reward_std": 0.11367788910865784, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.068359375, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 783.9375, "epoch": 0.1156, "grad_norm": 16.333802485546386, "kl": 1.65625, "learning_rate": 1.99851773242542e-05, "loss": 0.1188, "reward": 0.12890625, "reward_std": 0.20056581497192383, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.12109375, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 831.2890625, "epoch": 0.116, "grad_norm": 6.863112612841877, "kl": 1.958984375, "learning_rate": 1.9984407641819812e-05, "loss": 0.1464, "reward": 0.27734375, "reward_std": 0.20846552774310112, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.15234375, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 599.2265625, "epoch": 0.1164, "grad_norm": 27410.5182952558, "kl": 1538.015625, "learning_rate": 1.9983618494271825e-05, "loss": 34.707, "reward": 0.171875, "reward_std": 0.23788952827453613, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1640625, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 616.546875, "epoch": 0.1168, "grad_norm": 9.255718612091155, "kl": 2.490234375, "learning_rate": 1.998280988314872e-05, "loss": 0.0737, "reward": 0.1953125, "reward_std": 0.2147199586033821, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1953125, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 550.671875, "epoch": 0.1172, "grad_norm": 186791.50773460328, "kl": 6376.0, "learning_rate": 1.9981981810026932e-05, "loss": 453.9174, "reward": 0.1875, "reward_std": 0.2163861058652401, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 351.0234375, "epoch": 0.1176, "grad_norm": 34.033151934335834, "kl": 6.62109375, "learning_rate": 1.9981134276520828e-05, "loss": 0.2296, "reward": 0.169921875, "reward_std": 0.18820440769195557, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.169921875, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 266.2265625, "epoch": 0.118, "grad_norm": 8.068651429866122, "kl": 1.87109375, "learning_rate": 1.9980267284282718e-05, "loss": -0.0019, "reward": 0.197265625, "reward_std": 0.2202559858560562, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.197265625, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 319.9375, "epoch": 0.1184, "grad_norm": 47.95812729379957, "kl": 9.5546875, "learning_rate": 1.9979380835002846e-05, "loss": 0.4079, "reward": 0.228515625, "reward_std": 0.19845305010676384, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.228515625, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 188.3828125, "epoch": 0.1188, "grad_norm": 3.635772955018935, "kl": 2.240234375, "learning_rate": 1.9978474930409396e-05, "loss": 0.0548, "reward": 0.2578125, "reward_std": 0.21636439859867096, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 165.7265625, "epoch": 0.1192, "grad_norm": 1.8321975888448225, "kl": 2.259765625, "learning_rate": 1.997754957226847e-05, "loss": -0.0762, "reward": 0.2421875, "reward_std": 0.211391631513834, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2421875, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 281.1640625, "epoch": 0.1196, "grad_norm": 1.07253261763434, "kl": 1.875, "learning_rate": 1.99766047623841e-05, "loss": 0.0096, "reward": 0.341796875, "reward_std": 0.25995032116770744, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.341796875, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 668.5, "epoch": 0.12, "grad_norm": 0.5129114045145476, "kl": 3.0390625, "learning_rate": 1.9975640502598243e-05, "loss": 0.1172, "reward": 0.775390625, "reward_std": 0.34360650926828384, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0546875, "rewards/tag_count_reward": 0.587890625, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 899.1796875, "epoch": 0.1204, "grad_norm": 8.880492749985667, "kl": 3.30859375, "learning_rate": 1.9974656794790777e-05, "loss": 0.1306, "reward": 1.234375, "reward_std": 0.6402133703231812, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.5390625, "rewards/tag_count_reward": 0.5703125, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 909.8125, "epoch": 0.1208, "grad_norm": 113.1532080948638, "kl": 4.18359375, "learning_rate": 1.9973653640879486e-05, "loss": 0.1505, "reward": 1.220703125, "reward_std": 0.556606151163578, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.541015625, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 768.875, "epoch": 0.1212, "grad_norm": 4087063.778665014, "kl": 60931.0078125, "learning_rate": 1.997263104282007e-05, "loss": 1633.0662, "reward": 1.345703125, "reward_std": 0.5281245857477188, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.541015625, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 686.4296875, "epoch": 0.1216, "grad_norm": 23.74246952101163, "kl": 4.921875, "learning_rate": 1.997158900260614e-05, "loss": 0.1897, "reward": 1.298828125, "reward_std": 0.5681162551045418, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.75, "rewards/tag_count_reward": 0.541015625, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 593.875, "epoch": 0.122, "grad_norm": 1.8315440232199458, "kl": 4.1875, "learning_rate": 1.9970527522269204e-05, "loss": 0.1676, "reward": 1.048828125, "reward_std": 0.6301785111427307, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.484375, "rewards/tag_count_reward": 0.494140625, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 502.875, "epoch": 0.1224, "grad_norm": 151.7449818462519, "kl": 6.9140625, "learning_rate": 1.9969446603878673e-05, "loss": 0.2769, "reward": 0.98828125, "reward_std": 0.581148661673069, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.46875, "rewards/tag_count_reward": 0.45703125, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 635.5, "epoch": 0.1228, "grad_norm": 4556.840963832666, "kl": 226.5, "learning_rate": 1.9968346249541848e-05, "loss": 9.0717, "reward": 0.712890625, "reward_std": 0.4513862356543541, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.21875, "rewards/tag_count_reward": 0.369140625, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 572.0, "epoch": 0.1232, "grad_norm": 6.221919436499226, "kl": 4.6796875, "learning_rate": 1.9967226461403934e-05, "loss": 0.1872, "reward": 0.474609375, "reward_std": 0.41813959926366806, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.1171875, "rewards/tag_count_reward": 0.357421875, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 514.0, "epoch": 0.1236, "grad_norm": 7.8747821893280925, "kl": 3.828125, "learning_rate": 1.996608724164801e-05, "loss": 0.153, "reward": 0.34375, "reward_std": 0.2527535781264305, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0390625, "rewards/tag_count_reward": 0.3046875, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 497.125, "epoch": 0.124, "grad_norm": 3004.3433288819665, "kl": 265.375, "learning_rate": 1.9964928592495046e-05, "loss": 10.625, "reward": 0.361328125, "reward_std": 0.36744409799575806, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.09375, "rewards/tag_count_reward": 0.267578125, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 691.25, "epoch": 0.1244, "grad_norm": 56.06523597959889, "kl": 3.078125, "learning_rate": 1.9963750516203887e-05, "loss": 0.1233, "reward": 0.615234375, "reward_std": 0.43321915715932846, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.1484375, "rewards/tag_count_reward": 0.248046875, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 528.625, "epoch": 0.1248, "grad_norm": 84.6824018781464, "kl": 12.609375, "learning_rate": 1.996255301507125e-05, "loss": 0.5039, "reward": 0.4921875, "reward_std": 0.3327029347419739, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.09375, "rewards/tag_count_reward": 0.2734375, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 631.5, "epoch": 0.1252, "grad_norm": 8.967920238939566, "kl": 2.55859375, "learning_rate": 1.9961336091431728e-05, "loss": 0.1023, "reward": 0.400390625, "reward_std": 0.4765268787741661, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.1953125, "rewards/tag_count_reward": 0.205078125, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 603.625, "epoch": 0.1256, "grad_norm": 11.647460166242775, "kl": 2.48046875, "learning_rate": 1.9960099747657774e-05, "loss": 0.0996, "reward": 0.490234375, "reward_std": 0.5243637934327126, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.2734375, "rewards/tag_count_reward": 0.201171875, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 853.0, "epoch": 0.126, "grad_norm": 38.94912575724788, "kl": 9.08203125, "learning_rate": 1.9958843986159705e-05, "loss": 0.3634, "reward": 0.482421875, "reward_std": 0.48726659268140793, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.265625, "rewards/tag_count_reward": 0.208984375, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 896.125, "epoch": 0.1264, "grad_norm": 9.26420067211662, "kl": 3.9296875, "learning_rate": 1.9957568809385693e-05, "loss": 0.157, "reward": 0.638671875, "reward_std": 0.5206629559397697, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.4375, "rewards/tag_count_reward": 0.201171875, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 843.375, "epoch": 0.1268, "grad_norm": 7.12150419056921, "kl": 3.72265625, "learning_rate": 1.995627421982176e-05, "loss": 0.1492, "reward": 0.712890625, "reward_std": 0.5000608935952187, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.328125, "rewards/tag_count_reward": 0.259765625, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 441.0, "epoch": 0.1272, "grad_norm": 11.822470795745884, "kl": 3.0625, "learning_rate": 1.995496021999177e-05, "loss": 0.1225, "reward": 0.73046875, "reward_std": 0.48208289593458176, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.4453125, "rewards/tag_count_reward": 0.27734375, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 363.875, "epoch": 0.1276, "grad_norm": 14.248828570126147, "kl": 3.41015625, "learning_rate": 1.995362681245744e-05, "loss": 0.1363, "reward": 0.78125, "reward_std": 0.4907975420355797, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.484375, "rewards/tag_count_reward": 0.296875, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 941.0, "epoch": 0.128, "grad_norm": 1337.2090753120585, "kl": 152.6875, "learning_rate": 1.9952273999818312e-05, "loss": 6.0967, "reward": 0.8203125, "reward_std": 0.4705003798007965, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.421875, "rewards/tag_count_reward": 0.2734375, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 852.203125, "epoch": 0.1284, "grad_norm": 213.13253412165756, "kl": 27.125, "learning_rate": 1.9950901784711765e-05, "loss": 1.0862, "reward": 0.86328125, "reward_std": 0.47598550468683243, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.3359375, "rewards/tag_count_reward": 0.27734375, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 391.125, "epoch": 0.1288, "grad_norm": 5.450382922354749, "kl": 4.0703125, "learning_rate": 1.9949510169813006e-05, "loss": 0.1629, "reward": 0.58203125, "reward_std": 0.5177941173315048, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.3125, "rewards/tag_count_reward": 0.26171875, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 461.25, "epoch": 0.1292, "grad_norm": 2.1273559907233333, "kl": 3.6328125, "learning_rate": 1.994809915783505e-05, "loss": 0.1453, "reward": 0.642578125, "reward_std": 0.4714331030845642, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.25, "rewards/tag_count_reward": 0.267578125, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 277.375, "epoch": 0.1296, "grad_norm": 3.514816622579674, "kl": 4.1796875, "learning_rate": 1.9946668751528745e-05, "loss": 0.1671, "reward": 0.662109375, "reward_std": 0.4871250092983246, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.2578125, "rewards/tag_count_reward": 0.279296875, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 429.375, "epoch": 0.13, "grad_norm": 2.830531187332156, "kl": 3.2890625, "learning_rate": 1.9945218953682736e-05, "loss": 0.1314, "reward": 0.671875, "reward_std": 0.4188830330967903, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.328125, "rewards/tag_count_reward": 0.34375, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1304, "grad_norm": 47.55437057609593, "kl": 2.904296875, "learning_rate": 1.994374976712348e-05, "loss": 0.1163, "reward": 0.6171875, "reward_std": 0.44014404714107513, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.203125, "rewards/tag_count_reward": 0.4140625, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1308, "grad_norm": 15.264886651618037, "kl": 1.828125, "learning_rate": 1.9942261194715236e-05, "loss": 0.0731, "reward": 0.755859375, "reward_std": 0.49224841594696045, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.2421875, "rewards/tag_count_reward": 0.513671875, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1312, "grad_norm": 7.254529619659259, "kl": 1.318359375, "learning_rate": 1.9940753239360047e-05, "loss": 0.0526, "reward": 0.69140625, "reward_std": 0.41865529119968414, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.140625, "rewards/tag_count_reward": 0.55078125, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 463.375, "epoch": 0.1316, "grad_norm": 3.855661668479696, "kl": 2.8828125, "learning_rate": 1.9939225903997748e-05, "loss": 0.1155, "reward": 0.81640625, "reward_std": 0.3776960074901581, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.1015625, "rewards/tag_count_reward": 0.58984375, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 327.25, "epoch": 0.132, "grad_norm": 2.9491727040421174, "kl": 3.30078125, "learning_rate": 1.9937679191605964e-05, "loss": 0.1321, "reward": 0.8515625, "reward_std": 0.4139926955103874, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.1328125, "rewards/tag_count_reward": 0.59375, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 168.625, "epoch": 0.1324, "grad_norm": 2.7570256703068767, "kl": 4.3984375, "learning_rate": 1.9936113105200085e-05, "loss": 0.1761, "reward": 0.80078125, "reward_std": 0.5120957344770432, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.1796875, "rewards/tag_count_reward": 0.62109375, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 228.75, "epoch": 0.1328, "grad_norm": 2.858195480252147, "kl": 4.5390625, "learning_rate": 1.9934527647833276e-05, "loss": 0.1813, "reward": 0.810546875, "reward_std": 0.5075410157442093, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.1796875, "rewards/tag_count_reward": 0.630859375, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 334.25, "epoch": 0.1332, "grad_norm": 3.1750216687916275, "kl": 4.40625, "learning_rate": 1.993292282259647e-05, "loss": 0.1766, "reward": 1.03515625, "reward_std": 0.5409962236881256, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.4453125, "rewards/tag_count_reward": 0.58203125, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 501.875, "epoch": 0.1336, "grad_norm": 11.414820601057354, "kl": 5.0703125, "learning_rate": 1.9931298632618355e-05, "loss": 0.2025, "reward": 1.291015625, "reward_std": 0.5173156559467316, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.5859375, "rewards/tag_count_reward": 0.580078125, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 557.375, "epoch": 0.134, "grad_norm": 8.053734488415785, "kl": 5.5234375, "learning_rate": 1.992965508106537e-05, "loss": 0.221, "reward": 1.30859375, "reward_std": 0.4569675475358963, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.75, "rewards/tag_count_reward": 0.54296875, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 1013.625, "epoch": 0.1344, "grad_norm": 64.5290553574555, "kl": 9.609375, "learning_rate": 1.9927992171141707e-05, "loss": 0.3853, "reward": 1.263671875, "reward_std": 0.42807820439338684, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.435546875, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 1017.6484375, "epoch": 0.1348, "grad_norm": 292.6743281604208, "kl": 20.609375, "learning_rate": 1.992630990608929e-05, "loss": 0.8336, "reward": 1.419921875, "reward_std": 0.2860623076558113, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.341796875, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 988.15625, "epoch": 0.1352, "grad_norm": 61.17258954212131, "kl": 3.828125, "learning_rate": 1.9924608289187786e-05, "loss": 0.1232, "reward": 1.294921875, "reward_std": 0.20733904838562012, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.201171875, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 990.8671875, "epoch": 0.1356, "grad_norm": 2.7437220866917844, "kl": 1.072265625, "learning_rate": 1.992288732375458e-05, "loss": 0.0335, "reward": 1.14453125, "reward_std": 0.2509919963777065, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.18359375, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 1001.890625, "epoch": 0.136, "grad_norm": 2.2582775957863586, "kl": 1.6875, "learning_rate": 1.9921147013144782e-05, "loss": 0.0786, "reward": 1.314453125, "reward_std": 0.14481304213404655, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.212890625, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 1014.6953125, "epoch": 0.1364, "grad_norm": 0.9695229753732372, "kl": 1.00390625, "learning_rate": 1.9919387360751216e-05, "loss": 0.0404, "reward": 1.296875, "reward_std": 0.3165993243455887, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.2109375, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 1018.8828125, "epoch": 0.1368, "grad_norm": 0.44770933552453546, "kl": 0.262451171875, "learning_rate": 1.9917608370004417e-05, "loss": 0.0085, "reward": 1.15625, "reward_std": 0.3010886535048485, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.25, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1372, "grad_norm": 0.43722344592997886, "kl": 0.1414794921875, "learning_rate": 1.9915810044372618e-05, "loss": 0.0057, "reward": 1.21484375, "reward_std": 0.4502625837922096, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.40234375, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1376, "grad_norm": 7.934351350492605, "kl": 1.1416015625, "learning_rate": 1.9913992387361747e-05, "loss": 0.0457, "reward": 1.103515625, "reward_std": 0.541462779045105, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.416015625, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 1020.46875, "epoch": 0.138, "grad_norm": 15.695194033625938, "kl": 1.0703125, "learning_rate": 1.991215540251542e-05, "loss": 0.0444, "reward": 1.154296875, "reward_std": 0.5437314510345459, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7109375, "rewards/tag_count_reward": 0.443359375, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1384, "grad_norm": 1.6227486125942776, "kl": 0.219970703125, "learning_rate": 1.991029909341493e-05, "loss": 0.0088, "reward": 1.25, "reward_std": 0.50792346149683, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.796875, "rewards/tag_count_reward": 0.4453125, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 1013.390625, "epoch": 0.1388, "grad_norm": 2.20951942817893, "kl": 0.572265625, "learning_rate": 1.9908423463679246e-05, "loss": 0.0243, "reward": 1.11328125, "reward_std": 0.5634518414735794, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.43359375, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 1015.3984375, "epoch": 0.1392, "grad_norm": 7.7594042785015365, "kl": 2.041015625, "learning_rate": 1.990652851696501e-05, "loss": 0.0858, "reward": 1.20703125, "reward_std": 0.576018363237381, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.71875, "rewards/tag_count_reward": 0.48828125, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 1019.078125, "epoch": 0.1396, "grad_norm": 10.210100480088121, "kl": 4.671875, "learning_rate": 1.9904614256966514e-05, "loss": 0.1781, "reward": 1.302734375, "reward_std": 0.6121547967195511, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6953125, "rewards/tag_count_reward": 0.607421875, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 1018.0703125, "epoch": 0.14, "grad_norm": 100.73174986781578, "kl": 6.5859375, "learning_rate": 1.9902680687415704e-05, "loss": 0.2582, "reward": 1.3515625, "reward_std": 0.5881323218345642, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.6640625, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 918.0, "epoch": 0.1404, "grad_norm": 4415.55478067813, "kl": 118.09375, "learning_rate": 1.9900727812082177e-05, "loss": 4.7144, "reward": 1.267578125, "reward_std": 0.5661409869790077, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.5703125, "rewards/tag_count_reward": 0.697265625, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 1016.65625, "epoch": 0.1408, "grad_norm": 11.810901167266763, "kl": 6.4609375, "learning_rate": 1.989875563477316e-05, "loss": 0.2462, "reward": 1.236328125, "reward_std": 0.6014630347490311, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.5546875, "rewards/tag_count_reward": 0.673828125, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 1022.8046875, "epoch": 0.1412, "grad_norm": 3.090390021376458, "kl": 5.140625, "learning_rate": 1.989676415933351e-05, "loss": 0.2048, "reward": 1.20703125, "reward_std": 0.6363562196493149, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.421875, "rewards/tag_count_reward": 0.66015625, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 1008.6171875, "epoch": 0.1416, "grad_norm": 3.018565423341549, "kl": 1.708984375, "learning_rate": 1.9894753389645723e-05, "loss": 0.0658, "reward": 1.064453125, "reward_std": 0.6712534129619598, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.40625, "rewards/tag_count_reward": 0.658203125, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 1008.7265625, "epoch": 0.142, "grad_norm": 1.3838282712869974, "kl": 0.9931640625, "learning_rate": 1.9892723329629885e-05, "loss": 0.0317, "reward": 0.791015625, "reward_std": 0.5027537494897842, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.1328125, "rewards/tag_count_reward": 0.533203125, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 1010.1953125, "epoch": 0.1424, "grad_norm": 7.5795530160885916, "kl": 1.37109375, "learning_rate": 1.9890673983243708e-05, "loss": 0.0562, "reward": 0.923828125, "reward_std": 0.6156398206949234, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.3671875, "rewards/tag_count_reward": 0.556640625, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1428, "grad_norm": 0.7817930272788044, "kl": 4.546875, "learning_rate": 1.9888605354482494e-05, "loss": 0.1817, "reward": 1.03125, "reward_std": 0.6676317155361176, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.46875, "rewards/tag_count_reward": 0.5625, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1432, "grad_norm": 1.2123918492024461, "kl": 5.2734375, "learning_rate": 1.988651744737914e-05, "loss": 0.211, "reward": 1.224609375, "reward_std": 0.7064291834831238, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.59375, "rewards/tag_count_reward": 0.630859375, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 1016.8828125, "epoch": 0.1436, "grad_norm": 56.24784713181669, "kl": 7.734375, "learning_rate": 1.9884410266004134e-05, "loss": 0.2997, "reward": 1.236328125, "reward_std": 0.8005036562681198, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.59375, "rewards/tag_count_reward": 0.642578125, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 1021.1171875, "epoch": 0.144, "grad_norm": 205.5176982486473, "kl": 30.921875, "learning_rate": 1.988228381446553e-05, "loss": 1.2367, "reward": 0.98046875, "reward_std": 0.7695202082395554, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.4453125, "rewards/tag_count_reward": 0.53515625, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1444, "grad_norm": 9.991666827358896, "kl": 5.2265625, "learning_rate": 1.9880138096908955e-05, "loss": 0.2094, "reward": 1.197265625, "reward_std": 0.6708222329616547, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.5, "rewards/tag_count_reward": 0.564453125, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1448, "grad_norm": 88.6763046297086, "kl": 3.0859375, "learning_rate": 1.987797311751759e-05, "loss": 0.1234, "reward": 0.791015625, "reward_std": 0.6914880722761154, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.359375, "rewards/tag_count_reward": 0.431640625, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1452, "grad_norm": 4.9105954366216125, "kl": 5.6640625, "learning_rate": 1.9875788880512183e-05, "loss": 0.2272, "reward": 0.73828125, "reward_std": 0.6170024424791336, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.2265625, "rewards/tag_count_reward": 0.51171875, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1456, "grad_norm": 13.173739070223535, "kl": 8.6796875, "learning_rate": 1.9873585390151003e-05, "loss": 0.3474, "reward": 0.9609375, "reward_std": 0.8214973360300064, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.3828125, "rewards/tag_count_reward": 0.5625, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.146, "grad_norm": 8.408954618174247, "kl": 6.4609375, "learning_rate": 1.987136265072988e-05, "loss": 0.2587, "reward": 1.3359375, "reward_std": 0.8509128391742706, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.546875, "rewards/tag_count_reward": 0.65625, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1464, "grad_norm": 17.565352352375445, "kl": 8.609375, "learning_rate": 1.9869120666582153e-05, "loss": 0.3441, "reward": 1.12890625, "reward_std": 0.7928648889064789, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.390625, "rewards/tag_count_reward": 0.61328125, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1468, "grad_norm": 49.15567614561271, "kl": 3.39453125, "learning_rate": 1.986685944207868e-05, "loss": 0.1356, "reward": 0.587890625, "reward_std": 0.4094407558441162, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.046875, "rewards/tag_count_reward": 0.541015625, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1472, "grad_norm": 3.271708567123277, "kl": 5.875, "learning_rate": 1.9864578981627844e-05, "loss": 0.2352, "reward": 1.1015625, "reward_std": 0.7944374084472656, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.4296875, "rewards/tag_count_reward": 0.671875, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1476, "grad_norm": 51.604937290156656, "kl": 10.328125, "learning_rate": 1.986227928967551e-05, "loss": 0.4134, "reward": 1.26171875, "reward_std": 0.7793060690164566, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.5546875, "rewards/tag_count_reward": 0.70703125, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.148, "grad_norm": 19.63057405168318, "kl": 6.57421875, "learning_rate": 1.985996037070505e-05, "loss": 0.264, "reward": 1.318359375, "reward_std": 0.7837247997522354, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.3671875, "rewards/tag_count_reward": 0.576171875, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1484, "grad_norm": 5.219220554428282, "kl": 7.71875, "learning_rate": 1.9857622229237315e-05, "loss": 0.3089, "reward": 0.884765625, "reward_std": 0.6462861970067024, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.2265625, "rewards/tag_count_reward": 0.658203125, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 787.375, "epoch": 0.1488, "grad_norm": 15.168853767169974, "kl": 6.6171875, "learning_rate": 1.985526486983063e-05, "loss": 0.2647, "reward": 0.5546875, "reward_std": 0.3172723948955536, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.546875, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 593.875, "epoch": 0.1492, "grad_norm": 12.439224225425056, "kl": 6.34375, "learning_rate": 1.985288829708079e-05, "loss": 0.2544, "reward": 0.62109375, "reward_std": 0.2205156274139881, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.62109375, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 734.0, "epoch": 0.1496, "grad_norm": 13.433136852516746, "kl": 7.046875, "learning_rate": 1.9850492515621038e-05, "loss": 0.2721, "reward": 0.55859375, "reward_std": 0.2932799607515335, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55859375, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.15, "grad_norm": 50.373511053791624, "kl": 14.359375, "learning_rate": 1.9848077530122083e-05, "loss": 0.574, "reward": 0.533203125, "reward_std": 0.41490043699741364, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.03125, "rewards/tag_count_reward": 0.501953125, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 916.625, "epoch": 0.1504, "grad_norm": 6.8515074858305045, "kl": 8.3203125, "learning_rate": 1.9845643345292055e-05, "loss": 0.3325, "reward": 0.775390625, "reward_std": 0.28976939246058464, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.634765625, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1508, "grad_norm": 3.33273979325767, "kl": 6.875, "learning_rate": 1.9843189965876525e-05, "loss": 0.2751, "reward": 0.625, "reward_std": 0.2423449456691742, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.625, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1512, "grad_norm": 7.885464752827306, "kl": 5.8125, "learning_rate": 1.9840717396658483e-05, "loss": 0.2324, "reward": 0.6171875, "reward_std": 0.29692019894719124, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 910.125, "epoch": 0.1516, "grad_norm": 6.717396839282083, "kl": 6.484375, "learning_rate": 1.983822564245833e-05, "loss": 0.2594, "reward": 0.74609375, "reward_std": 0.22580252215266228, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.62109375, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.152, "grad_norm": 6095.997750869237, "kl": 218.0625, "learning_rate": 1.983571470813386e-05, "loss": 8.6991, "reward": 0.5546875, "reward_std": 0.30207639932632446, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1524, "grad_norm": 521.0301921108924, "kl": 59.625, "learning_rate": 1.983318459858028e-05, "loss": 2.3876, "reward": 0.53515625, "reward_std": 0.28383803367614746, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53515625, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1528, "grad_norm": 146.32304306881667, "kl": 24.1875, "learning_rate": 1.9830635318730155e-05, "loss": 0.9694, "reward": 0.61328125, "reward_std": 0.23147617653012276, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.61328125, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1532, "grad_norm": 9.811194547895399, "kl": 6.8203125, "learning_rate": 1.982806687355345e-05, "loss": 0.2733, "reward": 0.859375, "reward_std": 0.27925052493810654, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 912.125, "epoch": 0.1536, "grad_norm": 2.091769312916941, "kl": 6.0546875, "learning_rate": 1.982547926805747e-05, "loss": 0.2419, "reward": 0.787109375, "reward_std": 0.2398892678320408, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.654296875, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.154, "grad_norm": 2.7767859663021963, "kl": 6.578125, "learning_rate": 1.982287250728689e-05, "loss": 0.2629, "reward": 0.89453125, "reward_std": 0.4447258412837982, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.09375, "rewards/tag_count_reward": 0.67578125, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 914.25, "epoch": 0.1544, "grad_norm": 4.548478952915804, "kl": 7.40625, "learning_rate": 1.982024659632372e-05, "loss": 0.2969, "reward": 0.9375, "reward_std": 0.5716545730829239, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.21875, "rewards/tag_count_reward": 0.71875, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1548, "grad_norm": 7.646827645510463, "kl": 8.1171875, "learning_rate": 1.981760154028731e-05, "loss": 0.3251, "reward": 1.470703125, "reward_std": 0.7326623499393463, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6484375, "rewards/tag_count_reward": 0.822265625, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1552, "grad_norm": 120.05574438679822, "kl": 27.78125, "learning_rate": 1.981493734433433e-05, "loss": 1.1133, "reward": 1.595703125, "reward_std": 0.7198008745908737, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.814453125, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1556, "grad_norm": 86.70054297554563, "kl": 21.421875, "learning_rate": 1.981225401365877e-05, "loss": 0.8573, "reward": 1.77734375, "reward_std": 0.6329519897699356, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.82421875, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 741.625, "epoch": 0.156, "grad_norm": 6.841854971615355, "kl": 6.3984375, "learning_rate": 1.9809551553491918e-05, "loss": 0.2555, "reward": 1.642578125, "reward_std": 0.5430495068430901, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.806640625, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1564, "grad_norm": 4.633675948529768, "kl": 1.419921875, "learning_rate": 1.9806829969102356e-05, "loss": 0.0568, "reward": 1.404296875, "reward_std": 0.6749884635210037, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.662109375, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1568, "grad_norm": 4.136214278716097, "kl": 1.9140625, "learning_rate": 1.980408926579596e-05, "loss": 0.0766, "reward": 1.130859375, "reward_std": 0.7460125386714935, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.484375, "rewards/tag_count_reward": 0.646484375, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 1016.421875, "epoch": 0.1572, "grad_norm": 7.067692830765435, "kl": 3.359375, "learning_rate": 1.9801329448915863e-05, "loss": 0.1289, "reward": 1.126953125, "reward_std": 0.6665012985467911, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.3359375, "rewards/tag_count_reward": 0.666015625, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1576, "grad_norm": 76.8830155806116, "kl": 19.3359375, "learning_rate": 1.979855052384247e-05, "loss": 0.7726, "reward": 1.0078125, "reward_std": 0.5661618858575821, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.25, "rewards/tag_count_reward": 0.6328125, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.158, "grad_norm": 24.113913018843263, "kl": 7.671875, "learning_rate": 1.979575249599344e-05, "loss": 0.3073, "reward": 1.037109375, "reward_std": 0.6775131970643997, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.3125, "rewards/tag_count_reward": 0.599609375, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1584, "grad_norm": 8.128679293618658, "kl": 2.0625, "learning_rate": 1.9792935370823676e-05, "loss": 0.0825, "reward": 0.943359375, "reward_std": 0.6693326383829117, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.3125, "rewards/tag_count_reward": 0.630859375, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1588, "grad_norm": 8.411554471213725, "kl": 2.421875, "learning_rate": 1.97900991538253e-05, "loss": 0.0967, "reward": 1.08984375, "reward_std": 0.5417327135801315, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.1953125, "rewards/tag_count_reward": 0.64453125, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 844.0, "epoch": 0.1592, "grad_norm": 8.04185457552214, "kl": 4.6875, "learning_rate": 1.9787243850527663e-05, "loss": 0.1874, "reward": 0.958984375, "reward_std": 0.46216458082199097, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.15625, "rewards/tag_count_reward": 0.677734375, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 653.875, "epoch": 0.1596, "grad_norm": 2.2982748814699465, "kl": 7.5859375, "learning_rate": 1.9784369466497333e-05, "loss": 0.3035, "reward": 0.802734375, "reward_std": 0.4004361033439636, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.09375, "rewards/tag_count_reward": 0.708984375, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 275.25, "epoch": 0.16, "grad_norm": 382.99747813304396, "kl": 52.765625, "learning_rate": 1.9781476007338058e-05, "loss": 2.1122, "reward": 0.78125, "reward_std": 0.3703605495393276, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.078125, "rewards/tag_count_reward": 0.703125, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 181.25, "epoch": 0.1604, "grad_norm": 6.116044036752886, "kl": 7.9765625, "learning_rate": 1.977856347869079e-05, "loss": 0.3197, "reward": 0.99609375, "reward_std": 0.6053012609481812, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.265625, "rewards/tag_count_reward": 0.73046875, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 227.75, "epoch": 0.1608, "grad_norm": 1.7223280548274758, "kl": 6.578125, "learning_rate": 1.9775631886233655e-05, "loss": 0.2626, "reward": 1.26953125, "reward_std": 0.6843028217554092, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.484375, "rewards/tag_count_reward": 0.78515625, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 162.375, "epoch": 0.1612, "grad_norm": 4.374493296917344, "kl": 6.1328125, "learning_rate": 1.9772681235681936e-05, "loss": 0.2455, "reward": 1.556640625, "reward_std": 0.7084085941314697, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.609375, "rewards/tag_count_reward": 0.822265625, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 132.875, "epoch": 0.1616, "grad_norm": 6.7659518286389595, "kl": 6.2578125, "learning_rate": 1.9769711532788083e-05, "loss": 0.2504, "reward": 1.759765625, "reward_std": 0.5846036598086357, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.908203125, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 103.25, "epoch": 0.162, "grad_norm": 12.605174552665607, "kl": 6.5078125, "learning_rate": 1.9766722783341682e-05, "loss": 0.2604, "reward": 1.748046875, "reward_std": 0.46954021602869034, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.888671875, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 118.75, "epoch": 0.1624, "grad_norm": 3.2897317090296405, "kl": 6.4140625, "learning_rate": 1.976371499316945e-05, "loss": 0.2565, "reward": 1.80859375, "reward_std": 0.45997701585292816, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.93359375, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 178.125, "epoch": 0.1628, "grad_norm": 38.15287836816992, "kl": 5.296875, "learning_rate": 1.9760688168135233e-05, "loss": 0.2114, "reward": 1.701171875, "reward_std": 0.5483102798461914, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.849609375, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 374.375, "epoch": 0.1632, "grad_norm": 37.431718855690654, "kl": 6.03125, "learning_rate": 1.9757642314139977e-05, "loss": 0.2417, "reward": 1.447265625, "reward_std": 0.30413495376706123, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.517578125, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 362.25, "epoch": 0.1636, "grad_norm": 30.546484714819762, "kl": 5.390625, "learning_rate": 1.9754577437121733e-05, "loss": 0.2153, "reward": 1.494140625, "reward_std": 0.27597418427467346, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.541015625, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 334.375, "epoch": 0.164, "grad_norm": 6.124961605761163, "kl": 3.41015625, "learning_rate": 1.9751493543055634e-05, "loss": 0.1361, "reward": 1.490234375, "reward_std": 0.1395602971315384, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.505859375, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 317.125, "epoch": 0.1644, "grad_norm": 3.924248296304392, "kl": 3.44921875, "learning_rate": 1.974839063795389e-05, "loss": 0.138, "reward": 1.65234375, "reward_std": 0.24529700726270676, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.55859375, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 794.75, "epoch": 0.1648, "grad_norm": 17.001661986945322, "kl": 4.5703125, "learning_rate": 1.9745268727865774e-05, "loss": 0.1829, "reward": 1.66015625, "reward_std": 0.2635114789009094, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.69921875, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 947.875, "epoch": 0.1652, "grad_norm": 4.812167879996814, "kl": 5.796875, "learning_rate": 1.9742127818877605e-05, "loss": 0.2323, "reward": 1.642578125, "reward_std": 0.3253956511616707, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.712890625, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 350.875, "epoch": 0.1656, "grad_norm": 3.8124854116714437, "kl": 3.93359375, "learning_rate": 1.9738967917112752e-05, "loss": 0.1573, "reward": 1.81640625, "reward_std": 0.20742058008909225, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.73046875, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 551.25, "epoch": 0.166, "grad_norm": 13.012973539688197, "kl": 8.421875, "learning_rate": 1.9735789028731603e-05, "loss": 0.3369, "reward": 1.654296875, "reward_std": 0.3162069320678711, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.716796875, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 436.0, "epoch": 0.1664, "grad_norm": 1.1559547037538493, "kl": 4.58203125, "learning_rate": 1.9732591159931564e-05, "loss": 0.1831, "reward": 1.771484375, "reward_std": 0.3146950453519821, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.716796875, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 613.25, "epoch": 0.1668, "grad_norm": 2.595835207278674, "kl": 4.21875, "learning_rate": 1.972937431694704e-05, "loss": 0.1688, "reward": 1.93359375, "reward_std": 0.18974344432353973, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.72265625, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 503.0, "epoch": 0.1672, "grad_norm": 2.542502092789849, "kl": 4.48828125, "learning_rate": 1.9726138506049438e-05, "loss": 0.1798, "reward": 1.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.7265625, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 867.625, "epoch": 0.1676, "grad_norm": 116.458897797433, "kl": 23.0, "learning_rate": 1.9722883733547128e-05, "loss": 0.9203, "reward": 1.630859375, "reward_std": 0.36592715978622437, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.701171875, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 376.125, "epoch": 0.168, "grad_norm": 2.0761532169637653, "kl": 4.296875, "learning_rate": 1.9719610005785466e-05, "loss": 0.172, "reward": 1.861328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.744140625, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 778.25, "epoch": 0.1684, "grad_norm": 117.5733555861098, "kl": 3.5732421875, "learning_rate": 1.971631732914674e-05, "loss": 0.1431, "reward": 1.7265625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.7421875, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1688, "grad_norm": 0.9853245211123782, "kl": 0.1378173828125, "learning_rate": 1.9713005710050203e-05, "loss": 0.0055, "reward": 1.70703125, "reward_std": 0.14492058008909225, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.73828125, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1692, "grad_norm": 0.029427981731079113, "kl": 0.03741455078125, "learning_rate": 1.9709675154952017e-05, "loss": 0.0015, "reward": 1.873046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.748046875, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1696, "grad_norm": 2.2201786901373897, "kl": 0.0462646484375, "learning_rate": 1.9706325670345276e-05, "loss": 0.0019, "reward": 1.830078125, "reward_std": 0.14773958921432495, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.736328125, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.17, "grad_norm": 0.251224030185249, "kl": 0.07177734375, "learning_rate": 1.9702957262759964e-05, "loss": 0.0029, "reward": 1.7265625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.7421875, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1704, "grad_norm": 163.4345062432784, "kl": 1.168701171875, "learning_rate": 1.9699569938762975e-05, "loss": 0.0468, "reward": 1.640625, "reward_std": 0.1493157334625721, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.6484375, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1708, "grad_norm": 3.9885577027486687, "kl": 0.091552734375, "learning_rate": 1.969616370495806e-05, "loss": 0.0037, "reward": 1.5390625, "reward_std": 0.277720432728529, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.59375, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1712, "grad_norm": 4.310601154649528, "kl": 0.06414794921875, "learning_rate": 1.9692738567985853e-05, "loss": 0.0026, "reward": 1.65234375, "reward_std": 0.2752937823534012, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.73046875, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1716, "grad_norm": 5.651964832489198, "kl": 0.1162109375, "learning_rate": 1.968929453452383e-05, "loss": 0.0047, "reward": 1.17578125, "reward_std": 0.5045628547668457, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.4609375, "rewards/tag_count_reward": 0.71484375, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.172, "grad_norm": 629.0231766196532, "kl": 6.978515625, "learning_rate": 1.9685831611286312e-05, "loss": 0.279, "reward": 0.728515625, "reward_std": 0.05429844930768013, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.728515625, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1724, "grad_norm": 18.57154156670685, "kl": 0.41162109375, "learning_rate": 1.9682349805024447e-05, "loss": 0.0165, "reward": 0.708984375, "reward_std": 0.10421638004481792, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708984375, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1728, "grad_norm": 15.628645868018902, "kl": 0.199951171875, "learning_rate": 1.967884912252619e-05, "loss": 0.008, "reward": 0.833984375, "reward_std": 0.09499436244368553, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708984375, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1732, "grad_norm": 2.65327768509894, "kl": 0.180419921875, "learning_rate": 1.96753295706163e-05, "loss": 0.0072, "reward": 0.712890625, "reward_std": 0.16032219864428043, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.697265625, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1736, "grad_norm": 4.764226350377106, "kl": 0.201904296875, "learning_rate": 1.967179115615633e-05, "loss": 0.0081, "reward": 0.666015625, "reward_std": 0.16822053492069244, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.666015625, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.174, "grad_norm": 40.52926878841038, "kl": 0.32373046875, "learning_rate": 1.9668233886044597e-05, "loss": 0.0129, "reward": 0.6796875, "reward_std": 0.22738825529813766, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1744, "grad_norm": 17.561341782541728, "kl": 0.167236328125, "learning_rate": 1.9664657767216176e-05, "loss": 0.0067, "reward": 0.431640625, "reward_std": 0.16249513998627663, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.431640625, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1748, "grad_norm": 56.8882023857232, "kl": 0.419921875, "learning_rate": 1.9661062806642903e-05, "loss": 0.0168, "reward": 0.443359375, "reward_std": 0.12458459287881851, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443359375, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1752, "grad_norm": 0.44938035253983705, "kl": 0.111328125, "learning_rate": 1.9657449011333328e-05, "loss": 0.0045, "reward": 0.509765625, "reward_std": 0.13534051552414894, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.384765625, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1756, "grad_norm": 0.2857831960016081, "kl": 0.0833740234375, "learning_rate": 1.965381638833274e-05, "loss": 0.0033, "reward": 0.345703125, "reward_std": 0.11462906189262867, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.345703125, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.176, "grad_norm": 1.4842777092952, "kl": 0.55419921875, "learning_rate": 1.9650164944723116e-05, "loss": 0.0222, "reward": 0.466796875, "reward_std": 0.10344786196947098, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1764, "grad_norm": 9119.164819138601, "kl": 229.677734375, "learning_rate": 1.9646494687623135e-05, "loss": 9.2017, "reward": 0.44921875, "reward_std": 0.21274938434362411, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44921875, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1768, "grad_norm": 347617.1740972504, "kl": 13697.494140625, "learning_rate": 1.964280562418815e-05, "loss": 548.662, "reward": 0.521484375, "reward_std": 0.21306364238262177, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.396484375, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1772, "grad_norm": 56.08031025158642, "kl": 8.49609375, "learning_rate": 1.9639097761610174e-05, "loss": 0.3393, "reward": 0.46484375, "reward_std": 0.21826820448040962, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1776, "grad_norm": 10.879834115747359, "kl": 1.640625, "learning_rate": 1.963537110711789e-05, "loss": 0.0656, "reward": 0.453125, "reward_std": 0.18697423487901688, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.453125, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.178, "grad_norm": 2.3581441141654746, "kl": 1.560546875, "learning_rate": 1.9631625667976584e-05, "loss": 0.0624, "reward": 0.34765625, "reward_std": 0.21538137644529343, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.34765625, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1784, "grad_norm": 2.3189738531991924, "kl": 1.4140625, "learning_rate": 1.962786145148819e-05, "loss": 0.0565, "reward": 0.38671875, "reward_std": 0.18194175511598587, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.26171875, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1788, "grad_norm": 2.026805350676354, "kl": 1.490234375, "learning_rate": 1.962407846499124e-05, "loss": 0.0597, "reward": 0.263671875, "reward_std": 0.18442164734005928, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.263671875, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 1020.875, "epoch": 0.1792, "grad_norm": 1.9523217573690235, "kl": 2.23046875, "learning_rate": 1.962027671586086e-05, "loss": 0.0895, "reward": 0.373046875, "reward_std": 0.19608432427048683, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.248046875, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 1023.109375, "epoch": 0.1796, "grad_norm": 2.959010586052827, "kl": 4.13671875, "learning_rate": 1.9616456211508756e-05, "loss": 0.1652, "reward": 0.263671875, "reward_std": 0.18037299811840057, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.263671875, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.18, "grad_norm": 1.7503979401230274, "kl": 4.1796875, "learning_rate": 1.961261695938319e-05, "loss": 0.167, "reward": 0.21875, "reward_std": 0.16324211657047272, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1804, "grad_norm": 3.6581196794271102, "kl": 4.7578125, "learning_rate": 1.9608758966968987e-05, "loss": 0.1903, "reward": 0.1953125, "reward_std": 0.16080434992909431, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1953125, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1808, "grad_norm": 2.514768736050843, "kl": 4.7890625, "learning_rate": 1.96048822417875e-05, "loss": 0.1915, "reward": 0.310546875, "reward_std": 0.1578901819884777, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.185546875, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 1000.71875, "epoch": 0.1812, "grad_norm": 0.9451778413756319, "kl": 4.671875, "learning_rate": 1.96009867913966e-05, "loss": 0.1942, "reward": 0.18359375, "reward_std": 0.1613209880888462, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.18359375, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1816, "grad_norm": 2.5502624079024367, "kl": 3.65234375, "learning_rate": 1.9597072623390668e-05, "loss": 0.1459, "reward": 0.298828125, "reward_std": 0.16191710531711578, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.173828125, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.182, "grad_norm": 1.8729427895022228, "kl": 2.5234375, "learning_rate": 1.9593139745400575e-05, "loss": 0.1009, "reward": 0.1796875, "reward_std": 0.16709048673510551, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1796875, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 1017.296875, "epoch": 0.1824, "grad_norm": 10.23418888594623, "kl": 11.09765625, "learning_rate": 1.958918816509367e-05, "loss": 0.2073, "reward": 0.1484375, "reward_std": 0.15942301601171494, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1484375, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1828, "grad_norm": 2.2645904439754174, "kl": 3.66015625, "learning_rate": 1.958521789017376e-05, "loss": 0.1462, "reward": 0.18359375, "reward_std": 0.1659221537411213, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.18359375, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1832, "grad_norm": 4.089901458758794, "kl": 2.6796875, "learning_rate": 1.95812289283811e-05, "loss": 0.1069, "reward": 0.23828125, "reward_std": 0.1307060346007347, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.11328125, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 1022.3515625, "epoch": 0.1836, "grad_norm": 8.81158659853987, "kl": 2.259765625, "learning_rate": 1.9577221287492368e-05, "loss": 0.0917, "reward": 0.2734375, "reward_std": 0.16068754345178604, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1484375, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 1011.4609375, "epoch": 0.184, "grad_norm": 2.156039527468612, "kl": 2.8984375, "learning_rate": 1.9573194975320672e-05, "loss": 0.1202, "reward": 0.19140625, "reward_std": 0.15841015055775642, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.19140625, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 1021.4609375, "epoch": 0.1844, "grad_norm": 1.940815824552875, "kl": 1.943359375, "learning_rate": 1.9569149999715514e-05, "loss": 0.0793, "reward": 0.171875, "reward_std": 0.16122083365917206, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.171875, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1848, "grad_norm": 6.878112813505039, "kl": 1.7421875, "learning_rate": 1.956508636856278e-05, "loss": 0.0697, "reward": 0.21875, "reward_std": 0.13613735139369965, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1852, "grad_norm": 5.463626968938406, "kl": 1.84765625, "learning_rate": 1.9561004089784726e-05, "loss": 0.0737, "reward": 0.234375, "reward_std": 0.0695820702239871, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.234375, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1856, "grad_norm": 30.12095447008348, "kl": 6.5390625, "learning_rate": 1.9556903171339963e-05, "loss": 0.2619, "reward": 0.35546875, "reward_std": 0.08914502151310444, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23046875, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.186, "grad_norm": 1.5168043368507738, "kl": 1.138671875, "learning_rate": 1.9552783621223437e-05, "loss": 0.0456, "reward": 0.232421875, "reward_std": 0.07554430142045021, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.232421875, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 1018.65625, "epoch": 0.1864, "grad_norm": 0.8610131530731275, "kl": 0.9208984375, "learning_rate": 1.9548645447466433e-05, "loss": 0.0379, "reward": 0.376953125, "reward_std": 0.15007242187857628, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.251953125, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 1020.5625, "epoch": 0.1868, "grad_norm": 6.001039663872343, "kl": 0.9697265625, "learning_rate": 1.9544488658136522e-05, "loss": 0.0315, "reward": 0.259765625, "reward_std": 0.15600170567631721, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.259765625, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 1017.75, "epoch": 0.1872, "grad_norm": 1.512468753128959, "kl": 1.17578125, "learning_rate": 1.954031326133758e-05, "loss": 0.0585, "reward": 0.529296875, "reward_std": 0.1731642186641693, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.279296875, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 1020.7109375, "epoch": 0.1876, "grad_norm": 3.093572093841267, "kl": 0.951171875, "learning_rate": 1.9536119265209763e-05, "loss": 0.0358, "reward": 0.439453125, "reward_std": 0.17772095277905464, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.314453125, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 1013.171875, "epoch": 0.188, "grad_norm": 9.306009854009146, "kl": 2.4521484375, "learning_rate": 1.9531906677929472e-05, "loss": 0.0967, "reward": 0.337890625, "reward_std": 0.18443647399544716, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.337890625, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1884, "grad_norm": 3.9886999987437655, "kl": 1.142578125, "learning_rate": 1.9527675507709368e-05, "loss": 0.0457, "reward": 0.404296875, "reward_std": 0.21810519322752953, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.388671875, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 1016.9609375, "epoch": 0.1888, "grad_norm": 26.938605875454652, "kl": 2.7265625, "learning_rate": 1.9523425762798328e-05, "loss": 0.1027, "reward": 0.41015625, "reward_std": 0.2021278217434883, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40234375, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 1017.265625, "epoch": 0.1892, "grad_norm": 2.427405754894019, "kl": 0.908203125, "learning_rate": 1.9519157451481453e-05, "loss": 0.0421, "reward": 0.39453125, "reward_std": 0.1831773929297924, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.38671875, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 1018.40625, "epoch": 0.1896, "grad_norm": 3.5008156345855266, "kl": 0.94921875, "learning_rate": 1.951487058208003e-05, "loss": 0.0335, "reward": 0.5, "reward_std": 0.16991763189435005, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.19, "grad_norm": 1.527094878909674, "kl": 0.8369140625, "learning_rate": 1.9510565162951538e-05, "loss": 0.0334, "reward": 0.537109375, "reward_std": 0.17509515956044197, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.412109375, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 1020.8828125, "epoch": 0.1904, "grad_norm": 4.680945061825825, "kl": 0.89453125, "learning_rate": 1.95062412024896e-05, "loss": 0.034, "reward": 0.509765625, "reward_std": 0.19341740012168884, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.384765625, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 1020.2421875, "epoch": 0.1908, "grad_norm": 9.98278668336017, "kl": 0.701171875, "learning_rate": 1.950189870912401e-05, "loss": 0.0263, "reward": 0.37890625, "reward_std": 0.19401084259152412, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.37890625, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1912, "grad_norm": 11.146260627340325, "kl": 0.7861328125, "learning_rate": 1.949753769132067e-05, "loss": 0.0314, "reward": 0.498046875, "reward_std": 0.18839344009757042, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.373046875, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 1017.21875, "epoch": 0.1916, "grad_norm": 6.511326370494894, "kl": 0.736328125, "learning_rate": 1.9493158157581617e-05, "loss": 0.0338, "reward": 0.529296875, "reward_std": 0.1779761016368866, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.404296875, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.192, "grad_norm": 4.943254309140733, "kl": 1.08984375, "learning_rate": 1.9488760116444966e-05, "loss": 0.0436, "reward": 0.42578125, "reward_std": 0.194229394197464, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.42578125, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 1010.25, "epoch": 0.1924, "grad_norm": 2.7974171297504413, "kl": 0.6767578125, "learning_rate": 1.9484343576484935e-05, "loss": 0.0195, "reward": 0.583984375, "reward_std": 0.22524453699588776, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.458984375, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 1021.4296875, "epoch": 0.1928, "grad_norm": 4.1937436564399775, "kl": 0.90625, "learning_rate": 1.9479908546311783e-05, "loss": 0.0327, "reward": 0.46875, "reward_std": 0.27631857991218567, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 994.4921875, "epoch": 0.1932, "grad_norm": 1.494937485023258, "kl": 0.7412109375, "learning_rate": 1.947545503457184e-05, "loss": 0.0118, "reward": 0.42578125, "reward_std": 0.21693645045161247, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.42578125, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1936, "grad_norm": 2.5093638240547365, "kl": 1.24609375, "learning_rate": 1.9470983049947446e-05, "loss": 0.0498, "reward": 0.447265625, "reward_std": 0.21914765983819962, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.447265625, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 1020.2421875, "epoch": 0.194, "grad_norm": 4.1943128572534505, "kl": 0.9931640625, "learning_rate": 1.9466492601156964e-05, "loss": 0.0436, "reward": 0.552734375, "reward_std": 0.24681289866566658, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.419921875, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 1023.265625, "epoch": 0.1944, "grad_norm": 4.625121001463761, "kl": 1.921875, "learning_rate": 1.946198369695476e-05, "loss": 0.0787, "reward": 0.498046875, "reward_std": 0.2394549734890461, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1948, "grad_norm": 9.850799165102138, "kl": 2.66015625, "learning_rate": 1.945745634613117e-05, "loss": 0.1064, "reward": 0.51171875, "reward_std": 0.21146715059876442, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.50390625, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 999.28125, "epoch": 0.1952, "grad_norm": 10.07947876362321, "kl": 3.30859375, "learning_rate": 1.9452910557512497e-05, "loss": 0.1286, "reward": 0.435546875, "reward_std": 0.2001444436609745, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.435546875, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 1019.7421875, "epoch": 0.1956, "grad_norm": 46.58895103208046, "kl": 9.15234375, "learning_rate": 1.9448346339960984e-05, "loss": 0.3748, "reward": 0.447265625, "reward_std": 0.19806908816099167, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.431640625, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 1016.9921875, "epoch": 0.196, "grad_norm": 31.321238779932912, "kl": 5.9140625, "learning_rate": 1.944376370237481e-05, "loss": 0.2374, "reward": 0.57421875, "reward_std": 0.1940269097685814, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44921875, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1964, "grad_norm": 15.56530342051669, "kl": 4.13671875, "learning_rate": 1.9439162653688066e-05, "loss": 0.1656, "reward": 0.41015625, "reward_std": 0.1965167000889778, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41015625, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1968, "grad_norm": 7.031375566368837, "kl": 2.6328125, "learning_rate": 1.9434543202870726e-05, "loss": 0.1053, "reward": 0.357421875, "reward_std": 0.23471428453922272, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.357421875, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1972, "grad_norm": 4.92477925256193, "kl": 2.30859375, "learning_rate": 1.9429905358928648e-05, "loss": 0.0924, "reward": 0.375, "reward_std": 0.2064281962811947, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 1017.3828125, "epoch": 0.1976, "grad_norm": 7.237452670426786, "kl": 2.33984375, "learning_rate": 1.9425249130903544e-05, "loss": 0.0902, "reward": 0.640625, "reward_std": 0.2151174694299698, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.390625, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.198, "grad_norm": 4.808381533963946, "kl": 2.5234375, "learning_rate": 1.942057452787297e-05, "loss": 0.1008, "reward": 0.40234375, "reward_std": 0.21959850564599037, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40234375, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1984, "grad_norm": 7.003786460566579, "kl": 2.25, "learning_rate": 1.9415881558950302e-05, "loss": 0.0901, "reward": 0.42578125, "reward_std": 0.22561552375555038, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.42578125, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1988, "grad_norm": 5.038106490016489, "kl": 1.806640625, "learning_rate": 1.9411170233284728e-05, "loss": 0.0723, "reward": 0.43359375, "reward_std": 0.21800953149795532, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1992, "grad_norm": 5.026072487299838, "kl": 1.2109375, "learning_rate": 1.9406440560061214e-05, "loss": 0.0484, "reward": 0.423828125, "reward_std": 0.23926019296050072, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.423828125, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.1996, "grad_norm": 8.286575333256462, "kl": 2.830078125, "learning_rate": 1.9401692548500504e-05, "loss": 0.113, "reward": 0.55078125, "reward_std": 0.23454558104276657, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.42578125, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 1016.5390625, "epoch": 0.2, "grad_norm": 10.416468841888486, "kl": 3.234375, "learning_rate": 1.9396926207859085e-05, "loss": 0.1222, "reward": 0.380859375, "reward_std": 0.2337387278676033, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380859375, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 1023.109375, "epoch": 0.2004, "grad_norm": 18.71936052867134, "kl": 4.89453125, "learning_rate": 1.9392141547429183e-05, "loss": 0.1956, "reward": 0.38671875, "reward_std": 0.21381397917866707, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.38671875, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2008, "grad_norm": 4.215118751993122, "kl": 1.775390625, "learning_rate": 1.9387338576538743e-05, "loss": 0.071, "reward": 0.359375, "reward_std": 0.24846693128347397, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.359375, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2012, "grad_norm": 7.224165368853025, "kl": 2.1015625, "learning_rate": 1.9382517304551397e-05, "loss": 0.0841, "reward": 0.375, "reward_std": 0.2503217123448849, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2016, "grad_norm": 8.783730514952708, "kl": 2.296875, "learning_rate": 1.937767774086646e-05, "loss": 0.0919, "reward": 0.30078125, "reward_std": 0.2350722774863243, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.30078125, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.202, "grad_norm": 8.835724691316859, "kl": 3.03515625, "learning_rate": 1.937281989491892e-05, "loss": 0.1215, "reward": 0.2578125, "reward_std": 0.2308039404451847, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2578125, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 983.25, "epoch": 0.2024, "grad_norm": 7.792983487962017, "kl": 3.83984375, "learning_rate": 1.936794377617938e-05, "loss": 0.1538, "reward": 0.2890625, "reward_std": 0.20251421630382538, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2890625, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 975.9921875, "epoch": 0.2028, "grad_norm": 4.6826754453886865, "kl": 9.125, "learning_rate": 1.9363049394154095e-05, "loss": 0.2255, "reward": 0.32421875, "reward_std": 0.18476006016135216, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.32421875, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 993.875, "epoch": 0.2032, "grad_norm": 5.8442635928592255, "kl": 7.40625, "learning_rate": 1.935813675838491e-05, "loss": 0.2963, "reward": 0.3671875, "reward_std": 0.09183414373546839, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2421875, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 970.25, "epoch": 0.2036, "grad_norm": 3.8849498075253877, "kl": 6.59375, "learning_rate": 1.935320587844926e-05, "loss": 0.2634, "reward": 0.470703125, "reward_std": 0.10140727087855339, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.220703125, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 803.25, "epoch": 0.204, "grad_norm": 3.6974515237972954, "kl": 6.859375, "learning_rate": 1.9348256763960146e-05, "loss": 0.2743, "reward": 0.21875, "reward_std": 0.12627344951033592, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 894.0, "epoch": 0.2044, "grad_norm": 4.188284001405017, "kl": 6.8984375, "learning_rate": 1.9343289424566122e-05, "loss": 0.2756, "reward": 0.478515625, "reward_std": 0.09960681945085526, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.228515625, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 764.0, "epoch": 0.2048, "grad_norm": 22.206074270907443, "kl": 6.75, "learning_rate": 1.933830386995127e-05, "loss": 0.2704, "reward": 0.49609375, "reward_std": 0.10680375248193741, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24609375, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 753.625, "epoch": 0.2052, "grad_norm": 2.907263776007787, "kl": 6.890625, "learning_rate": 1.9333300109835182e-05, "loss": 0.2754, "reward": 0.26171875, "reward_std": 0.10546811111271381, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.26171875, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 844.25, "epoch": 0.2056, "grad_norm": 31.64756904584066, "kl": 6.0546875, "learning_rate": 1.9328278153972947e-05, "loss": 0.2422, "reward": 0.23828125, "reward_std": 0.12086152285337448, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23828125, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 946.828125, "epoch": 0.206, "grad_norm": 13.171112233655526, "kl": 6.5078125, "learning_rate": 1.9323238012155125e-05, "loss": 0.2382, "reward": 0.232421875, "reward_std": 0.13567833043634892, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.224609375, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 975.125, "epoch": 0.2064, "grad_norm": 44.08072444989082, "kl": 6.28125, "learning_rate": 1.9318179694207726e-05, "loss": 0.2514, "reward": 0.271484375, "reward_std": 0.12142620421946049, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.271484375, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 970.375, "epoch": 0.2068, "grad_norm": 8.364594652019923, "kl": 5.671875, "learning_rate": 1.9313103209992205e-05, "loss": 0.2274, "reward": 0.291015625, "reward_std": 0.16547331027686596, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.283203125, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 1022.3828125, "epoch": 0.2072, "grad_norm": 319.1634328724085, "kl": 6.45703125, "learning_rate": 1.9308008569405424e-05, "loss": 0.2582, "reward": 0.287109375, "reward_std": 0.1768728345632553, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.287109375, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2076, "grad_norm": 58.675429976296826, "kl": 5.05859375, "learning_rate": 1.9302895782379648e-05, "loss": 0.2023, "reward": 0.41015625, "reward_std": 0.21251621842384338, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23828125, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.208, "grad_norm": 2.8093099998753877, "kl": 5.96875, "learning_rate": 1.9297764858882516e-05, "loss": 0.2389, "reward": 0.240234375, "reward_std": 0.10021634586155415, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.240234375, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 1005.625, "epoch": 0.2084, "grad_norm": 40.24202919149702, "kl": 6.53125, "learning_rate": 1.9292615808917027e-05, "loss": 0.2615, "reward": 0.255859375, "reward_std": 0.10949129424989223, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.255859375, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 848.25, "epoch": 0.2088, "grad_norm": 1.306383265920365, "kl": 6.5625, "learning_rate": 1.9287448642521513e-05, "loss": 0.2626, "reward": 0.3828125, "reward_std": 0.07398691028356552, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2578125, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 990.765625, "epoch": 0.2092, "grad_norm": 4.578338859979999, "kl": 5.7890625, "learning_rate": 1.9282263369769633e-05, "loss": 0.2319, "reward": 0.23828125, "reward_std": 0.09111380577087402, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23828125, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 933.25, "epoch": 0.2096, "grad_norm": 1.740779570580508, "kl": 6.3203125, "learning_rate": 1.9277060000770342e-05, "loss": 0.2525, "reward": 0.232421875, "reward_std": 0.07618820667266846, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.232421875, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 919.75, "epoch": 0.21, "grad_norm": 1.6235406977084157, "kl": 6.2578125, "learning_rate": 1.9271838545667876e-05, "loss": 0.2508, "reward": 0.2421875, "reward_std": 0.08988374099135399, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2421875, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 1004.125, "epoch": 0.2104, "grad_norm": 1.6752236848870772, "kl": 5.703125, "learning_rate": 1.9266599014641724e-05, "loss": 0.2202, "reward": 0.228515625, "reward_std": 0.08170996606349945, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.228515625, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 829.625, "epoch": 0.2108, "grad_norm": 0.9469281465122057, "kl": 6.03125, "learning_rate": 1.9261341417906622e-05, "loss": 0.2413, "reward": 0.353515625, "reward_std": 0.07476464658975601, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.228515625, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 1000.75, "epoch": 0.2112, "grad_norm": 0.9447515703369984, "kl": 6.140625, "learning_rate": 1.9256065765712524e-05, "loss": 0.246, "reward": 0.333984375, "reward_std": 0.08400248736143112, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.208984375, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2116, "grad_norm": 23.771783940410437, "kl": 10.5625, "learning_rate": 1.925077206834458e-05, "loss": 0.423, "reward": 0.212890625, "reward_std": 0.10510109178721905, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.212890625, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 1018.0078125, "epoch": 0.212, "grad_norm": 1.4924183662791475, "kl": 6.6171875, "learning_rate": 1.9245460336123136e-05, "loss": 0.2556, "reward": 0.337890625, "reward_std": 0.09377043507993221, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.212890625, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 1013.6328125, "epoch": 0.2124, "grad_norm": 3.268256016466928, "kl": 6.875, "learning_rate": 1.924013057940367e-05, "loss": 0.2711, "reward": 0.2265625, "reward_std": 0.08295785263180733, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2265625, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 1018.875, "epoch": 0.2128, "grad_norm": 16.775030818697797, "kl": 6.8125, "learning_rate": 1.9234782808576823e-05, "loss": 0.2727, "reward": 0.2109375, "reward_std": 0.08824118599295616, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2109375, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 1000.75, "epoch": 0.2132, "grad_norm": 3.7703196515239927, "kl": 7.0, "learning_rate": 1.9229417034068352e-05, "loss": 0.2806, "reward": 0.224609375, "reward_std": 0.07616276573389769, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.224609375, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 919.3203125, "epoch": 0.2136, "grad_norm": 3.9625970948354015, "kl": 7.0625, "learning_rate": 1.9224033266339103e-05, "loss": 0.2766, "reward": 0.228515625, "reward_std": 0.08469227328896523, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.228515625, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 795.625, "epoch": 0.214, "grad_norm": 2.841842872956771, "kl": 8.8125, "learning_rate": 1.9218631515885007e-05, "loss": 0.3513, "reward": 0.24609375, "reward_std": 0.05346224643290043, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24609375, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 442.0, "epoch": 0.2144, "grad_norm": 38.48349768716312, "kl": 14.375, "learning_rate": 1.9213211793237056e-05, "loss": 0.5744, "reward": 0.251953125, "reward_std": 0.03411140665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.251953125, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 399.25, "epoch": 0.2148, "grad_norm": 3.5774450758918928, "kl": 8.0703125, "learning_rate": 1.9207774108961273e-05, "loss": 0.323, "reward": 0.376953125, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.251953125, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 556.5, "epoch": 0.2152, "grad_norm": 3.115904123890989, "kl": 7.953125, "learning_rate": 1.9202318473658707e-05, "loss": 0.3186, "reward": 0.369140625, "reward_std": 0.0611472949385643, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.244140625, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 347.5, "epoch": 0.2156, "grad_norm": 3.8312136370688, "kl": 8.8125, "learning_rate": 1.9196844897965393e-05, "loss": 0.3526, "reward": 0.40625, "reward_std": 0.09066247940063477, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.28125, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 366.625, "epoch": 0.216, "grad_norm": 2.3358316468072435, "kl": 7.640625, "learning_rate": 1.9191353392552346e-05, "loss": 0.3056, "reward": 0.37890625, "reward_std": 0.1450735665857792, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.37890625, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 497.75, "epoch": 0.2164, "grad_norm": 1.2918014400765936, "kl": 6.1484375, "learning_rate": 1.9185843968125543e-05, "loss": 0.2461, "reward": 0.4375, "reward_std": 0.16834889352321625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 917.5, "epoch": 0.2168, "grad_norm": 1.8065205235153414, "kl": 4.6796875, "learning_rate": 1.9180316635425883e-05, "loss": 0.1869, "reward": 0.3828125, "reward_std": 0.26895464956760406, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2172, "grad_norm": 3.588276666671289, "kl": 3.1328125, "learning_rate": 1.9174771405229187e-05, "loss": 0.1255, "reward": 0.404296875, "reward_std": 0.25254785269498825, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.279296875, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2176, "grad_norm": 3.454477520620655, "kl": 4.484375, "learning_rate": 1.9169208288346168e-05, "loss": 0.1797, "reward": 0.380859375, "reward_std": 0.2818485200405121, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380859375, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.218, "grad_norm": 2.2860194270667846, "kl": 2.451171875, "learning_rate": 1.9163627295622397e-05, "loss": 0.0981, "reward": 0.32421875, "reward_std": 0.2526574656367302, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.32421875, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2184, "grad_norm": 1.101447162064611, "kl": 2.14453125, "learning_rate": 1.9158028437938316e-05, "loss": 0.0858, "reward": 0.482421875, "reward_std": 0.2453574240207672, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2188, "grad_norm": 1.0761369319490606, "kl": 3.94921875, "learning_rate": 1.9152411726209176e-05, "loss": 0.158, "reward": 0.615234375, "reward_std": 0.21812774240970612, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2192, "grad_norm": 1.172831624434013, "kl": 3.921875, "learning_rate": 1.914677717138505e-05, "loss": 0.1568, "reward": 0.412109375, "reward_std": 0.22084975987672806, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.412109375, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2196, "grad_norm": 3.0168909526097485, "kl": 4.22265625, "learning_rate": 1.914112478445079e-05, "loss": 0.1687, "reward": 0.40625, "reward_std": 0.18925124779343605, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40625, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.22, "grad_norm": 28.283073484939635, "kl": 6.3359375, "learning_rate": 1.913545457642601e-05, "loss": 0.2526, "reward": 0.28125, "reward_std": 0.2087155617773533, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.28125, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 1017.4296875, "epoch": 0.2204, "grad_norm": 5.294613756984718, "kl": 1.08984375, "learning_rate": 1.9129766558365076e-05, "loss": 0.0448, "reward": 0.166015625, "reward_std": 0.17571072280406952, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.166015625, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 1013.171875, "epoch": 0.2208, "grad_norm": 2.7121935462196114, "kl": 2.419921875, "learning_rate": 1.9124060741357065e-05, "loss": 0.0799, "reward": 0.26953125, "reward_std": 0.24329476058483124, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.26953125, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 1022.1171875, "epoch": 0.2212, "grad_norm": 1.7577919134237951, "kl": 2.8515625, "learning_rate": 1.911833713652576e-05, "loss": 0.1125, "reward": 0.2109375, "reward_std": 0.21176189556717873, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2109375, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2216, "grad_norm": 2.880884853509567, "kl": 4.9921875, "learning_rate": 1.9112595755029625e-05, "loss": 0.1998, "reward": 0.30859375, "reward_std": 0.18840409442782402, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.30859375, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 966.0390625, "epoch": 0.222, "grad_norm": 3.2938569492319987, "kl": 6.140625, "learning_rate": 1.910683660806177e-05, "loss": 0.2332, "reward": 0.341796875, "reward_std": 0.1673884503543377, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.341796875, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 662.0, "epoch": 0.2224, "grad_norm": 35.951178585012364, "kl": 13.8125, "learning_rate": 1.9101059706849957e-05, "loss": 0.5523, "reward": 0.4375, "reward_std": 0.15353985503315926, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 658.875, "epoch": 0.2228, "grad_norm": 14.068022459184666, "kl": 9.625, "learning_rate": 1.9095265062656546e-05, "loss": 0.3847, "reward": 0.322265625, "reward_std": 0.13853202387690544, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.322265625, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 437.25, "epoch": 0.2232, "grad_norm": 2.895975042157716, "kl": 6.9921875, "learning_rate": 1.908945268677849e-05, "loss": 0.2797, "reward": 0.353515625, "reward_std": 0.13440436124801636, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.353515625, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 585.5, "epoch": 0.2236, "grad_norm": 9.151225817997743, "kl": 7.09375, "learning_rate": 1.9083622590547313e-05, "loss": 0.2832, "reward": 0.53125, "reward_std": 0.14818710833787918, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40625, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 473.25, "epoch": 0.224, "grad_norm": 14.115256514624214, "kl": 6.65625, "learning_rate": 1.907777478532909e-05, "loss": 0.2669, "reward": 0.57421875, "reward_std": 0.12247166410088539, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44921875, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 712.375, "epoch": 0.2244, "grad_norm": 5.293235082885403, "kl": 6.9921875, "learning_rate": 1.907190928252441e-05, "loss": 0.2802, "reward": 0.587890625, "reward_std": 0.12299356143921614, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455078125, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2248, "grad_norm": 10.946770255030147, "kl": 9.46875, "learning_rate": 1.906602609356838e-05, "loss": 0.3792, "reward": 0.412109375, "reward_std": 0.20475443080067635, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.287109375, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2252, "grad_norm": 17.95840561167463, "kl": 8.234375, "learning_rate": 1.9060125229930572e-05, "loss": 0.3299, "reward": 0.146484375, "reward_std": 0.18192989379167557, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.146484375, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2256, "grad_norm": 16.116455811275593, "kl": 9.4921875, "learning_rate": 1.905420670311502e-05, "loss": 0.3796, "reward": 0.291015625, "reward_std": 0.19466528296470642, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.166015625, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 1022.9375, "epoch": 0.226, "grad_norm": 5.208754410052853, "kl": 7.171875, "learning_rate": 1.9048270524660197e-05, "loss": 0.2864, "reward": 0.158203125, "reward_std": 0.18174222856760025, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.158203125, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2264, "grad_norm": 6.215013278588907, "kl": 7.5625, "learning_rate": 1.9042316706138987e-05, "loss": 0.3029, "reward": 0.32421875, "reward_std": 0.20288554579019547, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.19921875, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2268, "grad_norm": 6.903764897201821, "kl": 9.703125, "learning_rate": 1.9036345259158667e-05, "loss": 0.3884, "reward": 0.3515625, "reward_std": 0.2062261402606964, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2265625, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2272, "grad_norm": 4.7339319292119635, "kl": 7.328125, "learning_rate": 1.9030356195360875e-05, "loss": 0.2933, "reward": 0.302734375, "reward_std": 0.18570903688669205, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.177734375, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 836.75, "epoch": 0.2276, "grad_norm": 3.693222367982453, "kl": 7.2421875, "learning_rate": 1.9024349526421596e-05, "loss": 0.2894, "reward": 0.4375, "reward_std": 0.18312306329607964, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.228, "grad_norm": 1.9965381133896662, "kl": 6.8984375, "learning_rate": 1.901832526405114e-05, "loss": 0.2757, "reward": 0.267578125, "reward_std": 0.188631571829319, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.267578125, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2284, "grad_norm": 3.1066303286323698, "kl": 6.3671875, "learning_rate": 1.9012283419994115e-05, "loss": 0.2547, "reward": 0.287109375, "reward_std": 0.19878660887479782, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.287109375, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2288, "grad_norm": 3.678131992536812, "kl": 7.5390625, "learning_rate": 1.9006224006029404e-05, "loss": 0.3016, "reward": 0.31640625, "reward_std": 0.23015020042657852, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.30078125, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 842.5, "epoch": 0.2292, "grad_norm": 9.699928707804037, "kl": 6.234375, "learning_rate": 1.9000147033970148e-05, "loss": 0.2498, "reward": 0.34765625, "reward_std": 0.17282183840870857, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.34765625, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 884.375, "epoch": 0.2296, "grad_norm": 3.2766300468552134, "kl": 6.6875, "learning_rate": 1.899405251566371e-05, "loss": 0.2673, "reward": 0.48828125, "reward_std": 0.18254829570651054, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.36328125, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 940.375, "epoch": 0.23, "grad_norm": 1.0255504523382193, "kl": 6.6640625, "learning_rate": 1.8987940462991673e-05, "loss": 0.2666, "reward": 0.32421875, "reward_std": 0.18876251950860023, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.32421875, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2304, "grad_norm": 2.416598147470084, "kl": 6.828125, "learning_rate": 1.8981810887869784e-05, "loss": 0.2729, "reward": 0.3203125, "reward_std": 0.1914542391896248, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3203125, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2308, "grad_norm": 2.7116642496198065, "kl": 6.7265625, "learning_rate": 1.8975663802247978e-05, "loss": 0.2688, "reward": 0.2890625, "reward_std": 0.20227031782269478, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2890625, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2312, "grad_norm": 1.2461536805953777, "kl": 5.5234375, "learning_rate": 1.8969499218110302e-05, "loss": 0.221, "reward": 0.298828125, "reward_std": 0.21203197538852692, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.298828125, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2316, "grad_norm": 14.109582973998457, "kl": 3.95703125, "learning_rate": 1.896331714747493e-05, "loss": 0.1585, "reward": 0.427734375, "reward_std": 0.20737522095441818, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.302734375, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.232, "grad_norm": 3.518163279389842, "kl": 3.8671875, "learning_rate": 1.895711760239413e-05, "loss": 0.1546, "reward": 0.26171875, "reward_std": 0.20313510671257973, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.26171875, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2324, "grad_norm": 10.271682317845073, "kl": 4.4140625, "learning_rate": 1.8950900594954226e-05, "loss": 0.1763, "reward": 0.28515625, "reward_std": 0.2011103555560112, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.28515625, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2328, "grad_norm": 2.9873670014822915, "kl": 5.953125, "learning_rate": 1.89446661372756e-05, "loss": 0.2381, "reward": 0.326171875, "reward_std": 0.18761280551552773, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.326171875, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2332, "grad_norm": 1.1485554600828392, "kl": 5.796875, "learning_rate": 1.893841424151264e-05, "loss": 0.232, "reward": 0.421875, "reward_std": 0.18315363302826881, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.296875, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2336, "grad_norm": 88.12032820038812, "kl": 7.0703125, "learning_rate": 1.893214491985374e-05, "loss": 0.2826, "reward": 0.357421875, "reward_std": 0.19123699888586998, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.357421875, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 918.625, "epoch": 0.234, "grad_norm": 1.4269597218263141, "kl": 5.96875, "learning_rate": 1.892585818452126e-05, "loss": 0.2387, "reward": 0.33984375, "reward_std": 0.1726234406232834, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.33984375, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 915.5, "epoch": 0.2344, "grad_norm": 2.2765805894911963, "kl": 6.4140625, "learning_rate": 1.8919554047771508e-05, "loss": 0.257, "reward": 0.3828125, "reward_std": 0.1898644082248211, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2348, "grad_norm": 5.924901773384549, "kl": 6.109375, "learning_rate": 1.8913232521894734e-05, "loss": 0.2447, "reward": 0.333984375, "reward_std": 0.22421370446681976, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.333984375, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2352, "grad_norm": 1.4339385126959427, "kl": 5.84375, "learning_rate": 1.890689361921507e-05, "loss": 0.2337, "reward": 0.30078125, "reward_std": 0.23512886464595795, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.30078125, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2356, "grad_norm": 20.610589216172244, "kl": 6.671875, "learning_rate": 1.8900537352090523e-05, "loss": 0.2671, "reward": 0.501953125, "reward_std": 0.2356841191649437, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.361328125, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.236, "grad_norm": 1.5419745726975345, "kl": 5.1875, "learning_rate": 1.889416373291298e-05, "loss": 0.2076, "reward": 0.265625, "reward_std": 0.2381608486175537, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.265625, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 925.625, "epoch": 0.2364, "grad_norm": 1.2265598095677388, "kl": 5.625, "learning_rate": 1.8887772774108116e-05, "loss": 0.225, "reward": 0.490234375, "reward_std": 0.18174729868769646, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.365234375, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 924.125, "epoch": 0.2368, "grad_norm": 7.856479759266313, "kl": 5.40625, "learning_rate": 1.8881364488135448e-05, "loss": 0.2165, "reward": 0.337890625, "reward_std": 0.20485017448663712, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.337890625, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 1023.828125, "epoch": 0.2372, "grad_norm": 1.7502570518618974, "kl": 6.203125, "learning_rate": 1.887493888748825e-05, "loss": 0.2482, "reward": 0.36328125, "reward_std": 0.20881887152791023, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.36328125, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2376, "grad_norm": 2.595005633098968, "kl": 6.6640625, "learning_rate": 1.886849598469356e-05, "loss": 0.2663, "reward": 0.373046875, "reward_std": 0.207528967410326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.373046875, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 924.375, "epoch": 0.238, "grad_norm": 2.3639532654560154, "kl": 6.28125, "learning_rate": 1.8862035792312148e-05, "loss": 0.2511, "reward": 0.427734375, "reward_std": 0.16025875136256218, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.427734375, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2384, "grad_norm": 5.056116734809125, "kl": 6.796875, "learning_rate": 1.8855558322938492e-05, "loss": 0.2719, "reward": 0.353515625, "reward_std": 0.21608265489339828, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.353515625, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2388, "grad_norm": 11.754497087245332, "kl": 9.1953125, "learning_rate": 1.8849063589200744e-05, "loss": 0.367, "reward": 0.400390625, "reward_std": 0.19715679436922073, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.400390625, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 825.875, "epoch": 0.2392, "grad_norm": 6.227797599796809, "kl": 7.5078125, "learning_rate": 1.8842551603760725e-05, "loss": 0.3005, "reward": 0.5234375, "reward_std": 0.17331377044320107, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3984375, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 913.3359375, "epoch": 0.2396, "grad_norm": 12.93249580271641, "kl": 10.421875, "learning_rate": 1.8836022379313884e-05, "loss": 0.4003, "reward": 0.404296875, "reward_std": 0.18322282284498215, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.404296875, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 1016.6875, "epoch": 0.24, "grad_norm": 20.009401758370927, "kl": 10.0, "learning_rate": 1.8829475928589272e-05, "loss": 0.393, "reward": 0.306640625, "reward_std": 0.23094597458839417, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.306640625, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 1022.046875, "epoch": 0.2404, "grad_norm": 4.865382421996224, "kl": 4.7890625, "learning_rate": 1.8822912264349535e-05, "loss": 0.1906, "reward": 0.3828125, "reward_std": 0.23982908576726913, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2578125, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2408, "grad_norm": 2.4757571628869846, "kl": 5.7421875, "learning_rate": 1.881633139939087e-05, "loss": 0.2299, "reward": 0.345703125, "reward_std": 0.223719272762537, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.345703125, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 1011.0, "epoch": 0.2412, "grad_norm": 2.6450418912141704, "kl": 6.296875, "learning_rate": 1.8809733346543013e-05, "loss": 0.2519, "reward": 0.39453125, "reward_std": 0.198756605386734, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.39453125, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 961.6640625, "epoch": 0.2416, "grad_norm": 69.77638526880162, "kl": 10.3515625, "learning_rate": 1.8803118118669203e-05, "loss": 0.4096, "reward": 0.517578125, "reward_std": 0.19684484973549843, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392578125, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 784.625, "epoch": 0.242, "grad_norm": 5.132245586535982, "kl": 10.0, "learning_rate": 1.879648572866617e-05, "loss": 0.4, "reward": 0.427734375, "reward_std": 0.1411793828010559, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.427734375, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 886.25, "epoch": 0.2424, "grad_norm": 88.87498080123395, "kl": 28.953125, "learning_rate": 1.878983618946409e-05, "loss": 1.1575, "reward": 0.56640625, "reward_std": 0.17277343571186066, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 750.5, "epoch": 0.2428, "grad_norm": 17.453255386163583, "kl": 13.6015625, "learning_rate": 1.878316951402658e-05, "loss": 0.5438, "reward": 0.43359375, "reward_std": 0.15166015923023224, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 968.5, "epoch": 0.2432, "grad_norm": 9.915121871753922, "kl": 7.3125, "learning_rate": 1.8776485715350672e-05, "loss": 0.2924, "reward": 0.400390625, "reward_std": 0.19899985566735268, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.400390625, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 880.125, "epoch": 0.2436, "grad_norm": 7.702563163726805, "kl": 6.8984375, "learning_rate": 1.8769784806466768e-05, "loss": 0.2756, "reward": 0.537109375, "reward_std": 0.2053782343864441, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.404296875, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 827.25, "epoch": 0.244, "grad_norm": 8.720914402288118, "kl": 7.03125, "learning_rate": 1.8763066800438638e-05, "loss": 0.2811, "reward": 0.548828125, "reward_std": 0.20109709724783897, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.416015625, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 871.75, "epoch": 0.2444, "grad_norm": 5.231939637670873, "kl": 8.6875, "learning_rate": 1.8756331710363375e-05, "loss": 0.348, "reward": 0.44140625, "reward_std": 0.15225761011242867, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2448, "grad_norm": 271.1457465484224, "kl": 29.890625, "learning_rate": 1.874957954937138e-05, "loss": 1.1946, "reward": 0.2421875, "reward_std": 0.13224306143820286, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2421875, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2452, "grad_norm": 52.994625221993935, "kl": 12.5078125, "learning_rate": 1.8742810330626338e-05, "loss": 0.5006, "reward": 0.357421875, "reward_std": 0.04808073490858078, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.232421875, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2456, "grad_norm": 8.316170826963761, "kl": 3.3310546875, "learning_rate": 1.8736024067325188e-05, "loss": 0.1328, "reward": 0.24609375, "reward_std": 0.05513726267963648, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24609375, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.246, "grad_norm": 4.901594852085314, "kl": 1.29052734375, "learning_rate": 1.8729220772698096e-05, "loss": 0.0516, "reward": 0.234375, "reward_std": 0.04670868441462517, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.234375, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2464, "grad_norm": 26.89441159522204, "kl": 7.091796875, "learning_rate": 1.8722400460008437e-05, "loss": 0.284, "reward": 0.46484375, "reward_std": 0.09613480046391487, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21484375, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2468, "grad_norm": 31.025584553162147, "kl": 7.576171875, "learning_rate": 1.8715563142552758e-05, "loss": 0.303, "reward": 0.23046875, "reward_std": 0.07902417331933975, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.23046875, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2472, "grad_norm": 35.42181580911603, "kl": 7.84375, "learning_rate": 1.8708708833660755e-05, "loss": 0.3128, "reward": 0.224609375, "reward_std": 0.06448355130851269, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.224609375, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 1023.890625, "epoch": 0.2476, "grad_norm": 8.735048499211693, "kl": 1.6640625, "learning_rate": 1.870183754669526e-05, "loss": 0.0663, "reward": 0.22265625, "reward_std": 0.0778178097680211, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.22265625, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.248, "grad_norm": 6.1552121707566085, "kl": 1.89453125, "learning_rate": 1.869494929505219e-05, "loss": 0.0758, "reward": 0.234375, "reward_std": 0.04670868441462517, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.234375, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2484, "grad_norm": 8.759735120922036, "kl": 1.875, "learning_rate": 1.8688044092160554e-05, "loss": 0.075, "reward": 0.3515625, "reward_std": 0.07407307997345924, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2265625, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2488, "grad_norm": 6.315051189885584, "kl": 2.19140625, "learning_rate": 1.8681121951482397e-05, "loss": 0.0874, "reward": 0.234375, "reward_std": 0.06525811273604631, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.234375, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 999.875, "epoch": 0.2492, "grad_norm": 2.350104993471344, "kl": 3.421875, "learning_rate": 1.8674182886512776e-05, "loss": 0.137, "reward": 0.244140625, "reward_std": 0.012597277760505676, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.244140625, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 1020.625, "epoch": 0.2496, "grad_norm": 17.41934343173672, "kl": 3.47265625, "learning_rate": 1.8667226910779767e-05, "loss": 0.1388, "reward": 0.25390625, "reward_std": 0.04867746960371733, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25390625, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 950.375, "epoch": 0.25, "grad_norm": 2.1315829839311866, "kl": 3.64453125, "learning_rate": 1.866025403784439e-05, "loss": 0.1456, "reward": 0.248046875, "reward_std": 0.029454081319272518, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.248046875, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 997.0, "epoch": 0.2504, "grad_norm": 9.019007079016674, "kl": 2.833984375, "learning_rate": 1.8653264281300622e-05, "loss": 0.1132, "reward": 0.259765625, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.259765625, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2508, "grad_norm": 14.739489173420603, "kl": 1.12890625, "learning_rate": 1.864625765477535e-05, "loss": 0.0452, "reward": 0.3671875, "reward_std": 0.04192390665411949, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2421875, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2512, "grad_norm": 0.7576305822346614, "kl": 0.43212890625, "learning_rate": 1.8639234171928355e-05, "loss": 0.0173, "reward": 0.373046875, "reward_std": 0.04478531330823898, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.248046875, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2516, "grad_norm": 11.848663984661417, "kl": 3.181640625, "learning_rate": 1.863219384645227e-05, "loss": 0.127, "reward": 0.2734375, "reward_std": 0.08618908002972603, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2734375, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 930.375, "epoch": 0.252, "grad_norm": 2.673341327489382, "kl": 5.8515625, "learning_rate": 1.8625136692072577e-05, "loss": 0.2342, "reward": 0.423828125, "reward_std": 0.11240099370479584, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.423828125, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 427.75, "epoch": 0.2524, "grad_norm": 1.000603552474389, "kl": 5.3671875, "learning_rate": 1.861806272254755e-05, "loss": 0.2144, "reward": 0.4921875, "reward_std": 0.02629890665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 271.75, "epoch": 0.2528, "grad_norm": 0.8589993820238908, "kl": 4.1796875, "learning_rate": 1.8610971951668265e-05, "loss": 0.1671, "reward": 0.486328125, "reward_std": 0.04387271963059902, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 741.5, "epoch": 0.2532, "grad_norm": 0.5597933539593796, "kl": 5.703125, "learning_rate": 1.8603864393258534e-05, "loss": 0.2283, "reward": 0.611328125, "reward_std": 0.07771172747015953, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 798.0, "epoch": 0.2536, "grad_norm": 0.46471223298420555, "kl": 4.84375, "learning_rate": 1.8596740061174912e-05, "loss": 0.1936, "reward": 0.443359375, "reward_std": 0.09605142660439014, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443359375, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 945.0, "epoch": 0.254, "grad_norm": 2.2986108664776013, "kl": 3.703125, "learning_rate": 1.8589598969306646e-05, "loss": 0.1482, "reward": 0.509765625, "reward_std": 0.14286286011338234, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.384765625, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2544, "grad_norm": 0.3629279149153914, "kl": 4.5, "learning_rate": 1.8582441131575658e-05, "loss": 0.1802, "reward": 0.376953125, "reward_std": 0.157521553337574, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.376953125, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 845.5, "epoch": 0.2548, "grad_norm": 0.25394680165297, "kl": 4.5078125, "learning_rate": 1.8575266561936526e-05, "loss": 0.1806, "reward": 0.431640625, "reward_std": 0.11520390398800373, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.431640625, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 941.75, "epoch": 0.2552, "grad_norm": 0.1356066033320897, "kl": 4.8515625, "learning_rate": 1.856807527437643e-05, "loss": 0.194, "reward": 0.537109375, "reward_std": 0.11120268329977989, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.412109375, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2556, "grad_norm": 0.262146131150298, "kl": 4.4375, "learning_rate": 1.8560867282915164e-05, "loss": 0.1776, "reward": 0.388671875, "reward_std": 0.13852989673614502, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.388671875, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.256, "grad_norm": 0.8407460772069775, "kl": 5.25, "learning_rate": 1.855364260160507e-05, "loss": 0.2096, "reward": 0.408203125, "reward_std": 0.11826404929161072, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.408203125, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2564, "grad_norm": 0.2414139681452926, "kl": 4.44140625, "learning_rate": 1.854640124453103e-05, "loss": 0.1774, "reward": 0.50390625, "reward_std": 0.14352592080831528, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.37890625, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 838.625, "epoch": 0.2568, "grad_norm": 1.8109121777570043, "kl": 4.9609375, "learning_rate": 1.8539143225810453e-05, "loss": 0.1984, "reward": 0.44140625, "reward_std": 0.07850531488656998, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2572, "grad_norm": 0.4919288536774624, "kl": 6.609375, "learning_rate": 1.8531868559593205e-05, "loss": 0.2642, "reward": 0.46875, "reward_std": 0.09633388184010983, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 846.75, "epoch": 0.2576, "grad_norm": 1.0552441639185657, "kl": 6.5859375, "learning_rate": 1.8524577260061628e-05, "loss": 0.2631, "reward": 0.470703125, "reward_std": 0.07525954768061638, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470703125, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 925.5, "epoch": 0.258, "grad_norm": 0.3977189897891455, "kl": 6.3359375, "learning_rate": 1.851726934143048e-05, "loss": 0.2535, "reward": 0.96484375, "reward_std": 0.08879486098885536, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 827.5, "epoch": 0.2584, "grad_norm": 1.301246894316982, "kl": 6.453125, "learning_rate": 1.850994481794692e-05, "loss": 0.2579, "reward": 0.462890625, "reward_std": 0.08199471235275269, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462890625, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 827.375, "epoch": 0.2588, "grad_norm": 0.21695728931031147, "kl": 6.3125, "learning_rate": 1.8502603703890488e-05, "loss": 0.2522, "reward": 0.466796875, "reward_std": 0.08331314846873283, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 828.5, "epoch": 0.2592, "grad_norm": 1.6435639221539988, "kl": 6.484375, "learning_rate": 1.8495246013573057e-05, "loss": 0.2595, "reward": 0.599609375, "reward_std": 0.06997986882925034, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2596, "grad_norm": 24.995658195580408, "kl": 2.095703125, "learning_rate": 1.848787176133882e-05, "loss": 0.0838, "reward": 0.529296875, "reward_std": 0.13310055434703827, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.404296875, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.26, "grad_norm": 1.4073881766926783, "kl": 0.253662109375, "learning_rate": 1.848048096156426e-05, "loss": 0.0101, "reward": 0.373046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.248046875, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2604, "grad_norm": 4.786092366307188, "kl": 0.49267578125, "learning_rate": 1.8473073628658123e-05, "loss": 0.0197, "reward": 0.37109375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24609375, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2608, "grad_norm": 5.901506679068674, "kl": 0.7880859375, "learning_rate": 1.8465649777061377e-05, "loss": 0.0315, "reward": 0.2421875, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2421875, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2612, "grad_norm": 1.9975973409986347, "kl": 0.712890625, "learning_rate": 1.8458209421247208e-05, "loss": 0.0285, "reward": 0.236328125, "reward_std": 0.04478531330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.236328125, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2616, "grad_norm": 0.526158847966476, "kl": 0.5556640625, "learning_rate": 1.8450752575720967e-05, "loss": 0.0222, "reward": 0.244140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.244140625, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.262, "grad_norm": 0.36005840558536456, "kl": 0.5771484375, "learning_rate": 1.8443279255020153e-05, "loss": 0.0231, "reward": 0.24609375, "reward_std": 0.02629890665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24609375, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2624, "grad_norm": 0.564648602680186, "kl": 0.6025390625, "learning_rate": 1.843578947371439e-05, "loss": 0.0241, "reward": 0.234375, "reward_std": 0.04670868441462517, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.234375, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2628, "grad_norm": 0.39181724483647723, "kl": 0.5458984375, "learning_rate": 1.842828324640539e-05, "loss": 0.0218, "reward": 0.244140625, "reward_std": 0.034848387353122234, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.244140625, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2632, "grad_norm": 0.5098964581491972, "kl": 0.53125, "learning_rate": 1.8420760587726925e-05, "loss": 0.0212, "reward": 0.34375, "reward_std": 0.07889671996235847, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.21875, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2636, "grad_norm": 0.4352784447368266, "kl": 0.4658203125, "learning_rate": 1.8413221512344805e-05, "loss": 0.0186, "reward": 0.236328125, "reward_std": 0.0878928629681468, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.236328125, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.264, "grad_norm": 0.5195421639527426, "kl": 0.43603515625, "learning_rate": 1.8405666034956842e-05, "loss": 0.0174, "reward": 0.390625, "reward_std": 0.07436684798449278, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.265625, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2644, "grad_norm": 0.5746611252438569, "kl": 0.4072265625, "learning_rate": 1.839809417029283e-05, "loss": 0.0163, "reward": 0.521484375, "reward_std": 0.11909948103129864, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.271484375, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2648, "grad_norm": 0.5666288259388627, "kl": 0.38720703125, "learning_rate": 1.8390505933114503e-05, "loss": 0.0155, "reward": 0.3125, "reward_std": 0.14080403000116348, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2652, "grad_norm": 0.5625717528393828, "kl": 0.39990234375, "learning_rate": 1.8382901338215515e-05, "loss": 0.016, "reward": 0.310546875, "reward_std": 0.1395394578576088, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.310546875, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2656, "grad_norm": 0.43154276734276037, "kl": 0.3447265625, "learning_rate": 1.837528040042142e-05, "loss": 0.0138, "reward": 0.35546875, "reward_std": 0.1880146749317646, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.33984375, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.266, "grad_norm": 0.4358568920698131, "kl": 0.330078125, "learning_rate": 1.836764313458962e-05, "loss": 0.0132, "reward": 0.353515625, "reward_std": 0.18280309438705444, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.353515625, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2664, "grad_norm": 0.33831865346328316, "kl": 0.28955078125, "learning_rate": 1.8359989555609355e-05, "loss": 0.0116, "reward": 0.318359375, "reward_std": 0.15534432977437973, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.318359375, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2668, "grad_norm": 0.34720929546955076, "kl": 0.2822265625, "learning_rate": 1.8352319678401677e-05, "loss": 0.0113, "reward": 0.33984375, "reward_std": 0.16107267141342163, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.33203125, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2672, "grad_norm": 0.33675778514771315, "kl": 0.276123046875, "learning_rate": 1.834463351791939e-05, "loss": 0.011, "reward": 0.46875, "reward_std": 0.1529843993484974, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.34375, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2676, "grad_norm": 41.02692609824194, "kl": 7.443115234375, "learning_rate": 1.8336931089147076e-05, "loss": 0.298, "reward": 0.404296875, "reward_std": 0.1897098422050476, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.404296875, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.268, "grad_norm": 0.4498652287408562, "kl": 0.2880859375, "learning_rate": 1.8329212407100996e-05, "loss": 0.0115, "reward": 0.4140625, "reward_std": 0.19711381569504738, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.328125, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2684, "grad_norm": 0.6256091264916601, "kl": 0.2802734375, "learning_rate": 1.8321477486829128e-05, "loss": 0.0112, "reward": 0.3515625, "reward_std": 0.15614674240350723, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3515625, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 1021.28125, "epoch": 0.2688, "grad_norm": 1.3252359672200325, "kl": 0.5234375, "learning_rate": 1.8313726343411085e-05, "loss": 0.0206, "reward": 0.341796875, "reward_std": 0.16556796804070473, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.341796875, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2692, "grad_norm": 0.7653643879437947, "kl": 0.3271484375, "learning_rate": 1.830595899195813e-05, "loss": 0.0131, "reward": 0.330078125, "reward_std": 0.18153128772974014, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.314453125, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2696, "grad_norm": 2.596781419914745, "kl": 0.52783203125, "learning_rate": 1.82981754476131e-05, "loss": 0.0211, "reward": 0.53125, "reward_std": 0.19032930582761765, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3203125, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.27, "grad_norm": 7.014017394961875, "kl": 0.6240234375, "learning_rate": 1.8290375725550417e-05, "loss": 0.025, "reward": 0.3359375, "reward_std": 0.19849364459514618, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3359375, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2704, "grad_norm": 10.682054412820884, "kl": 3.6376953125, "learning_rate": 1.8282559840976043e-05, "loss": 0.1451, "reward": 0.380859375, "reward_std": 0.2236599214375019, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.310546875, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2708, "grad_norm": 4.474099233938197, "kl": 2.2734375, "learning_rate": 1.827472780912744e-05, "loss": 0.0911, "reward": 0.392578125, "reward_std": 0.14433257281780243, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.267578125, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 968.5, "epoch": 0.2712, "grad_norm": 3.5142476193921257, "kl": 4.9609375, "learning_rate": 1.8266879645273557e-05, "loss": 0.1981, "reward": 0.408203125, "reward_std": 0.13439185358583927, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.283203125, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 711.25, "epoch": 0.2716, "grad_norm": 0.39854739424777474, "kl": 5.2265625, "learning_rate": 1.8259015364714786e-05, "loss": 0.2094, "reward": 0.3828125, "reward_std": 0.222596175968647, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.34375, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 568.875, "epoch": 0.272, "grad_norm": 0.897570414944568, "kl": 5.4296875, "learning_rate": 1.8251134982782952e-05, "loss": 0.2173, "reward": 0.5859375, "reward_std": 0.2303304560482502, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.421875, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 572.5, "epoch": 0.2724, "grad_norm": 2.065668217987566, "kl": 5.7109375, "learning_rate": 1.824323851484126e-05, "loss": 0.2284, "reward": 0.5390625, "reward_std": 0.18400633335113525, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 541.125, "epoch": 0.2728, "grad_norm": 0.7554322920948381, "kl": 4.78515625, "learning_rate": 1.8235325976284276e-05, "loss": 0.1916, "reward": 0.556640625, "reward_std": 0.17606019973754883, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.556640625, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 603.375, "epoch": 0.2732, "grad_norm": 0.5033297131867626, "kl": 5.3671875, "learning_rate": 1.82273973825379e-05, "loss": 0.2149, "reward": 0.564453125, "reward_std": 0.19499235227704048, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564453125, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 837.5, "epoch": 0.2736, "grad_norm": 0.35499078237047277, "kl": 4.7265625, "learning_rate": 1.8219452749059332e-05, "loss": 0.189, "reward": 0.47265625, "reward_std": 0.1255389992147684, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 871.625, "epoch": 0.274, "grad_norm": 0.2625734824773582, "kl": 3.02734375, "learning_rate": 1.821149209133704e-05, "loss": 0.121, "reward": 0.591796875, "reward_std": 0.08942672982811928, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 959.25, "epoch": 0.2744, "grad_norm": 0.4207801361881935, "kl": 2.765625, "learning_rate": 1.8203515424890738e-05, "loss": 0.1107, "reward": 0.48046875, "reward_std": 0.08384781330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 927.875, "epoch": 0.2748, "grad_norm": 1.734557120801568, "kl": 2.9765625, "learning_rate": 1.819552276527134e-05, "loss": 0.1193, "reward": 0.61328125, "reward_std": 0.05259781330823898, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 942.125, "epoch": 0.2752, "grad_norm": 1.987780562252656, "kl": 1.884765625, "learning_rate": 1.8187514128060946e-05, "loss": 0.0754, "reward": 0.5, "reward_std": 0.0726165771484375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 874.625, "epoch": 0.2756, "grad_norm": 10.117058531893516, "kl": 2.140625, "learning_rate": 1.8179489528872808e-05, "loss": 0.0855, "reward": 0.490234375, "reward_std": 0.10122986882925034, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 834.78125, "epoch": 0.276, "grad_norm": 0.5762401690757674, "kl": 2.8671875, "learning_rate": 1.8171448983351284e-05, "loss": 0.1096, "reward": 0.65625, "reward_std": 0.2092609480023384, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 809.75, "epoch": 0.2764, "grad_norm": 2.437422112600217, "kl": 3.140625, "learning_rate": 1.816339250717184e-05, "loss": 0.1256, "reward": 0.64453125, "reward_std": 0.07740516494959593, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 776.25, "epoch": 0.2768, "grad_norm": 1.7728068484050594, "kl": 3.2890625, "learning_rate": 1.8155320116040983e-05, "loss": 0.1315, "reward": 0.61328125, "reward_std": 0.06623280607163906, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 475.75, "epoch": 0.2772, "grad_norm": 8.158279111696201, "kl": 5.1796875, "learning_rate": 1.814723182569625e-05, "loss": 0.2075, "reward": 0.517578125, "reward_std": 0.1850089244544506, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.517578125, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 709.0, "epoch": 0.2776, "grad_norm": 5.56882484611759, "kl": 4.546875, "learning_rate": 1.8139127651906183e-05, "loss": 0.182, "reward": 0.333984375, "reward_std": 0.1458146944642067, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.333984375, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 961.875, "epoch": 0.278, "grad_norm": 28.415365923382325, "kl": 4.67578125, "learning_rate": 1.8131007610470278e-05, "loss": 0.187, "reward": 0.43359375, "reward_std": 0.1696704812347889, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.30859375, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 831.96875, "epoch": 0.2784, "grad_norm": 24.487652239810554, "kl": 3.796875, "learning_rate": 1.812287171721897e-05, "loss": 0.1449, "reward": 0.24609375, "reward_std": 0.2049945555627346, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.22265625, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2788, "grad_norm": 13.302242722820026, "kl": 4.421875, "learning_rate": 1.8114719988013612e-05, "loss": 0.1774, "reward": 0.193359375, "reward_std": 0.1532646119594574, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.193359375, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2792, "grad_norm": 99.69383078985645, "kl": 15.171875, "learning_rate": 1.81065524387464e-05, "loss": 0.6038, "reward": 0.171875, "reward_std": 0.1502489298582077, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.171875, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 919.875, "epoch": 0.2796, "grad_norm": 100.67820187853759, "kl": 22.1640625, "learning_rate": 1.80983690853404e-05, "loss": 0.8845, "reward": 0.18359375, "reward_std": 0.1502643134444952, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.17578125, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 1019.2734375, "epoch": 0.28, "grad_norm": 8.466679674560622, "kl": 6.0, "learning_rate": 1.8090169943749477e-05, "loss": 0.2366, "reward": 0.123046875, "reward_std": 0.1389829758554697, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.123046875, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 533.375, "epoch": 0.2804, "grad_norm": 4.11889314025289, "kl": 6.359375, "learning_rate": 1.8081955029958272e-05, "loss": 0.254, "reward": 0.279296875, "reward_std": 0.10607871226966381, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.279296875, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 511.0, "epoch": 0.2808, "grad_norm": 4.2634445479486365, "kl": 6.1796875, "learning_rate": 1.8073724359982184e-05, "loss": 0.2468, "reward": 0.25, "reward_std": 0.11318767257034779, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 737.125, "epoch": 0.2812, "grad_norm": 75.48345196738245, "kl": 13.84375, "learning_rate": 1.8065477949867327e-05, "loss": 0.5531, "reward": 0.255859375, "reward_std": 0.1148808654397726, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.255859375, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 963.375, "epoch": 0.2816, "grad_norm": 108.26607004286788, "kl": 10.578125, "learning_rate": 1.8057215815690494e-05, "loss": 0.4235, "reward": 0.20703125, "reward_std": 0.1530219353735447, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.19140625, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 943.25, "epoch": 0.282, "grad_norm": 3.1570872694122882, "kl": 5.8203125, "learning_rate": 1.804893797355914e-05, "loss": 0.2326, "reward": 0.1953125, "reward_std": 0.11001058109104633, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1953125, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 883.375, "epoch": 0.2824, "grad_norm": 28.99076780035745, "kl": 8.2890625, "learning_rate": 1.8040644439611348e-05, "loss": 0.3313, "reward": 0.345703125, "reward_std": 0.09105645306408405, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.220703125, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 822.875, "epoch": 0.2828, "grad_norm": 9.199855831406007, "kl": 7.53125, "learning_rate": 1.803233523001578e-05, "loss": 0.3012, "reward": 0.2578125, "reward_std": 0.09671953693032265, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 417.25, "epoch": 0.2832, "grad_norm": 2.348565203707098, "kl": 5.46875, "learning_rate": 1.802401036097167e-05, "loss": 0.219, "reward": 0.29296875, "reward_std": 0.10581010580062866, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.27734375, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 372.875, "epoch": 0.2836, "grad_norm": 0.5621730870129552, "kl": 5.3359375, "learning_rate": 1.8015669848708768e-05, "loss": 0.2132, "reward": 0.533203125, "reward_std": 0.10368834808468819, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.283203125, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 494.75, "epoch": 0.284, "grad_norm": 0.9369224813761824, "kl": 5.046875, "learning_rate": 1.8007313709487334e-05, "loss": 0.2017, "reward": 0.43359375, "reward_std": 0.18870560452342033, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 614.5, "epoch": 0.2844, "grad_norm": 8.494660373534584, "kl": 4.7578125, "learning_rate": 1.7998941959598097e-05, "loss": 0.1905, "reward": 0.4375, "reward_std": 0.16863296553492546, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 689.875, "epoch": 0.2848, "grad_norm": 0.4032565801483084, "kl": 3.38671875, "learning_rate": 1.79905546153622e-05, "loss": 0.1356, "reward": 0.478515625, "reward_std": 0.16093482449650764, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 781.375, "epoch": 0.2852, "grad_norm": 2.5495959118752536, "kl": 4.0546875, "learning_rate": 1.7982151693131206e-05, "loss": 0.1625, "reward": 0.755859375, "reward_std": 0.1781025007367134, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.505859375, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 871.875, "epoch": 0.2856, "grad_norm": 2.270124322741176, "kl": 3.30859375, "learning_rate": 1.7973733209287036e-05, "loss": 0.1322, "reward": 0.431640625, "reward_std": 0.18774127960205078, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.431640625, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 893.0, "epoch": 0.286, "grad_norm": 2.672165903712165, "kl": 3.13671875, "learning_rate": 1.7965299180241963e-05, "loss": 0.1253, "reward": 0.572265625, "reward_std": 0.19069704040884972, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.447265625, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 968.625, "epoch": 0.2864, "grad_norm": 0.9873374411025954, "kl": 3.04296875, "learning_rate": 1.7956849622438554e-05, "loss": 0.1215, "reward": 0.53515625, "reward_std": 0.21302483975887299, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.52734375, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 947.75, "epoch": 0.2868, "grad_norm": 9.459479853246064, "kl": 3.5, "learning_rate": 1.794838455234966e-05, "loss": 0.1399, "reward": 0.5390625, "reward_std": 0.19581802934408188, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 960.875, "epoch": 0.2872, "grad_norm": 214.7506953960623, "kl": 16.828125, "learning_rate": 1.7939903986478354e-05, "loss": 0.6732, "reward": 0.685546875, "reward_std": 0.20451579615473747, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.560546875, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 825.75, "epoch": 0.2876, "grad_norm": 11.918792615252523, "kl": 3.984375, "learning_rate": 1.793140794135795e-05, "loss": 0.1595, "reward": 0.65625, "reward_std": 0.2544405274093151, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 853.0, "epoch": 0.288, "grad_norm": 373.29096180102874, "kl": 53.8125, "learning_rate": 1.792289643355191e-05, "loss": 2.1457, "reward": 0.51171875, "reward_std": 0.2012772522866726, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51171875, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 875.25, "epoch": 0.2884, "grad_norm": 118.43034393041638, "kl": 12.5546875, "learning_rate": 1.7914369479653858e-05, "loss": 0.5035, "reward": 0.6328125, "reward_std": 0.21789884567260742, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 843.875, "epoch": 0.2888, "grad_norm": 13.361845365831101, "kl": 4.78515625, "learning_rate": 1.7905827096287532e-05, "loss": 0.1911, "reward": 0.45703125, "reward_std": 0.21297640725970268, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 699.25, "epoch": 0.2892, "grad_norm": 7.978373368785838, "kl": 5.42578125, "learning_rate": 1.789726930010674e-05, "loss": 0.2171, "reward": 0.46875, "reward_std": 0.21669849008321762, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 709.25, "epoch": 0.2896, "grad_norm": 419.4816495300095, "kl": 72.3125, "learning_rate": 1.7888696107795343e-05, "loss": 2.8896, "reward": 0.40625, "reward_std": 0.1917015053331852, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40625, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 795.75, "epoch": 0.29, "grad_norm": 179.17671434199937, "kl": 33.0, "learning_rate": 1.788010753606722e-05, "loss": 1.3196, "reward": 0.36328125, "reward_std": 0.18855420500040054, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.36328125, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 771.5, "epoch": 0.2904, "grad_norm": 7.233520082437296, "kl": 5.2109375, "learning_rate": 1.7871503601666233e-05, "loss": 0.2085, "reward": 0.3671875, "reward_std": 0.20065737143158913, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3515625, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 598.625, "epoch": 0.2908, "grad_norm": 8.079630147017062, "kl": 5.2421875, "learning_rate": 1.786288432136619e-05, "loss": 0.2096, "reward": 0.3828125, "reward_std": 0.15464483201503754, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 820.375, "epoch": 0.2912, "grad_norm": 14.601586023866282, "kl": 8.0625, "learning_rate": 1.785424971197082e-05, "loss": 0.3219, "reward": 0.359375, "reward_std": 0.16332368925213814, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.359375, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 883.0, "epoch": 0.2916, "grad_norm": 4.636250088719881, "kl": 7.15625, "learning_rate": 1.7845599790313735e-05, "loss": 0.2864, "reward": 0.3671875, "reward_std": 0.1796620897948742, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3671875, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 612.75, "epoch": 0.292, "grad_norm": 10.879801746118416, "kl": 5.859375, "learning_rate": 1.78369345732584e-05, "loss": 0.2341, "reward": 0.390625, "reward_std": 0.15659838542342186, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.390625, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 998.0, "epoch": 0.2924, "grad_norm": 9.471477955160575, "kl": 7.328125, "learning_rate": 1.78282540776981e-05, "loss": 0.2931, "reward": 0.3515625, "reward_std": 0.16151371225714684, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3515625, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2928, "grad_norm": 179.0967121782571, "kl": 39.25, "learning_rate": 1.7819558320555902e-05, "loss": 1.5688, "reward": 0.42578125, "reward_std": 0.17988409847021103, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.29296875, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 1016.6171875, "epoch": 0.2932, "grad_norm": 146.0948662259994, "kl": 35.15625, "learning_rate": 1.7810847318784632e-05, "loss": 1.4113, "reward": 0.43359375, "reward_std": 0.14969342574477196, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.30859375, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 1019.9296875, "epoch": 0.2936, "grad_norm": 87.91269266312906, "kl": 11.4375, "learning_rate": 1.780212108936684e-05, "loss": 0.4561, "reward": 0.27734375, "reward_std": 0.15933694690465927, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.27734375, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.294, "grad_norm": 8.402580551645539, "kl": 8.203125, "learning_rate": 1.7793379649314743e-05, "loss": 0.3283, "reward": 0.291015625, "reward_std": 0.16280678659677505, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.291015625, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 811.0, "epoch": 0.2944, "grad_norm": 13.513653980328353, "kl": 5.953125, "learning_rate": 1.7784623015670237e-05, "loss": 0.2383, "reward": 0.333984375, "reward_std": 0.15507853403687477, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.333984375, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 849.875, "epoch": 0.2948, "grad_norm": 18.36221724329242, "kl": 5.5625, "learning_rate": 1.7775851205504823e-05, "loss": 0.2226, "reward": 0.3359375, "reward_std": 0.14516658335924149, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3359375, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 831.125, "epoch": 0.2952, "grad_norm": 16.887470109336128, "kl": 6.7421875, "learning_rate": 1.7767064235919594e-05, "loss": 0.2699, "reward": 0.3671875, "reward_std": 0.13402020186185837, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3671875, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2956, "grad_norm": 9.03707507494216, "kl": 7.8359375, "learning_rate": 1.7758262124045195e-05, "loss": 0.3135, "reward": 0.623046875, "reward_std": 0.1484675519168377, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.373046875, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.296, "grad_norm": 26.76345673334224, "kl": 12.046875, "learning_rate": 1.7749444887041797e-05, "loss": 0.4821, "reward": 0.412109375, "reward_std": 0.24436797201633453, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.341796875, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2964, "grad_norm": 9.383049229392425, "kl": 6.375, "learning_rate": 1.7740612542099054e-05, "loss": 0.2551, "reward": 0.388671875, "reward_std": 0.15143867582082748, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.388671875, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2968, "grad_norm": 3.4052331056461513, "kl": 5.8125, "learning_rate": 1.7731765106436073e-05, "loss": 0.232, "reward": 0.35546875, "reward_std": 0.15457339584827423, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.35546875, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2972, "grad_norm": 11.23983245252572, "kl": 6.8671875, "learning_rate": 1.7722902597301385e-05, "loss": 0.2742, "reward": 0.466796875, "reward_std": 0.15230421349406242, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.341796875, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2976, "grad_norm": 4.376299336124743, "kl": 7.359375, "learning_rate": 1.7714025031972904e-05, "loss": 0.2945, "reward": 0.4140625, "reward_std": 0.11988093331456184, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4140625, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.298, "grad_norm": 7.838335646867208, "kl": 6.265625, "learning_rate": 1.7705132427757895e-05, "loss": 0.251, "reward": 0.43359375, "reward_std": 0.11675624549388885, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2984, "grad_norm": 7.966518178143308, "kl": 6.96875, "learning_rate": 1.7696224801992947e-05, "loss": 0.2788, "reward": 0.416015625, "reward_std": 0.1252806019037962, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.416015625, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2988, "grad_norm": 2.116845225989324, "kl": 5.421875, "learning_rate": 1.7687302172043933e-05, "loss": 0.217, "reward": 0.392578125, "reward_std": 0.12253302149474621, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392578125, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2992, "grad_norm": 7.958722085515506, "kl": 5.2421875, "learning_rate": 1.767836455530598e-05, "loss": 0.2096, "reward": 0.392578125, "reward_std": 0.12820490077137947, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392578125, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.2996, "grad_norm": 18.481793497671184, "kl": 5.7578125, "learning_rate": 1.7669411969203417e-05, "loss": 0.2302, "reward": 0.435546875, "reward_std": 0.13609936088323593, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.435546875, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3, "grad_norm": 0.9086488802528834, "kl": 4.78125, "learning_rate": 1.766044443118978e-05, "loss": 0.1912, "reward": 0.37890625, "reward_std": 0.12503610365092754, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.37890625, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3004, "grad_norm": 2.877397410021712, "kl": 3.3828125, "learning_rate": 1.7651461958747745e-05, "loss": 0.1353, "reward": 0.587890625, "reward_std": 0.13348994217813015, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.337890625, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3008, "grad_norm": 0.2675909051610945, "kl": 3.9921875, "learning_rate": 1.764246456938909e-05, "loss": 0.16, "reward": 0.375, "reward_std": 0.12197988480329514, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3012, "grad_norm": 0.4117556239766885, "kl": 3.24609375, "learning_rate": 1.76334522806547e-05, "loss": 0.1302, "reward": 0.466796875, "reward_std": 0.14042013138532639, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.333984375, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3016, "grad_norm": 1.067072770813668, "kl": 5.2890625, "learning_rate": 1.762442511011448e-05, "loss": 0.2113, "reward": 0.40625, "reward_std": 0.1186870951205492, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40625, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.302, "grad_norm": 0.1865528917397511, "kl": 5.2734375, "learning_rate": 1.761538307536737e-05, "loss": 0.2111, "reward": 0.52734375, "reward_std": 0.11608045361936092, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40234375, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3024, "grad_norm": 0.14197063355493686, "kl": 5.2421875, "learning_rate": 1.7606326194041274e-05, "loss": 0.2094, "reward": 0.544921875, "reward_std": 0.11911628395318985, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.419921875, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3028, "grad_norm": 0.3109525165087628, "kl": 5.796875, "learning_rate": 1.759725448379305e-05, "loss": 0.232, "reward": 0.427734375, "reward_std": 0.10506740212440491, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.427734375, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3032, "grad_norm": 1.302237203326983, "kl": 6.59375, "learning_rate": 1.7588167962308458e-05, "loss": 0.2637, "reward": 0.455078125, "reward_std": 0.10177040286362171, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455078125, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3036, "grad_norm": 0.2456405096029007, "kl": 5.5703125, "learning_rate": 1.7579066647302134e-05, "loss": 0.2226, "reward": 0.42578125, "reward_std": 0.10485793463885784, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.42578125, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.304, "grad_norm": 0.26326341566227746, "kl": 5.875, "learning_rate": 1.7569950556517566e-05, "loss": 0.235, "reward": 0.419921875, "reward_std": 0.12119310721755028, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.419921875, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 811.75, "epoch": 0.3044, "grad_norm": 1.1955531149082597, "kl": 6.4765625, "learning_rate": 1.7560819707727034e-05, "loss": 0.259, "reward": 0.4609375, "reward_std": 0.09831409342586994, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 910.375, "epoch": 0.3048, "grad_norm": 3.071714044394505, "kl": 7.2109375, "learning_rate": 1.7551674118731592e-05, "loss": 0.2885, "reward": 0.4765625, "reward_std": 0.09547309204936028, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3052, "grad_norm": 3.0402227403629922, "kl": 0.5517578125, "learning_rate": 1.754251380736104e-05, "loss": 0.022, "reward": 0.470703125, "reward_std": 0.15580294281244278, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470703125, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3056, "grad_norm": 0.6294738280749103, "kl": 0.163818359375, "learning_rate": 1.7533338791473872e-05, "loss": 0.0066, "reward": 0.466796875, "reward_std": 0.12620963528752327, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.341796875, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.306, "grad_norm": 0.5016808888964971, "kl": 0.1634521484375, "learning_rate": 1.7524149088957244e-05, "loss": 0.0065, "reward": 0.322265625, "reward_std": 0.11701854504644871, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.322265625, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3064, "grad_norm": 0.18247799692418926, "kl": 0.09765625, "learning_rate": 1.7514944717726962e-05, "loss": 0.0039, "reward": 0.400390625, "reward_std": 0.12418591044843197, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.400390625, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3068, "grad_norm": 0.11754897872150051, "kl": 0.1014404296875, "learning_rate": 1.7505725695727414e-05, "loss": 0.0041, "reward": 0.677734375, "reward_std": 0.11700233817100525, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.427734375, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3072, "grad_norm": 0.1718130475138294, "kl": 0.111572265625, "learning_rate": 1.749649204093155e-05, "loss": 0.0045, "reward": 0.548828125, "reward_std": 0.13098826445639133, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.423828125, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3076, "grad_norm": 0.1459081852691173, "kl": 0.1033935546875, "learning_rate": 1.7487243771340862e-05, "loss": 0.0041, "reward": 0.380859375, "reward_std": 0.16410095617175102, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380859375, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.308, "grad_norm": 0.27295731710202875, "kl": 0.109375, "learning_rate": 1.747798090498532e-05, "loss": 0.0044, "reward": 0.462890625, "reward_std": 0.1649511456489563, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462890625, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3084, "grad_norm": 55.38381302713849, "kl": 8.5816650390625, "learning_rate": 1.746870345992336e-05, "loss": 0.3441, "reward": 0.482421875, "reward_std": 0.1679735891520977, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3088, "grad_norm": 30.108175855465518, "kl": 4.596923828125, "learning_rate": 1.7459411454241822e-05, "loss": 0.185, "reward": 0.54296875, "reward_std": 0.15372930839657784, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41796875, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3092, "grad_norm": 30.26766729162282, "kl": 4.83984375, "learning_rate": 1.7450104906055963e-05, "loss": 0.1938, "reward": 0.443359375, "reward_std": 0.11680441349744797, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.318359375, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 570.875, "epoch": 0.3096, "grad_norm": 12.155106965244666, "kl": 3.6953125, "learning_rate": 1.7440783833509366e-05, "loss": 0.148, "reward": 0.462890625, "reward_std": 0.1310111116617918, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.337890625, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 220.0, "epoch": 0.31, "grad_norm": 0.4900070289479255, "kl": 6.0390625, "learning_rate": 1.7431448254773943e-05, "loss": 0.2412, "reward": 0.494140625, "reward_std": 0.01848640665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 376.625, "epoch": 0.3104, "grad_norm": 0.3064513663279705, "kl": 6.4296875, "learning_rate": 1.7422098188049885e-05, "loss": 0.2568, "reward": 0.73828125, "reward_std": 0.04043455049395561, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 249.375, "epoch": 0.3108, "grad_norm": 0.5173747511730586, "kl": 6.421875, "learning_rate": 1.741273365156561e-05, "loss": 0.2565, "reward": 0.623046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 261.5, "epoch": 0.3112, "grad_norm": 0.11137666820683129, "kl": 5.734375, "learning_rate": 1.7403354663577782e-05, "loss": 0.2294, "reward": 0.62109375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 177.5, "epoch": 0.3116, "grad_norm": 0.24756302555201065, "kl": 5.875, "learning_rate": 1.7393961242371203e-05, "loss": 0.235, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 505.75, "epoch": 0.312, "grad_norm": 0.0687073713542933, "kl": 6.3125, "learning_rate": 1.7384553406258842e-05, "loss": 0.2524, "reward": 0.494140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 196.625, "epoch": 0.3124, "grad_norm": 0.13400218965228836, "kl": 5.5, "learning_rate": 1.737513117358174e-05, "loss": 0.2196, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 618.5, "epoch": 0.3128, "grad_norm": 0.12311125373907034, "kl": 6.140625, "learning_rate": 1.7365694562709034e-05, "loss": 0.2461, "reward": 0.4921875, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 421.875, "epoch": 0.3132, "grad_norm": 0.10847139345860596, "kl": 6.0703125, "learning_rate": 1.7356243592037876e-05, "loss": 0.2426, "reward": 0.4921875, "reward_std": 0.024809550493955612, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 239.125, "epoch": 0.3136, "grad_norm": 0.09795373820798756, "kl": 5.6484375, "learning_rate": 1.7346778279993417e-05, "loss": 0.2259, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 407.875, "epoch": 0.314, "grad_norm": 6.598800460164906, "kl": 5.6484375, "learning_rate": 1.7337298645028764e-05, "loss": 0.226, "reward": 0.490234375, "reward_std": 0.03411140665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 508.375, "epoch": 0.3144, "grad_norm": 1.6129245248345436, "kl": 5.8046875, "learning_rate": 1.732780470562496e-05, "loss": 0.232, "reward": 0.615234375, "reward_std": 0.028222277760505676, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 635.75, "epoch": 0.3148, "grad_norm": 0.16664793841419737, "kl": 6.0078125, "learning_rate": 1.7318296480290912e-05, "loss": 0.2402, "reward": 0.607421875, "reward_std": 0.05452118441462517, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 612.375, "epoch": 0.3152, "grad_norm": 8.81434703042956, "kl": 5.984375, "learning_rate": 1.7308773987563406e-05, "loss": 0.2391, "reward": 0.484375, "reward_std": 0.04175759106874466, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3156, "grad_norm": 17.854282369365375, "kl": 9.765625, "learning_rate": 1.7299237246007018e-05, "loss": 0.3903, "reward": 0.4609375, "reward_std": 0.08846627548336983, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 930.375, "epoch": 0.316, "grad_norm": 0.9853107939920241, "kl": 6.265625, "learning_rate": 1.7289686274214116e-05, "loss": 0.2503, "reward": 0.580078125, "reward_std": 0.09755494073033333, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455078125, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 933.25, "epoch": 0.3164, "grad_norm": 5.361408670055174, "kl": 6.4453125, "learning_rate": 1.7280121090804813e-05, "loss": 0.2578, "reward": 0.591796875, "reward_std": 0.0892931018024683, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 926.125, "epoch": 0.3168, "grad_norm": 1.0348283815876473, "kl": 6.0703125, "learning_rate": 1.727054171442692e-05, "loss": 0.2431, "reward": 0.5859375, "reward_std": 0.08982179313898087, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3172, "grad_norm": 2.396905883178101, "kl": 6.234375, "learning_rate": 1.7260948163755918e-05, "loss": 0.2494, "reward": 0.43359375, "reward_std": 0.11982019431889057, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 761.875, "epoch": 0.3176, "grad_norm": 1.5290357537624357, "kl": 6.5234375, "learning_rate": 1.7251340457494934e-05, "loss": 0.261, "reward": 0.58984375, "reward_std": 0.06825299002230167, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 844.5, "epoch": 0.318, "grad_norm": 0.5771768666438043, "kl": 6.46875, "learning_rate": 1.7241718614374678e-05, "loss": 0.2588, "reward": 0.458984375, "reward_std": 0.09574373997747898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.458984375, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 912.625, "epoch": 0.3184, "grad_norm": 0.21424460616866722, "kl": 6.1328125, "learning_rate": 1.7232082653153422e-05, "loss": 0.2456, "reward": 0.453125, "reward_std": 0.09116422012448311, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.453125, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 708.125, "epoch": 0.3188, "grad_norm": 0.12788639906728008, "kl": 5.65625, "learning_rate": 1.722243259261697e-05, "loss": 0.2261, "reward": 0.47265625, "reward_std": 0.058557212352752686, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3192, "grad_norm": 0.1837151951425689, "kl": 5.6796875, "learning_rate": 1.721276845157861e-05, "loss": 0.2274, "reward": 0.431640625, "reward_std": 0.11596748605370522, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.431640625, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 817.0, "epoch": 0.3196, "grad_norm": 0.21924458309716827, "kl": 5.65625, "learning_rate": 1.720309024887907e-05, "loss": 0.2265, "reward": 0.431640625, "reward_std": 0.09748402237892151, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.431640625, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 921.25, "epoch": 0.32, "grad_norm": 0.22332867265513748, "kl": 6.5625, "learning_rate": 1.7193398003386514e-05, "loss": 0.2621, "reward": 0.4609375, "reward_std": 0.08516712486743927, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3204, "grad_norm": 0.6446533407802679, "kl": 6.6953125, "learning_rate": 1.7183691733996463e-05, "loss": 0.2673, "reward": 0.45703125, "reward_std": 0.12657073140144348, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44921875, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 818.125, "epoch": 0.3208, "grad_norm": 0.31739577904241534, "kl": 6.0625, "learning_rate": 1.717397145963179e-05, "loss": 0.2425, "reward": 0.587890625, "reward_std": 0.08093303442001343, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462890625, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 926.25, "epoch": 0.3212, "grad_norm": 0.3278176618813345, "kl": 5.90625, "learning_rate": 1.716423719924266e-05, "loss": 0.2357, "reward": 0.44921875, "reward_std": 0.09869000315666199, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44921875, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 817.125, "epoch": 0.3216, "grad_norm": 0.8817821327402601, "kl": 6.3125, "learning_rate": 1.715448897180652e-05, "loss": 0.2519, "reward": 0.4453125, "reward_std": 0.10484878718852997, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4453125, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.322, "grad_norm": 3.2153152276589063, "kl": 8.25, "learning_rate": 1.7144726796328034e-05, "loss": 0.3297, "reward": 0.466796875, "reward_std": 0.0968216098845005, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3224, "grad_norm": 0.4631002681492446, "kl": 6.671875, "learning_rate": 1.7134950691839063e-05, "loss": 0.267, "reward": 0.451171875, "reward_std": 0.1323176547884941, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.451171875, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 590.5, "epoch": 0.3228, "grad_norm": 0.1603167894448104, "kl": 6.5078125, "learning_rate": 1.7125160677398625e-05, "loss": 0.2602, "reward": 0.48828125, "reward_std": 0.03697281330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 586.625, "epoch": 0.3232, "grad_norm": 0.09731298712538364, "kl": 7.234375, "learning_rate": 1.7115356772092858e-05, "loss": 0.2896, "reward": 0.61328125, "reward_std": 0.03697281330823898, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 701.875, "epoch": 0.3236, "grad_norm": 0.20513973546875616, "kl": 6.3515625, "learning_rate": 1.710553899503496e-05, "loss": 0.2536, "reward": 0.611328125, "reward_std": 0.043847277760505676, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 496.125, "epoch": 0.324, "grad_norm": 0.35840705046564836, "kl": 6.828125, "learning_rate": 1.709570736536521e-05, "loss": 0.2732, "reward": 0.490234375, "reward_std": 0.03262205049395561, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 590.125, "epoch": 0.3244, "grad_norm": 0.221357959065914, "kl": 6.796875, "learning_rate": 1.7085861902250864e-05, "loss": 0.2723, "reward": 0.490234375, "reward_std": 0.03411140665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 702.5, "epoch": 0.3248, "grad_norm": 12.754296399272315, "kl": 6.4140625, "learning_rate": 1.7076002624886156e-05, "loss": 0.2566, "reward": 0.4765625, "reward_std": 0.057711176574230194, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 615.75, "epoch": 0.3252, "grad_norm": 30.379878422473304, "kl": 6.46875, "learning_rate": 1.706612955249225e-05, "loss": 0.2587, "reward": 0.603515625, "reward_std": 0.05287160910665989, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 743.375, "epoch": 0.3256, "grad_norm": 2.3138688211460874, "kl": 6.4765625, "learning_rate": 1.705624270431721e-05, "loss": 0.2592, "reward": 0.599609375, "reward_std": 0.05918231979012489, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 920.75, "epoch": 0.326, "grad_norm": 0.28597371699790297, "kl": 6.71875, "learning_rate": 1.7046342099635948e-05, "loss": 0.2691, "reward": 0.466796875, "reward_std": 0.08246497623622417, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 823.625, "epoch": 0.3264, "grad_norm": 0.49020290898055957, "kl": 6.5703125, "learning_rate": 1.7036427757750205e-05, "loss": 0.263, "reward": 0.4765625, "reward_std": 0.08837999403476715, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 918.875, "epoch": 0.3268, "grad_norm": 0.16756716122448556, "kl": 6.3359375, "learning_rate": 1.7026499697988496e-05, "loss": 0.2535, "reward": 0.43359375, "reward_std": 0.10925769992172718, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 707.375, "epoch": 0.3272, "grad_norm": 0.19452576181702588, "kl": 6.703125, "learning_rate": 1.7016557939706075e-05, "loss": 0.2686, "reward": 0.46875, "reward_std": 0.07635262608528137, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 925.75, "epoch": 0.3276, "grad_norm": 1.6194644545300991, "kl": 6.171875, "learning_rate": 1.700660250228492e-05, "loss": 0.2473, "reward": 0.4453125, "reward_std": 0.09081817977130413, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4453125, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.328, "grad_norm": 0.2402182377616252, "kl": 7.0546875, "learning_rate": 1.6996633405133656e-05, "loss": 0.2825, "reward": 0.46484375, "reward_std": 0.0980534553527832, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 592.5, "epoch": 0.3284, "grad_norm": 0.1590253520196885, "kl": 6.7578125, "learning_rate": 1.6986650667687552e-05, "loss": 0.2697, "reward": 0.48046875, "reward_std": 0.06524410098791122, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 586.875, "epoch": 0.3288, "grad_norm": 0.20537743095765587, "kl": 6.046875, "learning_rate": 1.6976654309408464e-05, "loss": 0.2415, "reward": 0.484375, "reward_std": 0.04521932825446129, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 704.375, "epoch": 0.3292, "grad_norm": 0.10652240701975746, "kl": 6.8984375, "learning_rate": 1.696664434978481e-05, "loss": 0.276, "reward": 0.48046875, "reward_std": 0.06822281330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 298.75, "epoch": 0.3296, "grad_norm": 0.09021688722946496, "kl": 6.0078125, "learning_rate": 1.695662080833151e-05, "loss": 0.2402, "reward": 0.498046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 277.75, "epoch": 0.33, "grad_norm": 0.26138767864031137, "kl": 5.8203125, "learning_rate": 1.6946583704589973e-05, "loss": 0.233, "reward": 0.490234375, "reward_std": 0.029160313308238983, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 427.75, "epoch": 0.3304, "grad_norm": 0.062189487373279374, "kl": 5.8515625, "learning_rate": 1.693653305812805e-05, "loss": 0.2343, "reward": 0.494140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 613.625, "epoch": 0.3308, "grad_norm": 0.09930628159312561, "kl": 6.640625, "learning_rate": 1.6926468888539988e-05, "loss": 0.266, "reward": 0.61328125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 273.625, "epoch": 0.3312, "grad_norm": 0.3478532386886582, "kl": 6.3671875, "learning_rate": 1.6916391215446403e-05, "loss": 0.2546, "reward": 0.49609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 276.25, "epoch": 0.3316, "grad_norm": 0.3454268552696317, "kl": 5.4921875, "learning_rate": 1.690630005849423e-05, "loss": 0.2198, "reward": 0.49609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 522.375, "epoch": 0.332, "grad_norm": 0.09650750634963633, "kl": 5.984375, "learning_rate": 1.68961954373567e-05, "loss": 0.2394, "reward": 0.486328125, "reward_std": 0.04478531330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 501.75, "epoch": 0.3324, "grad_norm": 0.11887266388145447, "kl": 6.1875, "learning_rate": 1.6886077371733285e-05, "loss": 0.2478, "reward": 0.48828125, "reward_std": 0.04043455049395561, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 509.75, "epoch": 0.3328, "grad_norm": 0.08879625266474925, "kl": 5.6640625, "learning_rate": 1.6875945881349676e-05, "loss": 0.2268, "reward": 0.60546875, "reward_std": 0.050004106014966965, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 334.125, "epoch": 0.3332, "grad_norm": 0.14882486573236675, "kl": 5.6015625, "learning_rate": 1.686580098595773e-05, "loss": 0.2241, "reward": 0.49609375, "reward_std": 0.010673906654119492, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 338.875, "epoch": 0.3336, "grad_norm": 0.08261042707838138, "kl": 5.9375, "learning_rate": 1.6855642705335438e-05, "loss": 0.2375, "reward": 0.49609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 335.75, "epoch": 0.334, "grad_norm": 0.07616352172977574, "kl": 4.6796875, "learning_rate": 1.684547105928689e-05, "loss": 0.187, "reward": 0.74609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 475.75, "epoch": 0.3344, "grad_norm": 0.43500232646915243, "kl": 5.359375, "learning_rate": 1.6835286067642228e-05, "loss": 0.2144, "reward": 0.484375, "reward_std": 0.05605955049395561, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 524.625, "epoch": 0.3348, "grad_norm": 0.10597805212986211, "kl": 5.6953125, "learning_rate": 1.6825087750257617e-05, "loss": 0.2281, "reward": 0.490234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 434.75, "epoch": 0.3352, "grad_norm": 0.29971975979951426, "kl": 5.5546875, "learning_rate": 1.68148761270152e-05, "loss": 0.2221, "reward": 0.490234375, "reward_std": 0.03262205049395561, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 550.375, "epoch": 0.3356, "grad_norm": 0.07158082699302826, "kl": 5.9765625, "learning_rate": 1.6804651217823055e-05, "loss": 0.2391, "reward": 0.486328125, "reward_std": 0.04478531330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 735.0, "epoch": 0.336, "grad_norm": 84.02790800625124, "kl": 6.0078125, "learning_rate": 1.6794413042615168e-05, "loss": 0.2409, "reward": 0.470703125, "reward_std": 0.1008448638021946, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470703125, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 832.0, "epoch": 0.3364, "grad_norm": 0.21300010818570805, "kl": 6.484375, "learning_rate": 1.6784161621351384e-05, "loss": 0.2588, "reward": 0.576171875, "reward_std": 0.11784898489713669, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.451171875, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 831.625, "epoch": 0.3368, "grad_norm": 0.09303706777153298, "kl": 6.4375, "learning_rate": 1.6773896974017373e-05, "loss": 0.2574, "reward": 0.46484375, "reward_std": 0.09640797972679138, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 843.375, "epoch": 0.3372, "grad_norm": 16.292298519805442, "kl": 5.71875, "learning_rate": 1.6763619120624595e-05, "loss": 0.2288, "reward": 0.7109375, "reward_std": 0.10031019896268845, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3376, "grad_norm": 9.723220500088514, "kl": 4.5, "learning_rate": 1.6753328081210244e-05, "loss": 0.1803, "reward": 0.421875, "reward_std": 0.13849112205207348, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.421875, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.338, "grad_norm": 56.50761086594937, "kl": 6.203125, "learning_rate": 1.6743023875837233e-05, "loss": 0.2485, "reward": 0.45703125, "reward_std": 0.10534250177443027, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3384, "grad_norm": 64.35611780246136, "kl": 3.97265625, "learning_rate": 1.6732706524594138e-05, "loss": 0.1586, "reward": 0.443359375, "reward_std": 0.12870439141988754, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443359375, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3388, "grad_norm": 28.722299933210902, "kl": 1.00732421875, "learning_rate": 1.6722376047595163e-05, "loss": 0.0403, "reward": 0.576171875, "reward_std": 0.11882846057415009, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.451171875, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3392, "grad_norm": 29.222641621131785, "kl": 5.328125, "learning_rate": 1.6712032464980094e-05, "loss": 0.2133, "reward": 0.45703125, "reward_std": 0.10998464561998844, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 742.0, "epoch": 0.3396, "grad_norm": 0.11434822161862093, "kl": 6.25, "learning_rate": 1.6701675796914284e-05, "loss": 0.2498, "reward": 0.484375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 740.25, "epoch": 0.34, "grad_norm": 0.17860422527953146, "kl": 6.0234375, "learning_rate": 1.6691306063588583e-05, "loss": 0.241, "reward": 0.4609375, "reward_std": 0.09082392230629921, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 656.25, "epoch": 0.3404, "grad_norm": 0.49593867531817054, "kl": 6.0390625, "learning_rate": 1.668092328521932e-05, "loss": 0.2417, "reward": 0.603515625, "reward_std": 0.0695948638021946, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 676.125, "epoch": 0.3408, "grad_norm": 0.8711451010657114, "kl": 5.609375, "learning_rate": 1.6670527482048246e-05, "loss": 0.2244, "reward": 0.484375, "reward_std": 0.05259781330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 361.375, "epoch": 0.3412, "grad_norm": 0.07348021090394, "kl": 5.3125, "learning_rate": 1.666011867434252e-05, "loss": 0.2125, "reward": 0.494140625, "reward_std": 0.016997050493955612, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 740.25, "epoch": 0.3416, "grad_norm": 0.1623734806929879, "kl": 6.15625, "learning_rate": 1.6649696882394635e-05, "loss": 0.2462, "reward": 0.48046875, "reward_std": 0.06822281330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 461.375, "epoch": 0.342, "grad_norm": 0.07852978272062712, "kl": 5.46875, "learning_rate": 1.6639262126522417e-05, "loss": 0.2185, "reward": 0.494140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 742.25, "epoch": 0.3424, "grad_norm": 0.12520819530212102, "kl": 6.1875, "learning_rate": 1.6628814427068954e-05, "loss": 0.2473, "reward": 0.6015625, "reward_std": 0.07410174608230591, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 775.25, "epoch": 0.3428, "grad_norm": 7.548245435330551, "kl": 6.15625, "learning_rate": 1.6618353804402567e-05, "loss": 0.2465, "reward": 0.478515625, "reward_std": 0.06865682825446129, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 673.875, "epoch": 0.3432, "grad_norm": 5.960308407365638, "kl": 7.75, "learning_rate": 1.6607880278916778e-05, "loss": 0.3101, "reward": 0.48046875, "reward_std": 0.06822281330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 320.5, "epoch": 0.3436, "grad_norm": 0.06894164353799924, "kl": 5.2578125, "learning_rate": 1.6597393871030264e-05, "loss": 0.2108, "reward": 0.498046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 828.25, "epoch": 0.344, "grad_norm": 0.3090460955748707, "kl": 6.109375, "learning_rate": 1.6586894601186804e-05, "loss": 0.2446, "reward": 0.599609375, "reward_std": 0.08224115148186684, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 850.625, "epoch": 0.3444, "grad_norm": 1.8098960185058912, "kl": 6.1328125, "learning_rate": 1.6576382489855274e-05, "loss": 0.2455, "reward": 0.478515625, "reward_std": 0.0745459571480751, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 746.875, "epoch": 0.3448, "grad_norm": 0.10436746187157658, "kl": 5.9453125, "learning_rate": 1.6565857557529567e-05, "loss": 0.2378, "reward": 0.4765625, "reward_std": 0.07003497518599033, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 718.875, "epoch": 0.3452, "grad_norm": 0.13527688551532638, "kl": 6.6328125, "learning_rate": 1.6555319824728577e-05, "loss": 0.2652, "reward": 0.48046875, "reward_std": 0.06084432825446129, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 535.0, "epoch": 0.3456, "grad_norm": 0.1207558824939854, "kl": 5.875, "learning_rate": 1.654476931199615e-05, "loss": 0.2354, "reward": 0.615234375, "reward_std": 0.03262205049395561, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 505.5, "epoch": 0.346, "grad_norm": 0.12752518454302594, "kl": 6.296875, "learning_rate": 1.6534206039901057e-05, "loss": 0.2523, "reward": 0.486328125, "reward_std": 0.03740682825446129, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 407.125, "epoch": 0.3464, "grad_norm": 0.14875775781750733, "kl": 6.6484375, "learning_rate": 1.652363002903693e-05, "loss": 0.2656, "reward": 0.61328125, "reward_std": 0.035483457148075104, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 701.875, "epoch": 0.3468, "grad_norm": 0.8951716484333186, "kl": 6.8515625, "learning_rate": 1.6513041300022253e-05, "loss": 0.2743, "reward": 0.48046875, "reward_std": 0.060293007642030716, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 707.625, "epoch": 0.3472, "grad_norm": 0.0939478697182626, "kl": 6.765625, "learning_rate": 1.650243987350029e-05, "loss": 0.2706, "reward": 0.486328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 916.25, "epoch": 0.3476, "grad_norm": 3.0365162688354483, "kl": 6.6328125, "learning_rate": 1.649182577013906e-05, "loss": 0.2652, "reward": 0.466796875, "reward_std": 0.08122309111058712, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 716.875, "epoch": 0.348, "grad_norm": 0.14169592449378968, "kl": 6.328125, "learning_rate": 1.6481199010631312e-05, "loss": 0.2532, "reward": 0.47265625, "reward_std": 0.06155727431178093, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 921.625, "epoch": 0.3484, "grad_norm": 0.09534144820341903, "kl": 6.671875, "learning_rate": 1.6470559615694445e-05, "loss": 0.2665, "reward": 0.4609375, "reward_std": 0.08867387473583221, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 924.625, "epoch": 0.3488, "grad_norm": 0.131036113417464, "kl": 6.546875, "learning_rate": 1.6459907606070513e-05, "loss": 0.2622, "reward": 0.46875, "reward_std": 0.08455466292798519, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 934.875, "epoch": 0.3492, "grad_norm": 0.14714079033376484, "kl": 6.6015625, "learning_rate": 1.6449243002526146e-05, "loss": 0.264, "reward": 0.4609375, "reward_std": 0.08587360195815563, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 919.875, "epoch": 0.3496, "grad_norm": 0.17438741407537914, "kl": 5.96875, "learning_rate": 1.643856582585254e-05, "loss": 0.2391, "reward": 0.458984375, "reward_std": 0.10870842449367046, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.458984375, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 718.625, "epoch": 0.35, "grad_norm": 0.2820844201680946, "kl": 6.5546875, "learning_rate": 1.6427876096865394e-05, "loss": 0.2623, "reward": 0.466796875, "reward_std": 0.07368068769574165, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 649.625, "epoch": 0.3504, "grad_norm": 0.18813084485280482, "kl": 6.40625, "learning_rate": 1.6417173836404888e-05, "loss": 0.2562, "reward": 0.486328125, "reward_std": 0.043847277760505676, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 722.75, "epoch": 0.3508, "grad_norm": 0.13231821820626133, "kl": 6.359375, "learning_rate": 1.6406459065335616e-05, "loss": 0.2545, "reward": 0.466796875, "reward_std": 0.07360323145985603, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 710.875, "epoch": 0.3512, "grad_norm": 0.24117338449052, "kl": 6.6640625, "learning_rate": 1.6395731804546582e-05, "loss": 0.2668, "reward": 0.482421875, "reward_std": 0.06536140665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 613.75, "epoch": 0.3516, "grad_norm": 0.23131475395614623, "kl": 5.9609375, "learning_rate": 1.6384992074951124e-05, "loss": 0.2389, "reward": 0.46875, "reward_std": 0.07203313335776329, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 623.75, "epoch": 0.352, "grad_norm": 0.10486168117702224, "kl": 6.3359375, "learning_rate": 1.63742398974869e-05, "loss": 0.2531, "reward": 0.60546875, "reward_std": 0.05738259106874466, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 709.75, "epoch": 0.3524, "grad_norm": 0.16026656182084303, "kl": 6.3046875, "learning_rate": 1.6363475293115824e-05, "loss": 0.2526, "reward": 0.603515625, "reward_std": 0.07108421996235847, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 515.0, "epoch": 0.3528, "grad_norm": 0.2559298500515755, "kl": 6.2109375, "learning_rate": 1.6352698282824045e-05, "loss": 0.2484, "reward": 0.486328125, "reward_std": 0.03740682825446129, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 616.5, "epoch": 0.3532, "grad_norm": 0.10888061323555101, "kl": 6.2421875, "learning_rate": 1.6341908887621894e-05, "loss": 0.2501, "reward": 0.611328125, "reward_std": 0.04478531330823898, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 333.125, "epoch": 0.3536, "grad_norm": 0.5241098348704837, "kl": 5.671875, "learning_rate": 1.6331107128543856e-05, "loss": 0.2269, "reward": 0.498046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 375.875, "epoch": 0.354, "grad_norm": 0.1553893396139741, "kl": 5.1796875, "learning_rate": 1.632029302664851e-05, "loss": 0.207, "reward": 0.623046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 519.875, "epoch": 0.3544, "grad_norm": 0.15326524589993137, "kl": 5.875, "learning_rate": 1.6309466603018497e-05, "loss": 0.235, "reward": 0.49609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 524.75, "epoch": 0.3548, "grad_norm": 0.08183678898287428, "kl": 5.4140625, "learning_rate": 1.6298627878760488e-05, "loss": 0.2168, "reward": 0.48828125, "reward_std": 0.02959432825446129, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 648.25, "epoch": 0.3552, "grad_norm": 0.09353613361261652, "kl": 4.81640625, "learning_rate": 1.628777687500513e-05, "loss": 0.1929, "reward": 0.4921875, "reward_std": 0.02629890665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 568.375, "epoch": 0.3556, "grad_norm": 0.5127498630466223, "kl": 4.984375, "learning_rate": 1.6276913612907005e-05, "loss": 0.1996, "reward": 0.615234375, "reward_std": 0.02178792469203472, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 750.0, "epoch": 0.356, "grad_norm": 0.09805275906367174, "kl": 4.5625, "learning_rate": 1.6266038113644605e-05, "loss": 0.1825, "reward": 0.482421875, "reward_std": 0.05215360224246979, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 541.5, "epoch": 0.3564, "grad_norm": 0.2747264734043828, "kl": 4.20703125, "learning_rate": 1.6255150398420273e-05, "loss": 0.168, "reward": 0.486328125, "reward_std": 0.016010859981179237, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 851.25, "epoch": 0.3568, "grad_norm": 0.14300532862693877, "kl": 5.375, "learning_rate": 1.624425048846016e-05, "loss": 0.2149, "reward": 0.607421875, "reward_std": 0.06387205049395561, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 897.125, "epoch": 0.3572, "grad_norm": 0.16870093911053594, "kl": 5.328125, "learning_rate": 1.6233338405014204e-05, "loss": 0.2131, "reward": 0.48046875, "reward_std": 0.05738259106874466, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 825.125, "epoch": 0.3576, "grad_norm": 0.2648316174103762, "kl": 5.671875, "learning_rate": 1.6222414169356066e-05, "loss": 0.227, "reward": 0.484375, "reward_std": 0.06233368441462517, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 880.375, "epoch": 0.358, "grad_norm": 0.154587376165261, "kl": 5.5078125, "learning_rate": 1.6211477802783105e-05, "loss": 0.2203, "reward": 0.478515625, "reward_std": 0.06865682825446129, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 762.125, "epoch": 0.3584, "grad_norm": 0.2182808814427237, "kl": 4.8125, "learning_rate": 1.620052932661633e-05, "loss": 0.1927, "reward": 0.61328125, "reward_std": 0.051659777760505676, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 885.625, "epoch": 0.3588, "grad_norm": 0.562972088293493, "kl": 4.671875, "learning_rate": 1.618956876220035e-05, "loss": 0.1867, "reward": 0.462890625, "reward_std": 0.14579572156071663, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.447265625, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 655.875, "epoch": 0.3592, "grad_norm": 0.24960159829676487, "kl": 5.0625, "learning_rate": 1.6178596130903345e-05, "loss": 0.2022, "reward": 0.5, "reward_std": 0.04738743044435978, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3596, "grad_norm": 0.2599577773921507, "kl": 5.27734375, "learning_rate": 1.6167611454117027e-05, "loss": 0.2108, "reward": 0.474609375, "reward_std": 0.0944178868085146, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 961.75, "epoch": 0.36, "grad_norm": 0.21043374924240654, "kl": 5.4453125, "learning_rate": 1.6156614753256583e-05, "loss": 0.2179, "reward": 0.474609375, "reward_std": 0.09272734820842743, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 996.375, "epoch": 0.3604, "grad_norm": 0.2544911337801651, "kl": 5.3203125, "learning_rate": 1.6145606049760644e-05, "loss": 0.2131, "reward": 0.478515625, "reward_std": 0.11878204345703125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 898.0, "epoch": 0.3608, "grad_norm": 0.2779520303183107, "kl": 4.390625, "learning_rate": 1.6134585365091243e-05, "loss": 0.1756, "reward": 0.486328125, "reward_std": 0.09447582066059113, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 904.125, "epoch": 0.3612, "grad_norm": 0.23359395045803708, "kl": 4.6875, "learning_rate": 1.6123552720733767e-05, "loss": 0.1874, "reward": 0.5, "reward_std": 0.13663224503397942, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 860.5, "epoch": 0.3616, "grad_norm": 0.29969908129751066, "kl": 4.4765625, "learning_rate": 1.611250813819692e-05, "loss": 0.1792, "reward": 0.619140625, "reward_std": 0.125724408775568, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 918.625, "epoch": 0.362, "grad_norm": 0.2414657012776565, "kl": 5.3359375, "learning_rate": 1.610145163901268e-05, "loss": 0.2132, "reward": 0.55859375, "reward_std": 0.14488892629742622, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55859375, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 749.625, "epoch": 0.3624, "grad_norm": 0.33632737973072796, "kl": 4.296875, "learning_rate": 1.6090383244736256e-05, "loss": 0.1723, "reward": 0.59375, "reward_std": 0.16826364025473595, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59375, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 754.75, "epoch": 0.3628, "grad_norm": 0.34292117645875186, "kl": 4.375, "learning_rate": 1.6079302976946055e-05, "loss": 0.1752, "reward": 0.734375, "reward_std": 0.154843982309103, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 734.375, "epoch": 0.3632, "grad_norm": 0.4597912225113042, "kl": 4.078125, "learning_rate": 1.6068210857243625e-05, "loss": 0.1632, "reward": 0.728515625, "reward_std": 0.15504353493452072, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.603515625, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 468.625, "epoch": 0.3636, "grad_norm": 0.4861198688378019, "kl": 3.8359375, "learning_rate": 1.6057106907253617e-05, "loss": 0.1537, "reward": 0.703125, "reward_std": 0.08246412128210068, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.703125, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 572.5, "epoch": 0.364, "grad_norm": 0.2788324494523435, "kl": 4.1796875, "learning_rate": 1.6045991148623752e-05, "loss": 0.1676, "reward": 0.732421875, "reward_std": 0.05325482226908207, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.732421875, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 691.375, "epoch": 0.3644, "grad_norm": 0.23230065705271333, "kl": 4.9453125, "learning_rate": 1.6034863603024768e-05, "loss": 0.1977, "reward": 0.708984375, "reward_std": 0.09458901733160019, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708984375, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 446.125, "epoch": 0.3648, "grad_norm": 2.0887127634617504, "kl": 4.33203125, "learning_rate": 1.6023724292150387e-05, "loss": 0.1733, "reward": 0.734375, "reward_std": 0.08012315630912781, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 602.75, "epoch": 0.3652, "grad_norm": 0.14513654754212313, "kl": 4.63671875, "learning_rate": 1.601257323771727e-05, "loss": 0.1858, "reward": 0.71875, "reward_std": 0.09052931517362595, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71875, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 480.375, "epoch": 0.3656, "grad_norm": 0.3017765190932495, "kl": 4.16796875, "learning_rate": 1.6001410461464955e-05, "loss": 0.1665, "reward": 0.912109375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.740234375, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 602.125, "epoch": 0.366, "grad_norm": 0.3965602144550272, "kl": 4.80859375, "learning_rate": 1.599023598515586e-05, "loss": 0.1925, "reward": 0.7421875, "reward_std": 0.08086910098791122, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.734375, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 511.25, "epoch": 0.3664, "grad_norm": 0.5311512867001745, "kl": 4.1953125, "learning_rate": 1.597904983057519e-05, "loss": 0.168, "reward": 0.837890625, "reward_std": 0.09025543369352818, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.712890625, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 750.5, "epoch": 0.3668, "grad_norm": 0.38480082540561894, "kl": 5.3046875, "learning_rate": 1.596785201953093e-05, "loss": 0.212, "reward": 0.84765625, "reward_std": 0.09589069709181786, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.72265625, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 657.5, "epoch": 0.3672, "grad_norm": 0.36234322474127956, "kl": 4.234375, "learning_rate": 1.5956642573853784e-05, "loss": 0.1695, "reward": 0.7890625, "reward_std": 0.11329681053757668, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6640625, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 519.125, "epoch": 0.3676, "grad_norm": 1.2504651083101972, "kl": 4.5390625, "learning_rate": 1.5945421515397135e-05, "loss": 0.1811, "reward": 0.8671875, "reward_std": 0.03125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 763.25, "epoch": 0.368, "grad_norm": 0.2770326202902919, "kl": 5.5, "learning_rate": 1.5934188866037017e-05, "loss": 0.2197, "reward": 0.830078125, "reward_std": 0.11750245466828346, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.705078125, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 661.125, "epoch": 0.3684, "grad_norm": 0.3396329772253615, "kl": 5.0859375, "learning_rate": 1.592294464767205e-05, "loss": 0.2036, "reward": 0.83984375, "reward_std": 0.1008685901761055, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71484375, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 843.375, "epoch": 0.3688, "grad_norm": 0.315478922747648, "kl": 5.6875, "learning_rate": 1.591168888222342e-05, "loss": 0.2272, "reward": 0.83203125, "reward_std": 0.12356225773692131, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.70703125, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 836.25, "epoch": 0.3692, "grad_norm": 1.1359443626113495, "kl": 5.5625, "learning_rate": 1.5900421591634813e-05, "loss": 0.2226, "reward": 0.916015625, "reward_std": 0.19269749149680138, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.697265625, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 726.375, "epoch": 0.3696, "grad_norm": 0.5472163792277692, "kl": 5.3515625, "learning_rate": 1.5889142797872387e-05, "loss": 0.2139, "reward": 0.951171875, "reward_std": 0.1159161776304245, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.701171875, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 749.875, "epoch": 0.37, "grad_norm": 1.0493098301157433, "kl": 5.4296875, "learning_rate": 1.5877852522924733e-05, "loss": 0.2175, "reward": 0.65625, "reward_std": 0.17295800521969795, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.65625, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 833.25, "epoch": 0.3704, "grad_norm": 0.19936292671134076, "kl": 5.8359375, "learning_rate": 1.5866550788802815e-05, "loss": 0.2333, "reward": 0.826171875, "reward_std": 0.13090714067220688, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.701171875, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 931.5, "epoch": 0.3708, "grad_norm": 0.199006365420841, "kl": 5.6640625, "learning_rate": 1.5855237617539943e-05, "loss": 0.2265, "reward": 0.798828125, "reward_std": 0.17533262819051743, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.673828125, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 653.75, "epoch": 0.3712, "grad_norm": 0.17866313802432737, "kl": 5.0, "learning_rate": 1.5843913031191722e-05, "loss": 0.1998, "reward": 0.716796875, "reward_std": 0.09049590863287449, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716796875, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 746.125, "epoch": 0.3716, "grad_norm": 0.3762779179659985, "kl": 5.125, "learning_rate": 1.5832577051836016e-05, "loss": 0.2054, "reward": 0.69140625, "reward_std": 0.12932413443922997, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69140625, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 836.625, "epoch": 0.372, "grad_norm": 0.41267191883511634, "kl": 5.1640625, "learning_rate": 1.5821229701572897e-05, "loss": 0.2062, "reward": 0.73828125, "reward_std": 0.2252337522804737, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.66015625, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 756.25, "epoch": 0.3724, "grad_norm": 0.5134209485452443, "kl": 5.375, "learning_rate": 1.5809871002524602e-05, "loss": 0.2149, "reward": 0.810546875, "reward_std": 0.13940271735191345, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.685546875, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 579.5, "epoch": 0.3728, "grad_norm": 0.45960780471120594, "kl": 5.1015625, "learning_rate": 1.5798500976835493e-05, "loss": 0.2037, "reward": 0.8359375, "reward_std": 0.10487748682498932, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 684.125, "epoch": 0.3732, "grad_norm": 0.420573028111549, "kl": 4.99609375, "learning_rate": 1.5787119646672025e-05, "loss": 0.1996, "reward": 0.68359375, "reward_std": 0.12034809775650501, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.68359375, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 855.875, "epoch": 0.3736, "grad_norm": 0.40420558901552905, "kl": 5.40625, "learning_rate": 1.5775727034222675e-05, "loss": 0.216, "reward": 0.677734375, "reward_std": 0.1463232170790434, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.677734375, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 854.25, "epoch": 0.374, "grad_norm": 78.52701665823083, "kl": 5.6171875, "learning_rate": 1.5764323161697933e-05, "loss": 0.225, "reward": 0.69140625, "reward_std": 0.1495482251048088, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69140625, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 661.5, "epoch": 0.3744, "grad_norm": 0.27113952608582875, "kl": 5.359375, "learning_rate": 1.575290805133023e-05, "loss": 0.2142, "reward": 0.724609375, "reward_std": 0.0810343436896801, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.724609375, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 691.5, "epoch": 0.3748, "grad_norm": 0.30608743879636174, "kl": 4.484375, "learning_rate": 1.57414817253739e-05, "loss": 0.1795, "reward": 0.83203125, "reward_std": 0.09817857295274734, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.70703125, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 773.875, "epoch": 0.3752, "grad_norm": 0.3091875368102946, "kl": 4.9609375, "learning_rate": 1.5730044206105156e-05, "loss": 0.1989, "reward": 0.80859375, "reward_std": 0.16510383039712906, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71484375, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 732.875, "epoch": 0.3756, "grad_norm": 0.24971438107175134, "kl": 4.484375, "learning_rate": 1.5718595515822027e-05, "loss": 0.1799, "reward": 0.83203125, "reward_std": 0.16933366656303406, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.72265625, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 968.5, "epoch": 0.376, "grad_norm": 0.15251661240746803, "kl": 5.1875, "learning_rate": 1.570713567684432e-05, "loss": 0.2073, "reward": 0.697265625, "reward_std": 0.15518635138869286, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.697265625, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 707.75, "epoch": 0.3764, "grad_norm": 0.22439232315673352, "kl": 4.3359375, "learning_rate": 1.5695664711513575e-05, "loss": 0.1734, "reward": 0.740234375, "reward_std": 0.08868160098791122, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.732421875, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 846.75, "epoch": 0.3768, "grad_norm": 0.21851077254340087, "kl": 4.28125, "learning_rate": 1.568418264219303e-05, "loss": 0.1712, "reward": 0.716796875, "reward_std": 0.14986234158277512, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.701171875, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 948.0, "epoch": 0.3772, "grad_norm": 0.8683015649779784, "kl": 4.046875, "learning_rate": 1.567268949126757e-05, "loss": 0.1617, "reward": 0.78515625, "reward_std": 0.1767427660524845, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.66015625, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 816.125, "epoch": 0.3776, "grad_norm": 0.27996877638362366, "kl": 4.03125, "learning_rate": 1.5661185281143666e-05, "loss": 0.1611, "reward": 0.80859375, "reward_std": 0.1514010913670063, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 970.0, "epoch": 0.378, "grad_norm": 0.20504437357049993, "kl": 3.921875, "learning_rate": 1.564967003424938e-05, "loss": 0.1567, "reward": 0.69140625, "reward_std": 0.14155962318181992, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69140625, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 845.0, "epoch": 0.3784, "grad_norm": 0.3460196363244027, "kl": 3.9921875, "learning_rate": 1.5638143773034268e-05, "loss": 0.1592, "reward": 0.8515625, "reward_std": 0.10312877595424652, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71875, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 882.375, "epoch": 0.3788, "grad_norm": 0.2522836643212475, "kl": 3.6875, "learning_rate": 1.562660651996937e-05, "loss": 0.1475, "reward": 0.890625, "reward_std": 0.1636759750545025, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 935.75, "epoch": 0.3792, "grad_norm": 0.32780779728799664, "kl": 4.109375, "learning_rate": 1.5615058297547144e-05, "loss": 0.1641, "reward": 0.82421875, "reward_std": 0.129464790225029, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 931.875, "epoch": 0.3796, "grad_norm": 0.30197843714225064, "kl": 3.6015625, "learning_rate": 1.5603499128281447e-05, "loss": 0.1441, "reward": 0.701171875, "reward_std": 0.15086116269230843, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.701171875, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 874.0, "epoch": 0.38, "grad_norm": 0.1982090089217144, "kl": 3.5234375, "learning_rate": 1.5591929034707468e-05, "loss": 0.141, "reward": 0.720703125, "reward_std": 0.09308596886694431, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720703125, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 699.25, "epoch": 0.3804, "grad_norm": 0.22867251118767576, "kl": 3.5234375, "learning_rate": 1.55803480393817e-05, "loss": 0.141, "reward": 0.8046875, "reward_std": 0.11375363171100616, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 893.625, "epoch": 0.3808, "grad_norm": 0.18391229863893133, "kl": 3.75, "learning_rate": 1.556875616488188e-05, "loss": 0.1497, "reward": 0.779296875, "reward_std": 0.18430540710687637, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716796875, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 926.125, "epoch": 0.3812, "grad_norm": 0.9477841868899489, "kl": 4.7265625, "learning_rate": 1.5557153433806967e-05, "loss": 0.1888, "reward": 0.779296875, "reward_std": 0.20784425362944603, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.701171875, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 828.375, "epoch": 0.3816, "grad_norm": 1.221762111862495, "kl": 3.7578125, "learning_rate": 1.5545539868777075e-05, "loss": 0.1504, "reward": 0.69921875, "reward_std": 0.10762947797775269, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 701.75, "epoch": 0.382, "grad_norm": 0.21942679788633598, "kl": 3.75390625, "learning_rate": 1.553391549243344e-05, "loss": 0.1505, "reward": 0.73046875, "reward_std": 0.058320626616477966, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.73046875, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 872.5, "epoch": 0.3824, "grad_norm": 0.5411660640932874, "kl": 4.29296875, "learning_rate": 1.5522280327438388e-05, "loss": 0.1719, "reward": 0.6796875, "reward_std": 0.14524710923433304, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6796875, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 912.0, "epoch": 0.3828, "grad_norm": 0.36076938438412987, "kl": 4.8046875, "learning_rate": 1.5510634396475262e-05, "loss": 0.192, "reward": 0.73046875, "reward_std": 0.19971567392349243, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 908.375, "epoch": 0.3832, "grad_norm": 0.17978398926512407, "kl": 4.671875, "learning_rate": 1.54989777222484e-05, "loss": 0.1872, "reward": 0.716796875, "reward_std": 0.11465360224246979, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716796875, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 661.375, "epoch": 0.3836, "grad_norm": 0.19553102512531764, "kl": 4.03125, "learning_rate": 1.5487310327483087e-05, "loss": 0.1615, "reward": 0.734375, "reward_std": 0.04081955552101135, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.734375, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 764.875, "epoch": 0.384, "grad_norm": 0.12920553193963188, "kl": 4.546875, "learning_rate": 1.5475632234925505e-05, "loss": 0.182, "reward": 0.724609375, "reward_std": 0.09512205049395561, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.724609375, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 647.125, "epoch": 0.3844, "grad_norm": 0.13504315772831674, "kl": 4.390625, "learning_rate": 1.5463943467342694e-05, "loss": 0.1759, "reward": 0.7265625, "reward_std": 0.056444816291332245, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 706.5, "epoch": 0.3848, "grad_norm": 0.19318283036432954, "kl": 4.6171875, "learning_rate": 1.5452244047522504e-05, "loss": 0.1847, "reward": 0.880859375, "reward_std": 0.09660791605710983, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.732421875, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 624.25, "epoch": 0.3852, "grad_norm": 0.457678645809761, "kl": 4.0703125, "learning_rate": 1.544053399827355e-05, "loss": 0.1631, "reward": 0.728515625, "reward_std": 0.0667114146053791, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.728515625, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 724.625, "epoch": 0.3856, "grad_norm": 6.794449756660861, "kl": 5.0625, "learning_rate": 1.5428813342425177e-05, "loss": 0.2023, "reward": 0.7109375, "reward_std": 0.11965570598840714, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.386, "grad_norm": 19.43226679326528, "kl": 4.3671875, "learning_rate": 1.54170821028274e-05, "loss": 0.1747, "reward": 0.67578125, "reward_std": 0.24103903770446777, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.54296875, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3864, "grad_norm": 3.159245823423717, "kl": 0.919921875, "learning_rate": 1.540534030235087e-05, "loss": 0.0368, "reward": 0.5546875, "reward_std": 0.21053316816687584, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 1018.828125, "epoch": 0.3868, "grad_norm": 30.773386609370938, "kl": 6.798828125, "learning_rate": 1.5393587963886837e-05, "loss": 0.2669, "reward": 0.71484375, "reward_std": 0.20085986703634262, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 687.5, "epoch": 0.3872, "grad_norm": 9.701297588138587, "kl": 2.5390625, "learning_rate": 1.5381825110347072e-05, "loss": 0.1016, "reward": 0.720703125, "reward_std": 0.09373383782804012, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720703125, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 732.0, "epoch": 0.3876, "grad_norm": 0.8768500792137195, "kl": 3.5078125, "learning_rate": 1.5370051764663872e-05, "loss": 0.1403, "reward": 0.76171875, "reward_std": 0.10456908121705055, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.73828125, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 714.375, "epoch": 0.388, "grad_norm": 0.4744617062125385, "kl": 3.421875, "learning_rate": 1.5358267949789968e-05, "loss": 0.1368, "reward": 0.896484375, "reward_std": 0.08883683383464813, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.740234375, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 764.5, "epoch": 0.3884, "grad_norm": 2.7206538014255437, "kl": 3.03515625, "learning_rate": 1.5346473688698514e-05, "loss": 0.1213, "reward": 0.744140625, "reward_std": 0.04403293877840042, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.744140625, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 799.0, "epoch": 0.3888, "grad_norm": 0.7926321842704551, "kl": 3.140625, "learning_rate": 1.533466900438303e-05, "loss": 0.1255, "reward": 0.8671875, "reward_std": 0.06922422721982002, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.734375, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 757.5, "epoch": 0.3892, "grad_norm": 0.5367765193568923, "kl": 3.45703125, "learning_rate": 1.532285391985734e-05, "loss": 0.1382, "reward": 0.861328125, "reward_std": 0.043295957148075104, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.736328125, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 800.125, "epoch": 0.3896, "grad_norm": 0.4583041602770902, "kl": 3.85546875, "learning_rate": 1.5311028458155567e-05, "loss": 0.1546, "reward": 0.73046875, "reward_std": 0.07349834218621254, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.73046875, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 813.625, "epoch": 0.39, "grad_norm": 0.7167212292017308, "kl": 3.76171875, "learning_rate": 1.529919264233205e-05, "loss": 0.1506, "reward": 0.7265625, "reward_std": 0.08547815307974815, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 867.125, "epoch": 0.3904, "grad_norm": 0.12970690764601772, "kl": 3.9296875, "learning_rate": 1.528734649546132e-05, "loss": 0.1573, "reward": 0.732421875, "reward_std": 0.06865446642041206, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.732421875, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 745.0, "epoch": 0.3908, "grad_norm": 0.3785320406574679, "kl": 3.78125, "learning_rate": 1.5275490040638038e-05, "loss": 0.1513, "reward": 0.841796875, "reward_std": 0.17969447001814842, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.724609375, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 693.125, "epoch": 0.3912, "grad_norm": 0.05582321006825047, "kl": 3.98828125, "learning_rate": 1.526362330097698e-05, "loss": 0.1593, "reward": 0.744140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.744140625, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 623.125, "epoch": 0.3916, "grad_norm": 0.23216079595172373, "kl": 4.0, "learning_rate": 1.5251746299612959e-05, "loss": 0.1602, "reward": 0.873046875, "reward_std": 0.058537889271974564, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.740234375, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 830.5, "epoch": 0.392, "grad_norm": 0.4020467972438768, "kl": 4.8671875, "learning_rate": 1.5239859059700794e-05, "loss": 0.1944, "reward": 0.8359375, "reward_std": 0.13038768246769905, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 587.5, "epoch": 0.3924, "grad_norm": 0.6812637828001137, "kl": 3.8984375, "learning_rate": 1.5227961604415266e-05, "loss": 0.156, "reward": 0.744140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.744140625, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 920.0, "epoch": 0.3928, "grad_norm": 0.10739561219185212, "kl": 5.671875, "learning_rate": 1.5216053956951081e-05, "loss": 0.2269, "reward": 0.8359375, "reward_std": 0.14139671996235847, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 514.5, "epoch": 0.3932, "grad_norm": 1.2609313229217194, "kl": 4.0078125, "learning_rate": 1.5204136140522799e-05, "loss": 0.1602, "reward": 0.84765625, "reward_std": 0.0660141110420227, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.74609375, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 725.375, "epoch": 0.3936, "grad_norm": 0.3121395677204706, "kl": 4.609375, "learning_rate": 1.5192208178364815e-05, "loss": 0.1848, "reward": 0.8359375, "reward_std": 0.09803583100438118, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.734375, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 964.0, "epoch": 0.394, "grad_norm": 7.013654473091847, "kl": 4.9609375, "learning_rate": 1.5180270093731305e-05, "loss": 0.1983, "reward": 0.71875, "reward_std": 0.11509781330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71875, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3944, "grad_norm": 0.3438181706862284, "kl": 4.76171875, "learning_rate": 1.5168321909896171e-05, "loss": 0.1905, "reward": 0.68359375, "reward_std": 0.13723275437951088, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.68359375, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3948, "grad_norm": 0.37434239835841904, "kl": 2.927734375, "learning_rate": 1.5156363650153012e-05, "loss": 0.1171, "reward": 0.634765625, "reward_std": 0.19419245794415474, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.626953125, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3952, "grad_norm": 0.6598077803363676, "kl": 2.615234375, "learning_rate": 1.5144395337815066e-05, "loss": 0.1046, "reward": 0.609375, "reward_std": 0.23497949540615082, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3956, "grad_norm": 0.6126821551630962, "kl": 4.0390625, "learning_rate": 1.5132416996215171e-05, "loss": 0.1616, "reward": 0.6796875, "reward_std": 0.1536012776196003, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6796875, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 971.5, "epoch": 0.396, "grad_norm": 0.30253947016519167, "kl": 4.08203125, "learning_rate": 1.5120428648705716e-05, "loss": 0.1631, "reward": 0.763671875, "reward_std": 0.1182689368724823, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.755859375, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3964, "grad_norm": 0.421210599095285, "kl": 4.1171875, "learning_rate": 1.51084303186586e-05, "loss": 0.1651, "reward": 0.779296875, "reward_std": 0.14214756712317467, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.779296875, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 1012.75, "epoch": 0.3968, "grad_norm": 6.811488022225336, "kl": 2.71484375, "learning_rate": 1.5096422029465178e-05, "loss": 0.1085, "reward": 0.81640625, "reward_std": 0.1870644222944975, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.81640625, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 1017.625, "epoch": 0.3972, "grad_norm": 18.843740255691987, "kl": 7.5546875, "learning_rate": 1.508440380453623e-05, "loss": 0.3022, "reward": 0.880859375, "reward_std": 0.21170634031295776, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.880859375, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 1006.375, "epoch": 0.3976, "grad_norm": 16.47482701934583, "kl": 3.57421875, "learning_rate": 1.5072375667301893e-05, "loss": 0.1431, "reward": 1.001953125, "reward_std": 0.16670615412294865, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.884765625, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.398, "grad_norm": 16.125517267476848, "kl": 1.921875, "learning_rate": 1.5060337641211637e-05, "loss": 0.0769, "reward": 0.865234375, "reward_std": 0.2258804515004158, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.740234375, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3984, "grad_norm": 15.800905435222177, "kl": 3.12109375, "learning_rate": 1.504828974973422e-05, "loss": 0.1247, "reward": 0.91015625, "reward_std": 0.23509002476930618, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.79296875, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 1002.125, "epoch": 0.3988, "grad_norm": 5.8337581094090725, "kl": 1.923828125, "learning_rate": 1.503623201635761e-05, "loss": 0.0769, "reward": 0.98046875, "reward_std": 0.1679573878645897, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.85546875, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 950.5, "epoch": 0.3992, "grad_norm": 7.556518680863777, "kl": 2.6171875, "learning_rate": 1.5024164464588982e-05, "loss": 0.1045, "reward": 0.892578125, "reward_std": 0.19834443554282188, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.783203125, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 887.875, "epoch": 0.3996, "grad_norm": 4.036405389820712, "kl": 2.625, "learning_rate": 1.5012087117954643e-05, "loss": 0.1051, "reward": 0.8515625, "reward_std": 0.191015362739563, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 1019.5, "epoch": 0.4, "grad_norm": 5.7791214226253, "kl": 2.220703125, "learning_rate": 1.5000000000000002e-05, "loss": 0.0888, "reward": 0.91015625, "reward_std": 0.17512419447302818, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.78515625, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4004, "grad_norm": 9.52257133296929, "kl": 2.201171875, "learning_rate": 1.498790313428951e-05, "loss": 0.0879, "reward": 0.837890625, "reward_std": 0.16919897124171257, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.837890625, "step": 1001 }, { "clip_ratio": 0.0, "completion_length": 1022.3359375, "epoch": 0.4008, "grad_norm": 12.40343642405453, "kl": 2.037109375, "learning_rate": 1.4975796544406627e-05, "loss": 0.0814, "reward": 0.798828125, "reward_std": 0.21720796078443527, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.798828125, "step": 1002 }, { "clip_ratio": 0.0, "completion_length": 999.75, "epoch": 0.4012, "grad_norm": 16.971677826306976, "kl": 2.7109375, "learning_rate": 1.496368025395377e-05, "loss": 0.1086, "reward": 1.1015625, "reward_std": 0.18030565977096558, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8984375, "step": 1003 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4016, "grad_norm": 1.9477973183474382, "kl": 2.0859375, "learning_rate": 1.4951554286552266e-05, "loss": 0.0834, "reward": 0.841796875, "reward_std": 0.19244202598929405, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.841796875, "step": 1004 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.402, "grad_norm": 0.6514027801866377, "kl": 2.390625, "learning_rate": 1.493941866584231e-05, "loss": 0.0955, "reward": 0.890625, "reward_std": 0.16140714660286903, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.890625, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4024, "grad_norm": 0.8303114058171508, "kl": 2.23828125, "learning_rate": 1.4927273415482916e-05, "loss": 0.0895, "reward": 0.8671875, "reward_std": 0.1790929213166237, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8671875, "step": 1006 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4028, "grad_norm": 1.5710675052441705, "kl": 2.703125, "learning_rate": 1.4915118559151871e-05, "loss": 0.1081, "reward": 0.818359375, "reward_std": 0.25242454558610916, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.818359375, "step": 1007 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4032, "grad_norm": 15.554009206592818, "kl": 2.37109375, "learning_rate": 1.4902954120545687e-05, "loss": 0.0949, "reward": 0.81640625, "reward_std": 0.1981324441730976, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.81640625, "step": 1008 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4036, "grad_norm": 0.40692273200153684, "kl": 2.828125, "learning_rate": 1.4890780123379565e-05, "loss": 0.1132, "reward": 0.775390625, "reward_std": 0.2027147337794304, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.775390625, "step": 1009 }, { "clip_ratio": 0.0, "completion_length": 981.0, "epoch": 0.404, "grad_norm": 0.40002630336662914, "kl": 2.984375, "learning_rate": 1.4878596591387329e-05, "loss": 0.1194, "reward": 0.951171875, "reward_std": 0.1978583261370659, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.826171875, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 950.25, "epoch": 0.4044, "grad_norm": 0.5453844319466978, "kl": 3.3984375, "learning_rate": 1.4866403548321402e-05, "loss": 0.1357, "reward": 0.8515625, "reward_std": 0.21163541078567505, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8515625, "step": 1011 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4048, "grad_norm": 0.2441399534712837, "kl": 3.53125, "learning_rate": 1.485420101795274e-05, "loss": 0.1412, "reward": 0.8125, "reward_std": 0.20824889466166496, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8125, "step": 1012 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4052, "grad_norm": 0.2427083712905118, "kl": 4.2890625, "learning_rate": 1.4841989024070809e-05, "loss": 0.1714, "reward": 1.0703125, "reward_std": 0.27038489654660225, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.84375, "step": 1013 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4056, "grad_norm": 0.3304329871764904, "kl": 4.3984375, "learning_rate": 1.4829767590483508e-05, "loss": 0.1759, "reward": 0.861328125, "reward_std": 0.17996907234191895, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.861328125, "step": 1014 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.406, "grad_norm": 0.748377853043975, "kl": 3.80859375, "learning_rate": 1.4817536741017153e-05, "loss": 0.1525, "reward": 0.806640625, "reward_std": 0.21300148591399193, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.806640625, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4064, "grad_norm": 2.2435943198452843, "kl": 3.685546875, "learning_rate": 1.4805296499516408e-05, "loss": 0.1473, "reward": 0.90625, "reward_std": 0.2985646203160286, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.703125, "step": 1016 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4068, "grad_norm": 0.20477512637407438, "kl": 2.640625, "learning_rate": 1.4793046889844252e-05, "loss": 0.1055, "reward": 0.701171875, "reward_std": 0.23930985108017921, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.701171875, "step": 1017 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4072, "grad_norm": 3.7910502444512546, "kl": 3.349609375, "learning_rate": 1.4780787935881925e-05, "loss": 0.1343, "reward": 0.64453125, "reward_std": 0.2326519563794136, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.64453125, "step": 1018 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4076, "grad_norm": 0.23586087085649554, "kl": 2.349609375, "learning_rate": 1.4768519661528879e-05, "loss": 0.0939, "reward": 0.83984375, "reward_std": 0.2503856383264065, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 1019 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.408, "grad_norm": 0.24594576404787136, "kl": 2.76171875, "learning_rate": 1.4756242090702756e-05, "loss": 0.1105, "reward": 0.677734375, "reward_std": 0.2330993451178074, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.677734375, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4084, "grad_norm": 2.2231236539863715, "kl": 3.84765625, "learning_rate": 1.4743955247339292e-05, "loss": 0.1539, "reward": 0.748046875, "reward_std": 0.21437354385852814, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.748046875, "step": 1021 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4088, "grad_norm": 0.3877516665681726, "kl": 3.4921875, "learning_rate": 1.4731659155392332e-05, "loss": 0.1399, "reward": 0.71875, "reward_std": 0.25362421572208405, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.71875, "step": 1022 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4092, "grad_norm": 5.518892487554459, "kl": 4.328125, "learning_rate": 1.4719353838833729e-05, "loss": 0.1733, "reward": 0.6875, "reward_std": 0.2537628635764122, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875, "step": 1023 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4096, "grad_norm": 0.4570717389853737, "kl": 4.3515625, "learning_rate": 1.470703932165333e-05, "loss": 0.1743, "reward": 0.76171875, "reward_std": 0.2538025677204132, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.76171875, "step": 1024 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.41, "grad_norm": 0.1697200672703657, "kl": 3.88671875, "learning_rate": 1.469471562785891e-05, "loss": 0.1555, "reward": 0.84765625, "reward_std": 0.24713882058858871, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.72265625, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 936.5, "epoch": 0.4104, "grad_norm": 0.22275403880009828, "kl": 4.3203125, "learning_rate": 1.4682382781476146e-05, "loss": 0.1727, "reward": 0.8984375, "reward_std": 0.2141275331377983, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7734375, "step": 1026 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4108, "grad_norm": 1.4696109820545618, "kl": 4.02734375, "learning_rate": 1.4670040806548555e-05, "loss": 0.1606, "reward": 0.7421875, "reward_std": 0.24881666898727417, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875, "step": 1027 }, { "clip_ratio": 0.0, "completion_length": 929.25, "epoch": 0.4112, "grad_norm": 0.3123014077536165, "kl": 5.03125, "learning_rate": 1.4657689727137443e-05, "loss": 0.2013, "reward": 0.896484375, "reward_std": 0.22837287187576294, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.849609375, "step": 1028 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4116, "grad_norm": 0.1742649175174701, "kl": 5.2421875, "learning_rate": 1.464532956732188e-05, "loss": 0.2097, "reward": 0.951171875, "reward_std": 0.17985611036419868, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.826171875, "step": 1029 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.412, "grad_norm": 3.4162713154334803, "kl": 5.0390625, "learning_rate": 1.463296035119862e-05, "loss": 0.2019, "reward": 1.0234375, "reward_std": 0.2525712884962559, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8046875, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4124, "grad_norm": 1.2870111761693928, "kl": 4.6875, "learning_rate": 1.4620582102882088e-05, "loss": 0.1877, "reward": 0.79296875, "reward_std": 0.2221830189228058, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.79296875, "step": 1031 }, { "clip_ratio": 0.0, "completion_length": 941.375, "epoch": 0.4128, "grad_norm": 1.691786528326955, "kl": 5.4453125, "learning_rate": 1.4608194846504311e-05, "loss": 0.2179, "reward": 0.81640625, "reward_std": 0.21062815189361572, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.81640625, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4132, "grad_norm": 0.7950544663914282, "kl": 4.26171875, "learning_rate": 1.4595798606214882e-05, "loss": 0.1708, "reward": 0.912109375, "reward_std": 0.26808205246925354, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.787109375, "step": 1033 }, { "clip_ratio": 0.0, "completion_length": 793.625, "epoch": 0.4136, "grad_norm": 0.9207707534915422, "kl": 4.94921875, "learning_rate": 1.4583393406180898e-05, "loss": 0.1981, "reward": 0.90234375, "reward_std": 0.17725158482789993, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.90234375, "step": 1034 }, { "clip_ratio": 0.0, "completion_length": 972.0, "epoch": 0.414, "grad_norm": 2.6781129136515904, "kl": 4.8203125, "learning_rate": 1.4570979270586944e-05, "loss": 0.1926, "reward": 0.82421875, "reward_std": 0.2341214120388031, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.82421875, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4144, "grad_norm": 24.783784275809985, "kl": 11.875, "learning_rate": 1.4558556223635004e-05, "loss": 0.4749, "reward": 0.8203125, "reward_std": 0.21796300634741783, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8203125, "step": 1036 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4148, "grad_norm": 7.084886558387461, "kl": 5.2578125, "learning_rate": 1.454612428954444e-05, "loss": 0.2104, "reward": 0.8359375, "reward_std": 0.24900556728243828, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8125, "step": 1037 }, { "clip_ratio": 0.0, "completion_length": 1023.1015625, "epoch": 0.4152, "grad_norm": 643.3290302924479, "kl": 25.8984375, "learning_rate": 1.4533683492551954e-05, "loss": 1.0364, "reward": 0.642578125, "reward_std": 0.25798796117305756, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.626953125, "step": 1038 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4156, "grad_norm": 262.22519965759506, "kl": 12.0, "learning_rate": 1.4521233856911507e-05, "loss": 0.4808, "reward": 0.609375, "reward_std": 0.21389301866292953, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 1039 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.416, "grad_norm": 33.24858771263275, "kl": 3.81640625, "learning_rate": 1.4508775406894308e-05, "loss": 0.1525, "reward": 0.57421875, "reward_std": 0.17370493337512016, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 1040 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4164, "grad_norm": 21.245353542106244, "kl": 2.666015625, "learning_rate": 1.449630816678874e-05, "loss": 0.1066, "reward": 0.56640625, "reward_std": 0.19933824241161346, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53515625, "step": 1041 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4168, "grad_norm": 7.9684149741454755, "kl": 4.48828125, "learning_rate": 1.4483832160900326e-05, "loss": 0.1798, "reward": 0.513671875, "reward_std": 0.14508352056145668, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.513671875, "step": 1042 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4172, "grad_norm": 3.2225458757510315, "kl": 2.283203125, "learning_rate": 1.4471347413551673e-05, "loss": 0.0914, "reward": 0.626953125, "reward_std": 0.1386814210563898, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.501953125, "step": 1043 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4176, "grad_norm": 3.1666815779189474, "kl": 2.73046875, "learning_rate": 1.4458853949082443e-05, "loss": 0.1092, "reward": 0.62109375, "reward_std": 0.11744177713990211, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 1044 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.418, "grad_norm": 2.9174743654005173, "kl": 2.890625, "learning_rate": 1.4446351791849276e-05, "loss": 0.1157, "reward": 0.755859375, "reward_std": 0.13556713983416557, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.505859375, "step": 1045 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4184, "grad_norm": 9.259042358779636, "kl": 3.00390625, "learning_rate": 1.4433840966225772e-05, "loss": 0.1203, "reward": 0.607421875, "reward_std": 0.18921349197626114, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 1046 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4188, "grad_norm": 4.944011357162509, "kl": 4.1796875, "learning_rate": 1.4421321496602428e-05, "loss": 0.1675, "reward": 0.498046875, "reward_std": 0.16966531053185463, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 1047 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4192, "grad_norm": 8.039533970093888, "kl": 5.0078125, "learning_rate": 1.4408793407386587e-05, "loss": 0.2002, "reward": 0.51953125, "reward_std": 0.17315999045968056, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 1048 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4196, "grad_norm": 13.447466000863168, "kl": 4.46875, "learning_rate": 1.43962567230024e-05, "loss": 0.1789, "reward": 0.677734375, "reward_std": 0.15039347298443317, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.552734375, "step": 1049 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.42, "grad_norm": 7.3024236521354045, "kl": 4.7109375, "learning_rate": 1.4383711467890776e-05, "loss": 0.1885, "reward": 0.501953125, "reward_std": 0.19688305258750916, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.501953125, "step": 1050 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4204, "grad_norm": 4.236160700175511, "kl": 3.74609375, "learning_rate": 1.437115766650933e-05, "loss": 0.1501, "reward": 0.474609375, "reward_std": 0.19502191990613937, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 1051 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4208, "grad_norm": 2.6024048457768205, "kl": 3.546875, "learning_rate": 1.4358595343332342e-05, "loss": 0.142, "reward": 0.49609375, "reward_std": 0.19944792985916138, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 1052 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4212, "grad_norm": 10.06042396658531, "kl": 6.69921875, "learning_rate": 1.4346024522850704e-05, "loss": 0.2682, "reward": 0.529296875, "reward_std": 0.268559355288744, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 1053 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4216, "grad_norm": 33.61296931684503, "kl": 10.296875, "learning_rate": 1.4333445229571874e-05, "loss": 0.4118, "reward": 0.515625, "reward_std": 0.2443062663078308, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 1054 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.422, "grad_norm": 2.8066321555283156, "kl": 5.3203125, "learning_rate": 1.4320857488019826e-05, "loss": 0.2128, "reward": 0.669921875, "reward_std": 0.21774032711982727, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.544921875, "step": 1055 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4224, "grad_norm": 33.54300520067402, "kl": 12.921875, "learning_rate": 1.4308261322735006e-05, "loss": 0.5144, "reward": 0.541015625, "reward_std": 0.224500834941864, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.541015625, "step": 1056 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4228, "grad_norm": 0.9722468243312085, "kl": 5.0703125, "learning_rate": 1.4295656758274283e-05, "loss": 0.2031, "reward": 0.595703125, "reward_std": 0.21895433962345123, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595703125, "step": 1057 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4232, "grad_norm": 2.295565421880949, "kl": 5.5859375, "learning_rate": 1.4283043819210905e-05, "loss": 0.2235, "reward": 0.658203125, "reward_std": 0.26568491011857986, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595703125, "step": 1058 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4236, "grad_norm": 0.7813124253827821, "kl": 4.8984375, "learning_rate": 1.4270422530134433e-05, "loss": 0.1962, "reward": 0.751953125, "reward_std": 0.21883412450551987, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.611328125, "step": 1059 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.424, "grad_norm": 0.2573971889289969, "kl": 3.890625, "learning_rate": 1.4257792915650728e-05, "loss": 0.1555, "reward": 0.5625, "reward_std": 0.2021920457482338, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 1060 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4244, "grad_norm": 0.4375951861342384, "kl": 3.8828125, "learning_rate": 1.424515500038186e-05, "loss": 0.1551, "reward": 0.708984375, "reward_std": 0.19389567524194717, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.583984375, "step": 1061 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4248, "grad_norm": 2.925386960204455, "kl": 4.46875, "learning_rate": 1.4232508808966097e-05, "loss": 0.1785, "reward": 0.55078125, "reward_std": 0.19072070345282555, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 1062 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4252, "grad_norm": 0.514924004505416, "kl": 5.53125, "learning_rate": 1.4219854366057831e-05, "loss": 0.2213, "reward": 0.66015625, "reward_std": 0.14529666677117348, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.66015625, "step": 1063 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4256, "grad_norm": 13.647291502283712, "kl": 6.9453125, "learning_rate": 1.420719169632755e-05, "loss": 0.278, "reward": 0.6484375, "reward_std": 0.23469997197389603, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875, "step": 1064 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.426, "grad_norm": 0.6817185386107929, "kl": 6.1796875, "learning_rate": 1.4194520824461773e-05, "loss": 0.2469, "reward": 0.6640625, "reward_std": 0.17723320052027702, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.65625, "step": 1065 }, { "clip_ratio": 0.0, "completion_length": 946.875, "epoch": 0.4264, "grad_norm": 0.23409998186542466, "kl": 5.2421875, "learning_rate": 1.4181841775163014e-05, "loss": 0.21, "reward": 0.6640625, "reward_std": 0.14877425879240036, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6640625, "step": 1066 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4268, "grad_norm": 0.18845988188104174, "kl": 5.6015625, "learning_rate": 1.4169154573149737e-05, "loss": 0.2238, "reward": 0.80078125, "reward_std": 0.16656966507434845, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.67578125, "step": 1067 }, { "clip_ratio": 0.0, "completion_length": 957.375, "epoch": 0.4272, "grad_norm": 0.3053156517395978, "kl": 4.9609375, "learning_rate": 1.415645924315628e-05, "loss": 0.1987, "reward": 0.775390625, "reward_std": 0.15013382025063038, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.650390625, "step": 1068 }, { "clip_ratio": 0.0, "completion_length": 749.5, "epoch": 0.4276, "grad_norm": 0.39769654778059704, "kl": 5.5234375, "learning_rate": 1.4143755809932843e-05, "loss": 0.221, "reward": 0.693359375, "reward_std": 0.12244644574820995, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.693359375, "step": 1069 }, { "clip_ratio": 0.0, "completion_length": 831.125, "epoch": 0.428, "grad_norm": 0.2866609718902502, "kl": 5.28515625, "learning_rate": 1.413104429824542e-05, "loss": 0.2113, "reward": 0.64453125, "reward_std": 0.16035678796470165, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.64453125, "step": 1070 }, { "clip_ratio": 0.0, "completion_length": 567.625, "epoch": 0.4284, "grad_norm": 0.5255341634504379, "kl": 4.80859375, "learning_rate": 1.411832473287575e-05, "loss": 0.1926, "reward": 0.728515625, "reward_std": 0.06310233101248741, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.728515625, "step": 1071 }, { "clip_ratio": 0.0, "completion_length": 942.875, "epoch": 0.4288, "grad_norm": 1.5528955194714913, "kl": 5.6796875, "learning_rate": 1.4105597138621281e-05, "loss": 0.227, "reward": 0.814453125, "reward_std": 0.13671252876520157, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.689453125, "step": 1072 }, { "clip_ratio": 0.0, "completion_length": 922.75, "epoch": 0.4292, "grad_norm": 1.2392384812684138, "kl": 6.0234375, "learning_rate": 1.4092861540295109e-05, "loss": 0.2409, "reward": 0.689453125, "reward_std": 0.1551232635974884, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.689453125, "step": 1073 }, { "clip_ratio": 0.0, "completion_length": 538.75, "epoch": 0.4296, "grad_norm": 2.050473257845361, "kl": 5.3046875, "learning_rate": 1.4080117962725929e-05, "loss": 0.2123, "reward": 0.8359375, "reward_std": 0.08656632713973522, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375, "step": 1074 }, { "clip_ratio": 0.0, "completion_length": 934.0, "epoch": 0.43, "grad_norm": 0.7616977356306974, "kl": 6.3125, "learning_rate": 1.4067366430758004e-05, "loss": 0.2526, "reward": 0.708984375, "reward_std": 0.11639644205570221, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708984375, "step": 1075 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4304, "grad_norm": 2.069980295359457, "kl": 6.6015625, "learning_rate": 1.4054606969251095e-05, "loss": 0.2641, "reward": 0.7890625, "reward_std": 0.18334010429680347, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125, "step": 1076 }, { "clip_ratio": 0.0, "completion_length": 734.625, "epoch": 0.4308, "grad_norm": 1.3104239315520467, "kl": 5.546875, "learning_rate": 1.4041839603080423e-05, "loss": 0.2223, "reward": 0.697265625, "reward_std": 0.11110583320260048, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.697265625, "step": 1077 }, { "clip_ratio": 0.0, "completion_length": 551.375, "epoch": 0.4312, "grad_norm": 1.6947584750601532, "kl": 5.34375, "learning_rate": 1.4029064357136628e-05, "loss": 0.214, "reward": 0.857421875, "reward_std": 0.06387205049395561, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.732421875, "step": 1078 }, { "clip_ratio": 0.0, "completion_length": 928.5, "epoch": 0.4316, "grad_norm": 3.7588849482420645, "kl": 5.9609375, "learning_rate": 1.4016281256325702e-05, "loss": 0.2385, "reward": 0.611328125, "reward_std": 0.1922212354838848, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.611328125, "step": 1079 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.432, "grad_norm": 2.2888777279632615, "kl": 5.5234375, "learning_rate": 1.4003490325568953e-05, "loss": 0.2206, "reward": 0.5234375, "reward_std": 0.29113319888710976, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 1080 }, { "clip_ratio": 0.0, "completion_length": 936.25, "epoch": 0.4324, "grad_norm": 2.4147380078622045, "kl": 5.33984375, "learning_rate": 1.3990691589802955e-05, "loss": 0.2136, "reward": 0.60546875, "reward_std": 0.2140753734856844, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.60546875, "step": 1081 }, { "clip_ratio": 0.0, "completion_length": 924.25, "epoch": 0.4328, "grad_norm": 2.1536775646715944, "kl": 6.4375, "learning_rate": 1.39778850739795e-05, "loss": 0.2575, "reward": 0.69921875, "reward_std": 0.14226839691400528, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 1082 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4332, "grad_norm": 2.460816548081574, "kl": 6.4140625, "learning_rate": 1.3965070803065543e-05, "loss": 0.2568, "reward": 0.63671875, "reward_std": 0.2240305282175541, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.63671875, "step": 1083 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4336, "grad_norm": 1.5080163274890468, "kl": 6.71875, "learning_rate": 1.3952248802043166e-05, "loss": 0.2691, "reward": 0.646484375, "reward_std": 0.2099643424153328, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.646484375, "step": 1084 }, { "clip_ratio": 0.0, "completion_length": 932.75, "epoch": 0.434, "grad_norm": 3.253749907946663, "kl": 6.6484375, "learning_rate": 1.3939419095909513e-05, "loss": 0.2661, "reward": 0.642578125, "reward_std": 0.14185254089534283, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.642578125, "step": 1085 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4344, "grad_norm": 4.857306403491315, "kl": 6.0234375, "learning_rate": 1.3926581709676752e-05, "loss": 0.2414, "reward": 0.576171875, "reward_std": 0.27349376678466797, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.576171875, "step": 1086 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4348, "grad_norm": 2.4222027314078436, "kl": 4.9765625, "learning_rate": 1.3913736668372027e-05, "loss": 0.199, "reward": 0.330078125, "reward_std": 0.14209461957216263, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.330078125, "step": 1087 }, { "clip_ratio": 0.0, "completion_length": 1018.265625, "epoch": 0.4352, "grad_norm": 7.382603641822328, "kl": 6.6796875, "learning_rate": 1.3900883997037398e-05, "loss": 0.2664, "reward": 0.435546875, "reward_std": 0.12203985825181007, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.310546875, "step": 1088 }, { "clip_ratio": 0.0, "completion_length": 1019.171875, "epoch": 0.4356, "grad_norm": 1.3183212283611403, "kl": 4.625, "learning_rate": 1.388802372072981e-05, "loss": 0.1825, "reward": 0.296875, "reward_std": 0.11890139430761337, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.296875, "step": 1089 }, { "clip_ratio": 0.0, "completion_length": 1017.1171875, "epoch": 0.436, "grad_norm": 2.99529355027241, "kl": 3.6171875, "learning_rate": 1.3875155864521031e-05, "loss": 0.1426, "reward": 0.3203125, "reward_std": 0.19654517248272896, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.296875, "step": 1090 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4364, "grad_norm": 5.163676567681528, "kl": 3.59375, "learning_rate": 1.3862280453497601e-05, "loss": 0.1439, "reward": 0.42578125, "reward_std": 0.23100878670811653, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.33984375, "step": 1091 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4368, "grad_norm": 6.674510983515601, "kl": 6.5234375, "learning_rate": 1.3849397512760797e-05, "loss": 0.261, "reward": 0.322265625, "reward_std": 0.145750280469656, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.322265625, "step": 1092 }, { "clip_ratio": 0.0, "completion_length": 1016.875, "epoch": 0.4372, "grad_norm": 1.519596845850696, "kl": 6.4765625, "learning_rate": 1.3836507067426565e-05, "loss": 0.2428, "reward": 0.341796875, "reward_std": 0.1320943757891655, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.341796875, "step": 1093 }, { "clip_ratio": 0.0, "completion_length": 929.28125, "epoch": 0.4376, "grad_norm": 1.413821453860164, "kl": 7.4140625, "learning_rate": 1.3823609142625492e-05, "loss": 0.2956, "reward": 0.408203125, "reward_std": 0.12067233212292194, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.408203125, "step": 1094 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.438, "grad_norm": 2.6243269210143594, "kl": 6.6328125, "learning_rate": 1.3810703763502744e-05, "loss": 0.2653, "reward": 0.546875, "reward_std": 0.10583988577127457, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.421875, "step": 1095 }, { "clip_ratio": 0.0, "completion_length": 771.875, "epoch": 0.4384, "grad_norm": 1.9356340515356705, "kl": 5.7109375, "learning_rate": 1.3797790955218014e-05, "loss": 0.2288, "reward": 0.447265625, "reward_std": 0.09712507016956806, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.447265625, "step": 1096 }, { "clip_ratio": 0.0, "completion_length": 944.25, "epoch": 0.4388, "grad_norm": 1.3011567860418805, "kl": 6.3671875, "learning_rate": 1.3784870742945482e-05, "loss": 0.2547, "reward": 0.5390625, "reward_std": 0.15093075670301914, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4296875, "step": 1097 }, { "clip_ratio": 0.0, "completion_length": 584.375, "epoch": 0.4392, "grad_norm": 0.7011759191045217, "kl": 4.34375, "learning_rate": 1.3771943151873768e-05, "loss": 0.1743, "reward": 0.580078125, "reward_std": 0.05448688194155693, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455078125, "step": 1098 }, { "clip_ratio": 0.0, "completion_length": 675.75, "epoch": 0.4396, "grad_norm": 0.9069602448962957, "kl": 5.265625, "learning_rate": 1.3759008207205869e-05, "loss": 0.2107, "reward": 0.48828125, "reward_std": 0.043432608246803284, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 1099 }, { "clip_ratio": 0.0, "completion_length": 844.0, "epoch": 0.44, "grad_norm": 16.648476879197002, "kl": 5.859375, "learning_rate": 1.3746065934159123e-05, "loss": 0.234, "reward": 0.53125, "reward_std": 0.13772599771618843, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 1100 }, { "clip_ratio": 0.0, "completion_length": 561.375, "epoch": 0.4404, "grad_norm": 0.35483338791510194, "kl": 5.046875, "learning_rate": 1.373311635796515e-05, "loss": 0.2022, "reward": 0.611328125, "reward_std": 0.03246183134615421, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1101 }, { "clip_ratio": 0.0, "completion_length": 861.125, "epoch": 0.4408, "grad_norm": 0.6823885167092514, "kl": 5.9375, "learning_rate": 1.3720159503869816e-05, "loss": 0.2375, "reward": 0.474609375, "reward_std": 0.07201205939054489, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 1102 }, { "clip_ratio": 0.0, "completion_length": 771.75, "epoch": 0.4412, "grad_norm": 0.3614877639352549, "kl": 6.1015625, "learning_rate": 1.3707195397133165e-05, "loss": 0.2444, "reward": 0.478515625, "reward_std": 0.06425705552101135, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1103 }, { "clip_ratio": 0.0, "completion_length": 963.5, "epoch": 0.4416, "grad_norm": 3.0575566415929045, "kl": 6.390625, "learning_rate": 1.3694224063029396e-05, "loss": 0.2556, "reward": 0.466796875, "reward_std": 0.08797654882073402, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 1104 }, { "clip_ratio": 0.0, "completion_length": 679.375, "epoch": 0.442, "grad_norm": 0.22528884814902084, "kl": 5.39453125, "learning_rate": 1.3681245526846782e-05, "loss": 0.2158, "reward": 0.611328125, "reward_std": 0.04478531330823898, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1105 }, { "clip_ratio": 0.0, "completion_length": 787.25, "epoch": 0.4424, "grad_norm": 0.48217423574446416, "kl": 5.6796875, "learning_rate": 1.3668259813887644e-05, "loss": 0.227, "reward": 0.552734375, "reward_std": 0.15566487237811089, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 1106 }, { "clip_ratio": 0.0, "completion_length": 525.5, "epoch": 0.4428, "grad_norm": 0.2397635123688895, "kl": 5.609375, "learning_rate": 1.365526694946829e-05, "loss": 0.2244, "reward": 0.494140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 1107 }, { "clip_ratio": 0.0, "completion_length": 556.625, "epoch": 0.4432, "grad_norm": 0.15887966859113428, "kl": 5.6640625, "learning_rate": 1.3642266958918985e-05, "loss": 0.2268, "reward": 0.611328125, "reward_std": 0.039834219962358475, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1108 }, { "clip_ratio": 0.0, "completion_length": 744.625, "epoch": 0.4436, "grad_norm": 1.3784143499793207, "kl": 6.1015625, "learning_rate": 1.3629259867583864e-05, "loss": 0.2441, "reward": 0.4765625, "reward_std": 0.06530078127980232, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 1109 }, { "clip_ratio": 0.0, "completion_length": 593.375, "epoch": 0.444, "grad_norm": 0.2985034954145908, "kl": 5.65625, "learning_rate": 1.3616245700820922e-05, "loss": 0.2262, "reward": 0.720703125, "reward_std": 0.07664071768522263, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1110 }, { "clip_ratio": 0.0, "completion_length": 835.5, "epoch": 0.4444, "grad_norm": 0.31044383170686907, "kl": 6.125, "learning_rate": 1.3603224484001949e-05, "loss": 0.245, "reward": 0.685546875, "reward_std": 0.1278614066541195, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 1111 }, { "clip_ratio": 0.0, "completion_length": 840.125, "epoch": 0.4448, "grad_norm": 0.2010237698301334, "kl": 6.578125, "learning_rate": 1.3590196242512463e-05, "loss": 0.2632, "reward": 0.48046875, "reward_std": 0.05738259106874466, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 1112 }, { "clip_ratio": 0.0, "completion_length": 444.375, "epoch": 0.4452, "grad_norm": 0.43575934350377965, "kl": 5.078125, "learning_rate": 1.3577161001751696e-05, "loss": 0.2035, "reward": 0.53515625, "reward_std": 0.10219087451696396, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 1113 }, { "clip_ratio": 0.0, "completion_length": 738.375, "epoch": 0.4456, "grad_norm": 0.39922762228341907, "kl": 5.5234375, "learning_rate": 1.3564118787132507e-05, "loss": 0.2206, "reward": 0.482421875, "reward_std": 0.08844615519046783, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 1114 }, { "clip_ratio": 0.0, "completion_length": 653.75, "epoch": 0.446, "grad_norm": 0.26859413975301366, "kl": 5.2265625, "learning_rate": 1.3551069624081372e-05, "loss": 0.2088, "reward": 0.47265625, "reward_std": 0.04722111485898495, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 1115 }, { "clip_ratio": 0.0, "completion_length": 664.5, "epoch": 0.4464, "grad_norm": 0.17145658286547935, "kl": 5.4140625, "learning_rate": 1.3538013538038295e-05, "loss": 0.2165, "reward": 0.607421875, "reward_std": 0.043135738000273705, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 1116 }, { "clip_ratio": 0.0, "completion_length": 935.5, "epoch": 0.4468, "grad_norm": 0.49429412978382414, "kl": 5.4765625, "learning_rate": 1.3524950554456786e-05, "loss": 0.2191, "reward": 0.4375, "reward_std": 0.09776041842997074, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 1117 }, { "clip_ratio": 0.0, "completion_length": 944.0, "epoch": 0.4472, "grad_norm": 0.3146881084821215, "kl": 5.6640625, "learning_rate": 1.3511880698803801e-05, "loss": 0.2267, "reward": 0.57421875, "reward_std": 0.09420903585851192, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44921875, "step": 1118 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4476, "grad_norm": 0.33325246399500885, "kl": 5.0078125, "learning_rate": 1.349880399655969e-05, "loss": 0.2001, "reward": 0.552734375, "reward_std": 0.10945891216397285, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.427734375, "step": 1119 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.448, "grad_norm": 0.21159717725069083, "kl": 4.703125, "learning_rate": 1.3485720473218153e-05, "loss": 0.1879, "reward": 0.51953125, "reward_std": 0.11333542503416538, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.39453125, "step": 1120 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4484, "grad_norm": 0.2816127284594134, "kl": 5.296875, "learning_rate": 1.347263015428619e-05, "loss": 0.2116, "reward": 0.53515625, "reward_std": 0.10943535156548023, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41015625, "step": 1121 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4488, "grad_norm": 0.2507506553597322, "kl": 5.1796875, "learning_rate": 1.3459533065284049e-05, "loss": 0.207, "reward": 0.419921875, "reward_std": 0.11887414008378983, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.419921875, "step": 1122 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4492, "grad_norm": 0.1788483342577228, "kl": 5.875, "learning_rate": 1.344642923174517e-05, "loss": 0.235, "reward": 0.44140625, "reward_std": 0.09842864237725735, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 1123 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4496, "grad_norm": 0.13108405034145626, "kl": 5.8046875, "learning_rate": 1.3433318679216154e-05, "loss": 0.2326, "reward": 0.43359375, "reward_std": 0.10427170433104038, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 1124 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.45, "grad_norm": 0.2880873151933463, "kl": 6.625, "learning_rate": 1.342020143325669e-05, "loss": 0.265, "reward": 0.453125, "reward_std": 0.09134218841791153, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.453125, "step": 1125 }, { "clip_ratio": 0.0, "completion_length": 914.0, "epoch": 0.4504, "grad_norm": 0.14159455338041801, "kl": 5.875, "learning_rate": 1.340707751943952e-05, "loss": 0.2349, "reward": 0.56640625, "reward_std": 0.09385833516716957, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 1126 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4508, "grad_norm": 0.3961842722291843, "kl": 5.3984375, "learning_rate": 1.3393946963350381e-05, "loss": 0.2159, "reward": 0.431640625, "reward_std": 0.10515587776899338, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.431640625, "step": 1127 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4512, "grad_norm": 0.2531502039897197, "kl": 5.8828125, "learning_rate": 1.3380809790587975e-05, "loss": 0.2354, "reward": 0.4375, "reward_std": 0.10500071384012699, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 1128 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4516, "grad_norm": 0.18848976980663434, "kl": 6.140625, "learning_rate": 1.3367666026763884e-05, "loss": 0.2456, "reward": 0.52734375, "reward_std": 0.16850409656763077, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 1129 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.452, "grad_norm": 0.2516854818311333, "kl": 6.640625, "learning_rate": 1.3354515697502552e-05, "loss": 0.2658, "reward": 0.45703125, "reward_std": 0.09517527930438519, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 1130 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4524, "grad_norm": 0.18271522149962993, "kl": 6.34375, "learning_rate": 1.3341358828441217e-05, "loss": 0.2536, "reward": 0.611328125, "reward_std": 0.14994782768189907, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.447265625, "step": 1131 }, { "clip_ratio": 0.0, "completion_length": 911.125, "epoch": 0.4528, "grad_norm": 0.470171048613331, "kl": 6.625, "learning_rate": 1.3328195445229869e-05, "loss": 0.265, "reward": 0.4609375, "reward_std": 0.09358605183660984, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 1132 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4532, "grad_norm": 0.2486141424673023, "kl": 5.74609375, "learning_rate": 1.3315025573531198e-05, "loss": 0.23, "reward": 0.5625, "reward_std": 0.09913608245551586, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 1133 }, { "clip_ratio": 0.0, "completion_length": 795.25, "epoch": 0.4536, "grad_norm": 0.24981409811835337, "kl": 5.8515625, "learning_rate": 1.3301849239020537e-05, "loss": 0.2335, "reward": 0.44921875, "reward_std": 0.08098086155951023, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44921875, "step": 1134 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.454, "grad_norm": 0.3246463226493314, "kl": 5.890625, "learning_rate": 1.3288666467385834e-05, "loss": 0.2357, "reward": 0.5703125, "reward_std": 0.10331955552101135, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4453125, "step": 1135 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4544, "grad_norm": 0.185370052754622, "kl": 6.5234375, "learning_rate": 1.327547728432757e-05, "loss": 0.2608, "reward": 0.587890625, "reward_std": 0.08874007128179073, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462890625, "step": 1136 }, { "clip_ratio": 0.0, "completion_length": 806.125, "epoch": 0.4548, "grad_norm": 0.21741130501713243, "kl": 5.609375, "learning_rate": 1.3262281715558736e-05, "loss": 0.2247, "reward": 0.564453125, "reward_std": 0.08986436761915684, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.439453125, "step": 1137 }, { "clip_ratio": 0.0, "completion_length": 690.375, "epoch": 0.4552, "grad_norm": 0.21675181592768616, "kl": 5.6171875, "learning_rate": 1.3249079786804765e-05, "loss": 0.2246, "reward": 0.6328125, "reward_std": 0.12357140891253948, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 1138 }, { "clip_ratio": 0.0, "completion_length": 687.875, "epoch": 0.4556, "grad_norm": 0.2058882842643726, "kl": 6.09375, "learning_rate": 1.3235871523803496e-05, "loss": 0.2437, "reward": 0.478515625, "reward_std": 0.057271381840109825, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1139 }, { "clip_ratio": 0.0, "completion_length": 501.875, "epoch": 0.456, "grad_norm": 0.3135882150717308, "kl": 5.6796875, "learning_rate": 1.3222656952305113e-05, "loss": 0.2272, "reward": 0.5390625, "reward_std": 0.10249541327357292, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1140 }, { "clip_ratio": 0.0, "completion_length": 914.375, "epoch": 0.4564, "grad_norm": 0.2270053601726656, "kl": 6.953125, "learning_rate": 1.3209436098072095e-05, "loss": 0.2778, "reward": 0.703125, "reward_std": 0.11696234904229641, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 1141 }, { "clip_ratio": 0.0, "completion_length": 592.5, "epoch": 0.4568, "grad_norm": 0.3814175121692207, "kl": 6.015625, "learning_rate": 1.319620898687918e-05, "loss": 0.2408, "reward": 0.517578125, "reward_std": 0.09974897652864456, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1142 }, { "clip_ratio": 0.0, "completion_length": 801.75, "epoch": 0.4572, "grad_norm": 0.18190494090327092, "kl": 5.8828125, "learning_rate": 1.3182975644513296e-05, "loss": 0.2352, "reward": 0.587890625, "reward_std": 0.1106303483247757, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470703125, "step": 1143 }, { "clip_ratio": 0.0, "completion_length": 922.5, "epoch": 0.4576, "grad_norm": 0.11653997118053057, "kl": 6.3828125, "learning_rate": 1.316973609677352e-05, "loss": 0.2554, "reward": 0.712890625, "reward_std": 0.0794061403721571, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462890625, "step": 1144 }, { "clip_ratio": 0.0, "completion_length": 581.375, "epoch": 0.458, "grad_norm": 0.0753838328618614, "kl": 6.2734375, "learning_rate": 1.3156490369471026e-05, "loss": 0.251, "reward": 0.61328125, "reward_std": 0.03697281330823898, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 1145 }, { "clip_ratio": 0.0, "completion_length": 705.0, "epoch": 0.4584, "grad_norm": 0.14245843046948883, "kl": 5.765625, "learning_rate": 1.3143238488429042e-05, "loss": 0.2303, "reward": 0.689453125, "reward_std": 0.11083377152681351, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1146 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4588, "grad_norm": 0.18109380007146803, "kl": 6.34375, "learning_rate": 1.3129980479482783e-05, "loss": 0.2539, "reward": 0.455078125, "reward_std": 0.09475662559270859, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455078125, "step": 1147 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4592, "grad_norm": 0.11574738806266205, "kl": 6.4296875, "learning_rate": 1.3116716368479418e-05, "loss": 0.2574, "reward": 0.5859375, "reward_std": 0.10793562792241573, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 1148 }, { "clip_ratio": 0.0, "completion_length": 916.125, "epoch": 0.4596, "grad_norm": 0.1662282578723417, "kl": 6.25, "learning_rate": 1.3103446181278015e-05, "loss": 0.2501, "reward": 0.470703125, "reward_std": 0.07664071768522263, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470703125, "step": 1149 }, { "clip_ratio": 0.0, "completion_length": 720.25, "epoch": 0.46, "grad_norm": 0.2275830666110175, "kl": 5.15625, "learning_rate": 1.3090169943749475e-05, "loss": 0.2063, "reward": 0.44140625, "reward_std": 0.07542478665709496, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 1150 }, { "clip_ratio": 0.0, "completion_length": 715.875, "epoch": 0.4604, "grad_norm": 0.3234736238907084, "kl": 5.453125, "learning_rate": 1.3076887681776509e-05, "loss": 0.2186, "reward": 0.693359375, "reward_std": 0.14828730188310146, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.451171875, "step": 1151 }, { "clip_ratio": 0.0, "completion_length": 806.75, "epoch": 0.4608, "grad_norm": 0.11476337227279113, "kl": 6.0078125, "learning_rate": 1.306359942125356e-05, "loss": 0.2402, "reward": 0.65234375, "reward_std": 0.13415084220468998, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 1152 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4612, "grad_norm": 0.1342524382235585, "kl": 5.8046875, "learning_rate": 1.3050305188086778e-05, "loss": 0.2323, "reward": 0.6875, "reward_std": 0.10398751869797707, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 1153 }, { "clip_ratio": 0.0, "completion_length": 842.5, "epoch": 0.4616, "grad_norm": 0.31875145322678583, "kl": 5.09765625, "learning_rate": 1.3037005008193944e-05, "loss": 0.2037, "reward": 0.4296875, "reward_std": 0.08109292574226856, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4296875, "step": 1154 }, { "clip_ratio": 0.0, "completion_length": 953.625, "epoch": 0.462, "grad_norm": 0.2557009517193242, "kl": 6.046875, "learning_rate": 1.3023698907504447e-05, "loss": 0.2422, "reward": 0.580078125, "reward_std": 0.08699656091630459, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455078125, "step": 1155 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4624, "grad_norm": 0.14289383515193088, "kl": 6.2421875, "learning_rate": 1.3010386911959207e-05, "loss": 0.2496, "reward": 0.583984375, "reward_std": 0.0955631360411644, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.458984375, "step": 1156 }, { "clip_ratio": 0.0, "completion_length": 751.625, "epoch": 0.4628, "grad_norm": 0.1435167751364602, "kl": 5.4453125, "learning_rate": 1.299706904751064e-05, "loss": 0.2178, "reward": 0.47265625, "reward_std": 0.0580955371260643, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 1157 }, { "clip_ratio": 0.0, "completion_length": 857.25, "epoch": 0.4632, "grad_norm": 0.11285783216704162, "kl": 5.9296875, "learning_rate": 1.2983745340122604e-05, "loss": 0.2374, "reward": 0.54296875, "reward_std": 0.11600105091929436, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 1158 }, { "clip_ratio": 0.0, "completion_length": 836.25, "epoch": 0.4636, "grad_norm": 0.09290285567457036, "kl": 6.1015625, "learning_rate": 1.297041581577035e-05, "loss": 0.244, "reward": 0.46875, "reward_std": 0.06783140823245049, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 1159 }, { "clip_ratio": 0.0, "completion_length": 662.75, "epoch": 0.464, "grad_norm": 0.10132294177510011, "kl": 5.4765625, "learning_rate": 1.2957080500440469e-05, "loss": 0.2186, "reward": 0.537109375, "reward_std": 0.11330920085310936, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 1160 }, { "clip_ratio": 0.0, "completion_length": 857.25, "epoch": 0.4644, "grad_norm": 0.09339877782843198, "kl": 6.203125, "learning_rate": 1.2943739420130837e-05, "loss": 0.2484, "reward": 0.576171875, "reward_std": 0.129456777125597, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 1161 }, { "clip_ratio": 0.0, "completion_length": 775.125, "epoch": 0.4648, "grad_norm": 0.12175530560689751, "kl": 5.96875, "learning_rate": 1.2930392600850574e-05, "loss": 0.2388, "reward": 0.705078125, "reward_std": 0.10576925426721573, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1162 }, { "clip_ratio": 0.0, "completion_length": 932.125, "epoch": 0.4652, "grad_norm": 0.11490305847494106, "kl": 6.1171875, "learning_rate": 1.291704006861999e-05, "loss": 0.2452, "reward": 0.8203125, "reward_std": 0.12220924720168114, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 1163 }, { "clip_ratio": 0.0, "completion_length": 852.75, "epoch": 0.4656, "grad_norm": 0.23394465497915146, "kl": 6.0546875, "learning_rate": 1.2903681849470528e-05, "loss": 0.2422, "reward": 0.509765625, "reward_std": 0.130447655916214, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1164 }, { "clip_ratio": 0.0, "completion_length": 770.375, "epoch": 0.466, "grad_norm": 0.163162929257072, "kl": 5.78125, "learning_rate": 1.2890317969444716e-05, "loss": 0.2312, "reward": 0.587890625, "reward_std": 0.09462824277579784, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1165 }, { "clip_ratio": 0.0, "completion_length": 708.875, "epoch": 0.4664, "grad_norm": 0.14085061881564917, "kl": 5.40625, "learning_rate": 1.287694845459613e-05, "loss": 0.2164, "reward": 0.478515625, "reward_std": 0.046043483540415764, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1166 }, { "clip_ratio": 0.0, "completion_length": 822.375, "epoch": 0.4668, "grad_norm": 0.09346303467144912, "kl": 6.3984375, "learning_rate": 1.2863573330989315e-05, "loss": 0.2556, "reward": 0.5859375, "reward_std": 0.11179256625473499, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 1167 }, { "clip_ratio": 0.0, "completion_length": 525.75, "epoch": 0.4672, "grad_norm": 0.07764764198959942, "kl": 5.1484375, "learning_rate": 1.2850192624699762e-05, "loss": 0.2056, "reward": 0.490234375, "reward_std": 0.028222277760505676, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 1168 }, { "clip_ratio": 0.0, "completion_length": 968.0, "epoch": 0.4676, "grad_norm": 0.16655304683086275, "kl": 6.0234375, "learning_rate": 1.2836806361813846e-05, "loss": 0.2408, "reward": 0.58203125, "reward_std": 0.08835325017571449, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 1169 }, { "clip_ratio": 0.0, "completion_length": 938.25, "epoch": 0.468, "grad_norm": 0.12079855208469899, "kl": 6.1015625, "learning_rate": 1.2823414568428767e-05, "loss": 0.2441, "reward": 0.46875, "reward_std": 0.07960453070700169, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 1170 }, { "clip_ratio": 0.0, "completion_length": 733.875, "epoch": 0.4684, "grad_norm": 0.11651476698288019, "kl": 6.0625, "learning_rate": 1.2810017270652513e-05, "loss": 0.2424, "reward": 0.611328125, "reward_std": 0.039834219962358475, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1171 }, { "clip_ratio": 0.0, "completion_length": 835.5, "epoch": 0.4688, "grad_norm": 0.18699983624443053, "kl": 5.3046875, "learning_rate": 1.27966144946038e-05, "loss": 0.2125, "reward": 0.5703125, "reward_std": 0.09577989019453526, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.453125, "step": 1172 }, { "clip_ratio": 0.0, "completion_length": 772.125, "epoch": 0.4692, "grad_norm": 0.10307354158510151, "kl": 6.2578125, "learning_rate": 1.278320626641203e-05, "loss": 0.2503, "reward": 0.603515625, "reward_std": 0.058760738000273705, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1173 }, { "clip_ratio": 0.0, "completion_length": 849.625, "epoch": 0.4696, "grad_norm": 0.2846833440983316, "kl": 5.53125, "learning_rate": 1.2769792612217224e-05, "loss": 0.2211, "reward": 0.453125, "reward_std": 0.09662418067455292, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.453125, "step": 1174 }, { "clip_ratio": 0.0, "completion_length": 924.375, "epoch": 0.47, "grad_norm": 0.13165639706571658, "kl": 6.15625, "learning_rate": 1.2756373558169992e-05, "loss": 0.246, "reward": 0.462890625, "reward_std": 0.0794061403721571, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462890625, "step": 1175 }, { "clip_ratio": 0.0, "completion_length": 932.625, "epoch": 0.4704, "grad_norm": 0.21643028018269986, "kl": 5.9453125, "learning_rate": 1.2742949130431468e-05, "loss": 0.238, "reward": 0.498046875, "reward_std": 0.1472237128764391, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.458984375, "step": 1176 }, { "clip_ratio": 0.0, "completion_length": 920.875, "epoch": 0.4708, "grad_norm": 0.23209032869856905, "kl": 6.2890625, "learning_rate": 1.2729519355173254e-05, "loss": 0.2518, "reward": 0.517578125, "reward_std": 0.15630261227488518, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.462890625, "step": 1177 }, { "clip_ratio": 0.0, "completion_length": 606.25, "epoch": 0.4712, "grad_norm": 0.13763527644703574, "kl": 5.7109375, "learning_rate": 1.2716084258577388e-05, "loss": 0.2283, "reward": 0.611328125, "reward_std": 0.039834219962358475, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1178 }, { "clip_ratio": 0.0, "completion_length": 830.375, "epoch": 0.4716, "grad_norm": 0.377677452702842, "kl": 6.2421875, "learning_rate": 1.270264386683628e-05, "loss": 0.2496, "reward": 0.599609375, "reward_std": 0.07289028540253639, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 1179 }, { "clip_ratio": 0.0, "completion_length": 962.375, "epoch": 0.472, "grad_norm": 2.4718078400754053, "kl": 6.8359375, "learning_rate": 1.2689198206152657e-05, "loss": 0.2736, "reward": 0.466796875, "reward_std": 0.09182258322834969, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 1180 }, { "clip_ratio": 0.0, "completion_length": 939.25, "epoch": 0.4724, "grad_norm": 1.592022593199364, "kl": 6.1484375, "learning_rate": 1.2675747302739528e-05, "loss": 0.2458, "reward": 0.466796875, "reward_std": 0.08687148988246918, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 1181 }, { "clip_ratio": 0.0, "completion_length": 676.875, "epoch": 0.4728, "grad_norm": 2.546381336809866, "kl": 5.5078125, "learning_rate": 1.2662291182820115e-05, "loss": 0.221, "reward": 0.466796875, "reward_std": 0.05024883709847927, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 1182 }, { "clip_ratio": 0.0, "completion_length": 847.25, "epoch": 0.4732, "grad_norm": 9.616807473825416, "kl": 6.1328125, "learning_rate": 1.2648829872627809e-05, "loss": 0.2454, "reward": 0.591796875, "reward_std": 0.08064967580139637, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 1183 }, { "clip_ratio": 0.0, "completion_length": 615.125, "epoch": 0.4736, "grad_norm": 7.3921126987569075, "kl": 6.578125, "learning_rate": 1.263536339840613e-05, "loss": 0.263, "reward": 0.490234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 1184 }, { "clip_ratio": 0.0, "completion_length": 698.375, "epoch": 0.474, "grad_norm": 9.846812862489182, "kl": 6.1171875, "learning_rate": 1.2621891786408648e-05, "loss": 0.245, "reward": 0.59765625, "reward_std": 0.05776817165315151, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 1185 }, { "clip_ratio": 0.0, "completion_length": 962.25, "epoch": 0.4744, "grad_norm": 22.479292294166747, "kl": 14.546875, "learning_rate": 1.2608415062898971e-05, "loss": 0.581, "reward": 0.44140625, "reward_std": 0.11537322774529457, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 1186 }, { "clip_ratio": 0.0, "completion_length": 845.75, "epoch": 0.4748, "grad_norm": 3.350070426993043, "kl": 8.203125, "learning_rate": 1.2594933254150654e-05, "loss": 0.3278, "reward": 0.52734375, "reward_std": 0.13128934055566788, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 1187 }, { "clip_ratio": 0.0, "completion_length": 924.75, "epoch": 0.4752, "grad_norm": 4.133297268807293, "kl": 6.5703125, "learning_rate": 1.2581446386447178e-05, "loss": 0.2628, "reward": 0.46484375, "reward_std": 0.06822281330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 1188 }, { "clip_ratio": 0.0, "completion_length": 940.25, "epoch": 0.4756, "grad_norm": 2.27576850711593, "kl": 7.59375, "learning_rate": 1.256795448608188e-05, "loss": 0.3038, "reward": 0.697265625, "reward_std": 0.10656357742846012, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.447265625, "step": 1189 }, { "clip_ratio": 0.0, "completion_length": 678.375, "epoch": 0.476, "grad_norm": 2.1036455667685496, "kl": 7.3828125, "learning_rate": 1.2554457579357906e-05, "loss": 0.2953, "reward": 0.484375, "reward_std": 0.05605955049395561, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1190 }, { "clip_ratio": 0.0, "completion_length": 753.7578125, "epoch": 0.4764, "grad_norm": 13.841738779407773, "kl": 9.53125, "learning_rate": 1.2540955692588173e-05, "loss": 0.3791, "reward": 0.591796875, "reward_std": 0.07239186391234398, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 1191 }, { "clip_ratio": 0.0, "completion_length": 843.375, "epoch": 0.4768, "grad_norm": 10.497573924627872, "kl": 9.1796875, "learning_rate": 1.2527448852095295e-05, "loss": 0.3669, "reward": 0.5, "reward_std": 0.13951832801103592, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 1192 }, { "clip_ratio": 0.0, "completion_length": 710.828125, "epoch": 0.4772, "grad_norm": 2.9100650965115546, "kl": 7.0546875, "learning_rate": 1.251393708421155e-05, "loss": 0.2664, "reward": 0.60546875, "reward_std": 0.06233368441462517, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 1193 }, { "clip_ratio": 0.0, "completion_length": 785.0, "epoch": 0.4776, "grad_norm": 181.3774691357735, "kl": 55.359375, "learning_rate": 1.2500420415278822e-05, "loss": 2.2132, "reward": 0.61328125, "reward_std": 0.22122382000088692, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 1194 }, { "clip_ratio": 0.0, "completion_length": 670.875, "epoch": 0.478, "grad_norm": 19.849587960234555, "kl": 12.875, "learning_rate": 1.2486898871648552e-05, "loss": 0.5151, "reward": 0.4765625, "reward_std": 0.06271020323038101, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 1195 }, { "clip_ratio": 0.0, "completion_length": 730.375, "epoch": 0.4784, "grad_norm": 3.583756586654068, "kl": 7.0703125, "learning_rate": 1.2473372479681671e-05, "loss": 0.2834, "reward": 0.73046875, "reward_std": 0.20204294845461845, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 1196 }, { "clip_ratio": 0.0, "completion_length": 722.0, "epoch": 0.4788, "grad_norm": 5300.004311375607, "kl": 216.296875, "learning_rate": 1.2459841265748582e-05, "loss": 8.6578, "reward": 0.4765625, "reward_std": 0.051659777760505676, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 1197 }, { "clip_ratio": 0.0, "completion_length": 851.0, "epoch": 0.4792, "grad_norm": 3.8119403040423476, "kl": 8.03125, "learning_rate": 1.2446305256229074e-05, "loss": 0.3213, "reward": 0.583984375, "reward_std": 0.08407642133533955, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.458984375, "step": 1198 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4796, "grad_norm": 13.353694213347275, "kl": 10.03125, "learning_rate": 1.2432764477512294e-05, "loss": 0.4019, "reward": 0.599609375, "reward_std": 0.08868160098791122, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 1199 }, { "clip_ratio": 0.0, "completion_length": 670.75, "epoch": 0.48, "grad_norm": 3.5889780072760633, "kl": 7.40625, "learning_rate": 1.2419218955996677e-05, "loss": 0.2964, "reward": 0.603515625, "reward_std": 0.05138225294649601, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1200 }, { "clip_ratio": 0.0, "completion_length": 731.375, "epoch": 0.4804, "grad_norm": 2.2215912349919287, "kl": 7.5859375, "learning_rate": 1.2405668718089918e-05, "loss": 0.3033, "reward": 0.61328125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 1201 }, { "clip_ratio": 0.0, "completion_length": 710.25, "epoch": 0.4808, "grad_norm": 334.91436872557034, "kl": 51.1171875, "learning_rate": 1.2392113790208895e-05, "loss": 2.0449, "reward": 0.58203125, "reward_std": 0.09465491026639938, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 1202 }, { "clip_ratio": 0.0, "completion_length": 753.625, "epoch": 0.4812, "grad_norm": 34.02843853488386, "kl": 15.703125, "learning_rate": 1.2378554198779632e-05, "loss": 0.6132, "reward": 0.478515625, "reward_std": 0.061834799125790596, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1203 }, { "clip_ratio": 0.0, "completion_length": 823.0, "epoch": 0.4816, "grad_norm": 4.811432918379856, "kl": 7.59375, "learning_rate": 1.236498997023725e-05, "loss": 0.3036, "reward": 0.6015625, "reward_std": 0.08730955049395561, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1204 }, { "clip_ratio": 0.0, "completion_length": 543.0, "epoch": 0.482, "grad_norm": 2.768865460942379, "kl": 6.828125, "learning_rate": 1.23514211310259e-05, "loss": 0.2731, "reward": 0.615234375, "reward_std": 0.03262205049395561, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 1205 }, { "clip_ratio": 0.0, "completion_length": 742.75, "epoch": 0.4824, "grad_norm": 330.97023951025835, "kl": 66.0, "learning_rate": 1.2337847707598738e-05, "loss": 2.6403, "reward": 0.5703125, "reward_std": 0.13023405522108078, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 1206 }, { "clip_ratio": 0.0, "completion_length": 700.125, "epoch": 0.4828, "grad_norm": 33.40980257812405, "kl": 15.75, "learning_rate": 1.2324269726417841e-05, "loss": 0.6302, "reward": 0.720703125, "reward_std": 0.08793565817177296, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1207 }, { "clip_ratio": 0.0, "completion_length": 810.0, "epoch": 0.4832, "grad_norm": 5.1294646937611255, "kl": 8.203125, "learning_rate": 1.2310687213954182e-05, "loss": 0.3279, "reward": 0.478515625, "reward_std": 0.06777860224246979, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1208 }, { "clip_ratio": 0.0, "completion_length": 791.625, "epoch": 0.4836, "grad_norm": 4.194167475745828, "kl": 8.765625, "learning_rate": 1.2297100196687557e-05, "loss": 0.3512, "reward": 0.611328125, "reward_std": 0.04478531330823898, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1209 }, { "clip_ratio": 0.0, "completion_length": 483.375, "epoch": 0.484, "grad_norm": 10.355592850160635, "kl": 9.734375, "learning_rate": 1.2283508701106559e-05, "loss": 0.3894, "reward": 0.494140625, "reward_std": 0.01848640665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 1210 }, { "clip_ratio": 0.0, "completion_length": 670.25, "epoch": 0.4844, "grad_norm": 1.9758134642167187, "kl": 8.4375, "learning_rate": 1.2269912753708502e-05, "loss": 0.3372, "reward": 0.478515625, "reward_std": 0.061834799125790596, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1211 }, { "clip_ratio": 0.0, "completion_length": 714.25, "epoch": 0.4848, "grad_norm": 11.196891529798329, "kl": 12.6953125, "learning_rate": 1.2256312380999376e-05, "loss": 0.5085, "reward": 0.4765625, "reward_std": 0.052589621394872665, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 1212 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4852, "grad_norm": 7.6274948114088374, "kl": 9.828125, "learning_rate": 1.2242707609493814e-05, "loss": 0.3935, "reward": 0.779296875, "reward_std": 0.16923228278756142, "rewards/accuracy_reward": 0.3203125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.458984375, "step": 1213 }, { "clip_ratio": 0.0, "completion_length": 881.875, "epoch": 0.4856, "grad_norm": 3.0978577784628762, "kl": 11.203125, "learning_rate": 1.2229098465715005e-05, "loss": 0.4488, "reward": 0.59375, "reward_std": 0.0890091098845005, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 1214 }, { "clip_ratio": 0.0, "completion_length": 457.125, "epoch": 0.486, "grad_norm": 1.6572781702807218, "kl": 6.140625, "learning_rate": 1.2215484976194675e-05, "loss": 0.2455, "reward": 0.494140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 1215 }, { "clip_ratio": 0.0, "completion_length": 770.0, "epoch": 0.4864, "grad_norm": 27.476913533092997, "kl": 17.296875, "learning_rate": 1.2201867167473015e-05, "loss": 0.6922, "reward": 0.474609375, "reward_std": 0.08191424608230591, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 1216 }, { "clip_ratio": 0.0, "completion_length": 847.875, "epoch": 0.4868, "grad_norm": 17.279406421253487, "kl": 15.640625, "learning_rate": 1.2188245066098647e-05, "loss": 0.6257, "reward": 0.591796875, "reward_std": 0.07933073490858078, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466796875, "step": 1217 }, { "clip_ratio": 0.0, "completion_length": 769.625, "epoch": 0.4872, "grad_norm": 8.703432342755828, "kl": 7.5078125, "learning_rate": 1.217461869862855e-05, "loss": 0.3006, "reward": 0.609375, "reward_std": 0.05605955049395561, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1218 }, { "clip_ratio": 0.0, "completion_length": 663.25, "epoch": 0.4876, "grad_norm": 7.217258648856391, "kl": 6.5859375, "learning_rate": 1.2160988091628023e-05, "loss": 0.2633, "reward": 0.609375, "reward_std": 0.05259781330823898, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1219 }, { "clip_ratio": 0.0, "completion_length": 850.125, "epoch": 0.488, "grad_norm": 5.9351119515055535, "kl": 8.1875, "learning_rate": 1.2147353271670634e-05, "loss": 0.3272, "reward": 0.482421875, "reward_std": 0.06536140665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 1220 }, { "clip_ratio": 0.0, "completion_length": 779.625, "epoch": 0.4884, "grad_norm": 23.338056001332937, "kl": 16.125, "learning_rate": 1.2133714265338162e-05, "loss": 0.6456, "reward": 0.603515625, "reward_std": 0.06464377045631409, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1221 }, { "clip_ratio": 0.0, "completion_length": 771.875, "epoch": 0.4888, "grad_norm": 26.45058194533374, "kl": 17.234375, "learning_rate": 1.212007109922055e-05, "loss": 0.6899, "reward": 0.603515625, "reward_std": 0.06370573490858078, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1222 }, { "clip_ratio": 0.0, "completion_length": 842.0, "epoch": 0.4892, "grad_norm": 5.0692826370980075, "kl": 11.265625, "learning_rate": 1.2106423799915841e-05, "loss": 0.4514, "reward": 0.474609375, "reward_std": 0.08131499215960503, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 1223 }, { "clip_ratio": 0.0, "completion_length": 652.25, "epoch": 0.4896, "grad_norm": 4.2113511051774335, "kl": 6.8671875, "learning_rate": 1.2092772394030153e-05, "loss": 0.2743, "reward": 0.48828125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 1224 }, { "clip_ratio": 0.0, "completion_length": 687.0, "epoch": 0.49, "grad_norm": 4.929244597933145, "kl": 6.734375, "learning_rate": 1.2079116908177592e-05, "loss": 0.2695, "reward": 0.486328125, "reward_std": 0.08906660601496696, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1225 }, { "clip_ratio": 0.0, "completion_length": 823.125, "epoch": 0.4904, "grad_norm": 3.6014991578199336, "kl": 8.34375, "learning_rate": 1.2065457368980236e-05, "loss": 0.3343, "reward": 0.478515625, "reward_std": 0.0695948638021946, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1226 }, { "clip_ratio": 0.0, "completion_length": 763.375, "epoch": 0.4908, "grad_norm": 14.691713699521673, "kl": 13.0703125, "learning_rate": 1.2051793803068046e-05, "loss": 0.5238, "reward": 0.484375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1227 }, { "clip_ratio": 0.0, "completion_length": 478.875, "epoch": 0.4912, "grad_norm": 7.664455934049021, "kl": 8.875, "learning_rate": 1.203812623707885e-05, "loss": 0.3556, "reward": 0.49609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 1228 }, { "clip_ratio": 0.0, "completion_length": 860.75, "epoch": 0.4916, "grad_norm": 22.106733791772832, "kl": 16.3125, "learning_rate": 1.202445469765826e-05, "loss": 0.6521, "reward": 0.6015625, "reward_std": 0.08086910098791122, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 1229 }, { "clip_ratio": 0.0, "completion_length": 623.875, "epoch": 0.492, "grad_norm": 1.6422876220626959, "kl": 7.171875, "learning_rate": 1.2010779211459649e-05, "loss": 0.2867, "reward": 0.5, "reward_std": 0.0625, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 1230 }, { "clip_ratio": 0.0, "completion_length": 551.0, "epoch": 0.4924, "grad_norm": 1.2490271239727304, "kl": 6.9296875, "learning_rate": 1.1997099805144071e-05, "loss": 0.2775, "reward": 0.490234375, "reward_std": 0.027670957148075104, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 1231 }, { "clip_ratio": 0.0, "completion_length": 534.375, "epoch": 0.4928, "grad_norm": 1.3765888945473945, "kl": 6.375, "learning_rate": 1.1983416505380234e-05, "loss": 0.2553, "reward": 0.494140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 1232 }, { "clip_ratio": 0.0, "completion_length": 695.5, "epoch": 0.4932, "grad_norm": 9.111790243469333, "kl": 10.875, "learning_rate": 1.1969729338844429e-05, "loss": 0.4355, "reward": 0.484375, "reward_std": 0.051659777760505676, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1233 }, { "clip_ratio": 0.0, "completion_length": 650.625, "epoch": 0.4936, "grad_norm": 6.985168100041085, "kl": 10.1640625, "learning_rate": 1.1956038332220484e-05, "loss": 0.4056, "reward": 0.609375, "reward_std": 0.05754890665411949, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1234 }, { "clip_ratio": 0.0, "completion_length": 480.625, "epoch": 0.494, "grad_norm": 47.60102160699634, "kl": 17.046875, "learning_rate": 1.194234351219972e-05, "loss": 0.6817, "reward": 0.4921875, "reward_std": 0.013975424692034721, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 1235 }, { "clip_ratio": 0.0, "completion_length": 598.625, "epoch": 0.4944, "grad_norm": 14.524658262994903, "kl": 10.3359375, "learning_rate": 1.192864490548089e-05, "loss": 0.4129, "reward": 0.484375, "reward_std": 0.046157363802194595, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1236 }, { "clip_ratio": 0.0, "completion_length": 405.375, "epoch": 0.4948, "grad_norm": 2.405633752010123, "kl": 5.7265625, "learning_rate": 1.191494253877013e-05, "loss": 0.2286, "reward": 0.49609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 1237 }, { "clip_ratio": 0.0, "completion_length": 489.25, "epoch": 0.4952, "grad_norm": 1.4947071341635678, "kl": 5.4609375, "learning_rate": 1.1901236438780902e-05, "loss": 0.2185, "reward": 0.619140625, "reward_std": 0.01848640665411949, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 1238 }, { "clip_ratio": 0.0, "completion_length": 690.75, "epoch": 0.4956, "grad_norm": 3.420528304016391, "kl": 6.6171875, "learning_rate": 1.1887526632233954e-05, "loss": 0.2645, "reward": 0.611328125, "reward_std": 0.04824705049395561, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1239 }, { "clip_ratio": 0.0, "completion_length": 541.125, "epoch": 0.496, "grad_norm": 1.8481098730010581, "kl": 7.3671875, "learning_rate": 1.187381314585725e-05, "loss": 0.2944, "reward": 0.478515625, "reward_std": 0.057816606014966965, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1240 }, { "clip_ratio": 0.0, "completion_length": 504.625, "epoch": 0.4964, "grad_norm": 0.9697414834463914, "kl": 6.578125, "learning_rate": 1.186009600638593e-05, "loss": 0.2631, "reward": 0.49609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 1241 }, { "clip_ratio": 0.0, "completion_length": 674.0, "epoch": 0.4968, "grad_norm": 2.7352077279556215, "kl": 5.5234375, "learning_rate": 1.184637524056227e-05, "loss": 0.2209, "reward": 0.619140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 1242 }, { "clip_ratio": 0.0, "completion_length": 707.5, "epoch": 0.4972, "grad_norm": 1.8764531056059937, "kl": 6.6796875, "learning_rate": 1.1832650875135599e-05, "loss": 0.267, "reward": 0.490234375, "reward_std": 0.03411140665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 1243 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4976, "grad_norm": 55.233824072436555, "kl": 14.3203125, "learning_rate": 1.181892293686227e-05, "loss": 0.5734, "reward": 0.517578125, "reward_std": 0.13362469151616096, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392578125, "step": 1244 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.498, "grad_norm": 7.060537164816692, "kl": 4.3984375, "learning_rate": 1.1805191452505602e-05, "loss": 0.1759, "reward": 0.31640625, "reward_std": 0.11562062799930573, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.31640625, "step": 1245 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4984, "grad_norm": 1.5187419985069404, "kl": 1.88671875, "learning_rate": 1.1791456448835825e-05, "loss": 0.0755, "reward": 0.408203125, "reward_std": 0.09809093736112118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.283203125, "step": 1246 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4988, "grad_norm": 0.9174926123905239, "kl": 1.611328125, "learning_rate": 1.1777717952630033e-05, "loss": 0.0645, "reward": 0.291015625, "reward_std": 0.09107859246432781, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.291015625, "step": 1247 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4992, "grad_norm": 0.8375678666395254, "kl": 1.345703125, "learning_rate": 1.1763975990672125e-05, "loss": 0.0538, "reward": 0.412109375, "reward_std": 0.0931459404528141, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.287109375, "step": 1248 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.4996, "grad_norm": 2.225632592638219, "kl": 3.06640625, "learning_rate": 1.1750230589752763e-05, "loss": 0.1224, "reward": 0.326171875, "reward_std": 0.12041721120476723, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.326171875, "step": 1249 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5, "grad_norm": 1.3326109528436, "kl": 2.447265625, "learning_rate": 1.1736481776669307e-05, "loss": 0.0979, "reward": 0.333984375, "reward_std": 0.11790733598172665, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.333984375, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5004, "grad_norm": 1.9232989116969823, "kl": 2.8046875, "learning_rate": 1.1722729578225769e-05, "loss": 0.1122, "reward": 0.349609375, "reward_std": 0.11628344468772411, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.349609375, "step": 1251 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5008, "grad_norm": 4.671357683607638, "kl": 2.044921875, "learning_rate": 1.1708974021232768e-05, "loss": 0.0818, "reward": 0.455078125, "reward_std": 0.11360900849103928, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.330078125, "step": 1252 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5012, "grad_norm": 1.4107839546724459, "kl": 2.8828125, "learning_rate": 1.1695215132507465e-05, "loss": 0.1153, "reward": 0.37109375, "reward_std": 0.13627665117383003, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.37109375, "step": 1253 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5016, "grad_norm": 0.9482449011181819, "kl": 2.3828125, "learning_rate": 1.1681452938873516e-05, "loss": 0.0953, "reward": 0.34375, "reward_std": 0.1255780104547739, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.34375, "step": 1254 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.502, "grad_norm": 0.9187862959456657, "kl": 4.34765625, "learning_rate": 1.1667687467161025e-05, "loss": 0.1742, "reward": 0.41796875, "reward_std": 0.12516618706285954, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41796875, "step": 1255 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5024, "grad_norm": 5.247112812054117, "kl": 4.046875, "learning_rate": 1.1653918744206478e-05, "loss": 0.1619, "reward": 0.41796875, "reward_std": 0.12014022283256054, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41796875, "step": 1256 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5028, "grad_norm": 4.900087991211882, "kl": 4.7421875, "learning_rate": 1.1640146796852711e-05, "loss": 0.1897, "reward": 0.453125, "reward_std": 0.10554593615233898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.453125, "step": 1257 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5032, "grad_norm": 1.1619895205430302, "kl": 4.046875, "learning_rate": 1.1626371651948839e-05, "loss": 0.1621, "reward": 0.451171875, "reward_std": 0.10360817424952984, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.451171875, "step": 1258 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5036, "grad_norm": 7.537805792761545, "kl": 4.53125, "learning_rate": 1.1612593336350209e-05, "loss": 0.1814, "reward": 0.453125, "reward_std": 0.1083775945007801, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.453125, "step": 1259 }, { "clip_ratio": 0.0, "completion_length": 966.0, "epoch": 0.504, "grad_norm": 1.9349761717729788, "kl": 4.375, "learning_rate": 1.159881187691835e-05, "loss": 0.1751, "reward": 0.47265625, "reward_std": 0.10071872174739838, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 1260 }, { "clip_ratio": 0.0, "completion_length": 930.0, "epoch": 0.5044, "grad_norm": 2.724276453743379, "kl": 5.453125, "learning_rate": 1.158502730052093e-05, "loss": 0.2179, "reward": 0.484375, "reward_std": 0.05292639881372452, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1261 }, { "clip_ratio": 0.0, "completion_length": 945.75, "epoch": 0.5048, "grad_norm": 0.785356061019219, "kl": 5.734375, "learning_rate": 1.157123963403168e-05, "loss": 0.2296, "reward": 0.603515625, "reward_std": 0.06613312661647797, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1262 }, { "clip_ratio": 0.0, "completion_length": 952.625, "epoch": 0.5052, "grad_norm": 0.3643739118205527, "kl": 6.1875, "learning_rate": 1.1557448904330362e-05, "loss": 0.2478, "reward": 0.470703125, "reward_std": 0.07179812714457512, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470703125, "step": 1263 }, { "clip_ratio": 0.0, "completion_length": 504.75, "epoch": 0.5056, "grad_norm": 0.2477376324261918, "kl": 6.234375, "learning_rate": 1.1543655138302714e-05, "loss": 0.2499, "reward": 0.494140625, "reward_std": 0.01848640665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 1264 }, { "clip_ratio": 0.0, "completion_length": 551.5, "epoch": 0.506, "grad_norm": 0.5320281710251399, "kl": 6.515625, "learning_rate": 1.1529858362840383e-05, "loss": 0.2605, "reward": 0.474609375, "reward_std": 0.0776811596006155, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 1265 }, { "clip_ratio": 0.0, "completion_length": 574.875, "epoch": 0.5064, "grad_norm": 0.41067985897378323, "kl": 6.109375, "learning_rate": 1.1516058604840891e-05, "loss": 0.2438, "reward": 0.4921875, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 1266 }, { "clip_ratio": 0.0, "completion_length": 615.5, "epoch": 0.5068, "grad_norm": 0.2926759844393752, "kl": 5.71875, "learning_rate": 1.1502255891207572e-05, "loss": 0.229, "reward": 0.484375, "reward_std": 0.049292195588350296, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1267 }, { "clip_ratio": 0.0, "completion_length": 715.625, "epoch": 0.5072, "grad_norm": 12.449325397471917, "kl": 7.546875, "learning_rate": 1.1488450248849523e-05, "loss": 0.3017, "reward": 0.486328125, "reward_std": 0.04973640665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1268 }, { "clip_ratio": 0.0, "completion_length": 434.625, "epoch": 0.5076, "grad_norm": 5.08803701580731, "kl": 5.3203125, "learning_rate": 1.1474641704681551e-05, "loss": 0.2129, "reward": 0.490234375, "reward_std": 0.03652860224246979, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 1269 }, { "clip_ratio": 0.0, "completion_length": 328.625, "epoch": 0.508, "grad_norm": 24.60751089036157, "kl": 5.2265625, "learning_rate": 1.1460830285624119e-05, "loss": 0.2093, "reward": 0.48828125, "reward_std": 0.05259781330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 1270 }, { "clip_ratio": 0.0, "completion_length": 558.5, "epoch": 0.5084, "grad_norm": 1.4976348280031746, "kl": 6.65625, "learning_rate": 1.1447016018603293e-05, "loss": 0.2666, "reward": 0.47265625, "reward_std": 0.08417786471545696, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 1271 }, { "clip_ratio": 0.0, "completion_length": 718.5, "epoch": 0.5088, "grad_norm": 9.367577511111914, "kl": 9.109375, "learning_rate": 1.1433198930550694e-05, "loss": 0.3647, "reward": 0.484375, "reward_std": 0.049292195588350296, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1272 }, { "clip_ratio": 0.0, "completion_length": 531.625, "epoch": 0.5092, "grad_norm": 1.518162219595887, "kl": 7.3671875, "learning_rate": 1.1419379048403446e-05, "loss": 0.2946, "reward": 0.673828125, "reward_std": 0.08748093992471695, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 1273 }, { "clip_ratio": 0.0, "completion_length": 478.625, "epoch": 0.5096, "grad_norm": 0.09344180646623988, "kl": 5.890625, "learning_rate": 1.140555639910411e-05, "loss": 0.2353, "reward": 0.751953125, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.501953125, "step": 1274 }, { "clip_ratio": 0.0, "completion_length": 507.625, "epoch": 0.51, "grad_norm": 1.4091953578398515, "kl": 5.984375, "learning_rate": 1.1391731009600655e-05, "loss": 0.2391, "reward": 0.498046875, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 1275 }, { "clip_ratio": 0.0, "completion_length": 429.125, "epoch": 0.5104, "grad_norm": 2.182022328009069, "kl": 5.7890625, "learning_rate": 1.137790290684638e-05, "loss": 0.2319, "reward": 0.5, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 1276 }, { "clip_ratio": 0.0, "completion_length": 608.5, "epoch": 0.5108, "grad_norm": 2.7523440787611015, "kl": 7.21875, "learning_rate": 1.1364072117799884e-05, "loss": 0.2888, "reward": 0.482421875, "reward_std": 0.053969863802194595, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 1277 }, { "clip_ratio": 0.0, "completion_length": 480.0, "epoch": 0.5112, "grad_norm": 0.21030413950078608, "kl": 5.875, "learning_rate": 1.1350238669424993e-05, "loss": 0.2348, "reward": 0.62109375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 1278 }, { "clip_ratio": 0.0, "completion_length": 441.75, "epoch": 0.5116, "grad_norm": 0.27236358370353764, "kl": 6.4140625, "learning_rate": 1.1336402588690727e-05, "loss": 0.2568, "reward": 0.498046875, "reward_std": 0.01848640665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 1279 }, { "clip_ratio": 0.0, "completion_length": 405.875, "epoch": 0.512, "grad_norm": 0.12674414616719792, "kl": 5.578125, "learning_rate": 1.1322563902571227e-05, "loss": 0.223, "reward": 0.49609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 1280 }, { "clip_ratio": 0.0, "completion_length": 654.25, "epoch": 0.5124, "grad_norm": 0.3936984738419875, "kl": 6.0546875, "learning_rate": 1.1308722638045724e-05, "loss": 0.2426, "reward": 0.484375, "reward_std": 0.05605955049395561, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1281 }, { "clip_ratio": 0.0, "completion_length": 512.875, "epoch": 0.5128, "grad_norm": 0.171818430432612, "kl": 5.8359375, "learning_rate": 1.129487882209847e-05, "loss": 0.2338, "reward": 0.7265625, "reward_std": 0.050389111042022705, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 1282 }, { "clip_ratio": 0.0, "completion_length": 507.5, "epoch": 0.5132, "grad_norm": 3.1956149627279915, "kl": 5.921875, "learning_rate": 1.1281032481718696e-05, "loss": 0.2371, "reward": 0.615234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 1283 }, { "clip_ratio": 0.0, "completion_length": 648.625, "epoch": 0.5136, "grad_norm": 0.3369174862992583, "kl": 6.5, "learning_rate": 1.1267183643900548e-05, "loss": 0.2599, "reward": 0.72265625, "reward_std": 0.08313017711043358, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 1284 }, { "clip_ratio": 0.0, "completion_length": 595.625, "epoch": 0.514, "grad_norm": 0.12282705487996358, "kl": 5.7265625, "learning_rate": 1.1253332335643043e-05, "loss": 0.229, "reward": 0.4921875, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 1285 }, { "clip_ratio": 0.0, "completion_length": 596.875, "epoch": 0.5144, "grad_norm": 0.5135452277578049, "kl": 6.0234375, "learning_rate": 1.1239478583950019e-05, "loss": 0.241, "reward": 0.6171875, "reward_std": 0.03125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 1286 }, { "clip_ratio": 0.0, "completion_length": 478.625, "epoch": 0.5148, "grad_norm": 0.40572550111845196, "kl": 5.78125, "learning_rate": 1.1225622415830068e-05, "loss": 0.2317, "reward": 0.685546875, "reward_std": 0.09938595443964005, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490234375, "step": 1287 }, { "clip_ratio": 0.0, "completion_length": 560.5, "epoch": 0.5152, "grad_norm": 0.6806972116209393, "kl": 6.1015625, "learning_rate": 1.1211763858296507e-05, "loss": 0.244, "reward": 0.7421875, "reward_std": 0.04043455049395561, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 1288 }, { "clip_ratio": 0.0, "completion_length": 939.75, "epoch": 0.5156, "grad_norm": 3.3792059198276916, "kl": 4.984375, "learning_rate": 1.1197902938367297e-05, "loss": 0.1989, "reward": 0.48046875, "reward_std": 0.08730955049395561, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 1289 }, { "clip_ratio": 0.0, "completion_length": 999.125, "epoch": 0.516, "grad_norm": 0.34160810065590874, "kl": 3.5078125, "learning_rate": 1.1184039683065014e-05, "loss": 0.1401, "reward": 0.486328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1290 }, { "clip_ratio": 0.0, "completion_length": 1017.875, "epoch": 0.5164, "grad_norm": 0.6090248245786533, "kl": 2.58984375, "learning_rate": 1.1170174119416778e-05, "loss": 0.1034, "reward": 0.61328125, "reward_std": 0.04708939045667648, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 1291 }, { "clip_ratio": 0.0, "completion_length": 1004.125, "epoch": 0.5168, "grad_norm": 0.3021261644604368, "kl": 2.45703125, "learning_rate": 1.1156306274454218e-05, "loss": 0.0982, "reward": 0.486328125, "reward_std": 0.0703125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1292 }, { "clip_ratio": 0.0, "completion_length": 984.375, "epoch": 0.5172, "grad_norm": 0.4554279796595494, "kl": 2.7734375, "learning_rate": 1.1142436175213409e-05, "loss": 0.1109, "reward": 0.705078125, "reward_std": 0.09997996315360069, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 1293 }, { "clip_ratio": 0.0, "completion_length": 931.375, "epoch": 0.5176, "grad_norm": 0.885160129135975, "kl": 3.39453125, "learning_rate": 1.1128563848734817e-05, "loss": 0.1356, "reward": 0.603515625, "reward_std": 0.07108421996235847, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.486328125, "step": 1294 }, { "clip_ratio": 0.0, "completion_length": 782.828125, "epoch": 0.518, "grad_norm": 0.28846437928930413, "kl": 3.55078125, "learning_rate": 1.1114689322063255e-05, "loss": 0.1409, "reward": 0.625, "reward_std": 0.015625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 1295 }, { "clip_ratio": 0.0, "completion_length": 860.125, "epoch": 0.5184, "grad_norm": 0.22427061378205845, "kl": 2.91796875, "learning_rate": 1.1100812622247823e-05, "loss": 0.1164, "reward": 0.619140625, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 1296 }, { "clip_ratio": 0.0, "completion_length": 776.4609375, "epoch": 0.5188, "grad_norm": 0.4307688360039255, "kl": 3.1484375, "learning_rate": 1.1086933776341853e-05, "loss": 0.126, "reward": 0.623046875, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 1297 }, { "clip_ratio": 0.0, "completion_length": 679.5625, "epoch": 0.5192, "grad_norm": 1.33212989444975, "kl": 3.91796875, "learning_rate": 1.1073052811402867e-05, "loss": 0.1306, "reward": 0.50390625, "reward_std": 0.02629890665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.50390625, "step": 1298 }, { "clip_ratio": 0.0, "completion_length": 632.5, "epoch": 0.5196, "grad_norm": 0.8926753306465672, "kl": 3.67578125, "learning_rate": 1.105916975449252e-05, "loss": 0.1468, "reward": 0.625, "reward_std": 0.02629890665411949, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 1299 }, { "clip_ratio": 0.0, "completion_length": 624.25, "epoch": 0.52, "grad_norm": 0.31365799533124894, "kl": 3.7265625, "learning_rate": 1.1045284632676535e-05, "loss": 0.1486, "reward": 0.505859375, "reward_std": 0.07108421996235847, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.505859375, "step": 1300 }, { "clip_ratio": 0.0, "completion_length": 846.765625, "epoch": 0.5204, "grad_norm": 0.31509291770079706, "kl": 4.3984375, "learning_rate": 1.1031397473024674e-05, "loss": 0.1772, "reward": 0.478515625, "reward_std": 0.11958620324730873, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1301 }, { "clip_ratio": 0.0, "completion_length": 685.5703125, "epoch": 0.5208, "grad_norm": 0.35217413290385424, "kl": 4.1484375, "learning_rate": 1.1017508302610665e-05, "loss": 0.1658, "reward": 0.474609375, "reward_std": 0.08856561779975891, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.474609375, "step": 1302 }, { "clip_ratio": 0.0, "completion_length": 857.8515625, "epoch": 0.5212, "grad_norm": 0.9577004300053403, "kl": 4.40625, "learning_rate": 1.1003617148512149e-05, "loss": 0.177, "reward": 0.482421875, "reward_std": 0.1441047377884388, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482421875, "step": 1303 }, { "clip_ratio": 0.0, "completion_length": 1008.53125, "epoch": 0.5216, "grad_norm": 0.32675485978381263, "kl": 4.75, "learning_rate": 1.0989724037810651e-05, "loss": 0.1885, "reward": 0.486328125, "reward_std": 0.24306639283895493, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470703125, "step": 1304 }, { "clip_ratio": 0.0, "completion_length": 1005.5234375, "epoch": 0.522, "grad_norm": 0.44293030986579374, "kl": 4.8125, "learning_rate": 1.0975828997591496e-05, "loss": 0.1994, "reward": 0.453125, "reward_std": 0.23683075234293938, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4453125, "step": 1305 }, { "clip_ratio": 0.0, "completion_length": 946.71875, "epoch": 0.5224, "grad_norm": 0.7833349566753467, "kl": 4.453125, "learning_rate": 1.0961932054943778e-05, "loss": 0.1824, "reward": 0.458984375, "reward_std": 0.20550117269158363, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.458984375, "step": 1306 }, { "clip_ratio": 0.0, "completion_length": 1007.34375, "epoch": 0.5228, "grad_norm": 1.6280156876839673, "kl": 4.90625, "learning_rate": 1.0948033236960294e-05, "loss": 0.2083, "reward": 0.59375, "reward_std": 0.21680723875761032, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 1307 }, { "clip_ratio": 0.0, "completion_length": 1012.3203125, "epoch": 0.5232, "grad_norm": 0.5265119677691602, "kl": 4.2734375, "learning_rate": 1.0934132570737508e-05, "loss": 0.178, "reward": 0.486328125, "reward_std": 0.273839320987463, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.447265625, "step": 1308 }, { "clip_ratio": 0.0, "completion_length": 1002.765625, "epoch": 0.5236, "grad_norm": 0.8099921084030361, "kl": 3.859375, "learning_rate": 1.0920230083375474e-05, "loss": 0.158, "reward": 0.41796875, "reward_std": 0.2522111013531685, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41796875, "step": 1309 }, { "clip_ratio": 0.0, "completion_length": 1013.0859375, "epoch": 0.524, "grad_norm": 0.5810404503283085, "kl": 3.7109375, "learning_rate": 1.0906325801977804e-05, "loss": 0.1426, "reward": 0.556640625, "reward_std": 0.2603452689945698, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.431640625, "step": 1310 }, { "clip_ratio": 0.0, "completion_length": 995.9453125, "epoch": 0.5244, "grad_norm": 12.306908023605416, "kl": 6.92578125, "learning_rate": 1.0892419753651606e-05, "loss": 0.2811, "reward": 0.4375, "reward_std": 0.25088728964328766, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 1311 }, { "clip_ratio": 0.0, "completion_length": 971.7578125, "epoch": 0.5248, "grad_norm": 4.051979370466598, "kl": 4.2109375, "learning_rate": 1.0878511965507435e-05, "loss": 0.1736, "reward": 0.625, "reward_std": 0.3057894706726074, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 1312 }, { "clip_ratio": 0.0, "completion_length": 971.7890625, "epoch": 0.5252, "grad_norm": 2.5914415377031705, "kl": 4.015625, "learning_rate": 1.086460246465923e-05, "loss": 0.1633, "reward": 0.4921875, "reward_std": 0.2200658731162548, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 1313 }, { "clip_ratio": 0.0, "completion_length": 966.390625, "epoch": 0.5256, "grad_norm": 1.5993927198748028, "kl": 3.80859375, "learning_rate": 1.0850691278224282e-05, "loss": 0.1607, "reward": 0.62890625, "reward_std": 0.31032848730683327, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44921875, "step": 1314 }, { "clip_ratio": 0.0, "completion_length": 942.25, "epoch": 0.526, "grad_norm": 0.7628123784929399, "kl": 3.58203125, "learning_rate": 1.083677843332316e-05, "loss": 0.1462, "reward": 0.62109375, "reward_std": 0.2061202973127365, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 1315 }, { "clip_ratio": 0.0, "completion_length": 936.3359375, "epoch": 0.5264, "grad_norm": 0.9910696687192699, "kl": 3.30078125, "learning_rate": 1.0822863957079657e-05, "loss": 0.1465, "reward": 0.4921875, "reward_std": 0.25617221370339394, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 1316 }, { "clip_ratio": 0.0, "completion_length": 952.765625, "epoch": 0.5268, "grad_norm": 0.8806888316006328, "kl": 4.4765625, "learning_rate": 1.0808947876620768e-05, "loss": 0.1916, "reward": 0.60546875, "reward_std": 0.2593121975660324, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 1317 }, { "clip_ratio": 0.0, "completion_length": 960.75, "epoch": 0.5272, "grad_norm": 0.27433138870661217, "kl": 3.76953125, "learning_rate": 1.07950302190766e-05, "loss": 0.143, "reward": 0.478515625, "reward_std": 0.19716444611549377, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.478515625, "step": 1318 }, { "clip_ratio": 0.0, "completion_length": 930.6875, "epoch": 0.5276, "grad_norm": 1.134660195416294, "kl": 3.92578125, "learning_rate": 1.0781111011580336e-05, "loss": 0.1404, "reward": 0.515625, "reward_std": 0.1955420933663845, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.515625, "step": 1319 }, { "clip_ratio": 0.0, "completion_length": 919.8671875, "epoch": 0.528, "grad_norm": 0.9779451962475408, "kl": 3.94921875, "learning_rate": 1.0767190281268187e-05, "loss": 0.1368, "reward": 0.572265625, "reward_std": 0.2458159625530243, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494140625, "step": 1320 }, { "clip_ratio": 0.0, "completion_length": 869.953125, "epoch": 0.5284, "grad_norm": 1.11726786839189, "kl": 3.67578125, "learning_rate": 1.0753268055279328e-05, "loss": 0.1485, "reward": 0.529296875, "reward_std": 0.19145524874329567, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.529296875, "step": 1321 }, { "clip_ratio": 0.0, "completion_length": 940.5546875, "epoch": 0.5288, "grad_norm": 0.600985240685163, "kl": 4.0859375, "learning_rate": 1.0739344360755853e-05, "loss": 0.1434, "reward": 0.533203125, "reward_std": 0.21214960888028145, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.533203125, "step": 1322 }, { "clip_ratio": 0.0, "completion_length": 947.3203125, "epoch": 0.5292, "grad_norm": 1.1068862418165613, "kl": 3.8515625, "learning_rate": 1.072541922484271e-05, "loss": 0.1789, "reward": 0.505859375, "reward_std": 0.24831748753786087, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.498046875, "step": 1323 }, { "clip_ratio": 0.0, "completion_length": 947.5859375, "epoch": 0.5296, "grad_norm": 0.7562563510742988, "kl": 4.05859375, "learning_rate": 1.071149267468767e-05, "loss": 0.1599, "reward": 0.671875, "reward_std": 0.18981262668967247, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625, "step": 1324 }, { "clip_ratio": 0.0, "completion_length": 868.3515625, "epoch": 0.53, "grad_norm": 6.197133446808893, "kl": 3.8671875, "learning_rate": 1.0697564737441254e-05, "loss": 0.1712, "reward": 0.515625, "reward_std": 0.17817479372024536, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.515625, "step": 1325 }, { "clip_ratio": 0.0, "completion_length": 860.078125, "epoch": 0.5304, "grad_norm": 1.7817516684454036, "kl": 3.63671875, "learning_rate": 1.0683635440256689e-05, "loss": 0.1185, "reward": 0.572265625, "reward_std": 0.1466907486319542, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.572265625, "step": 1326 }, { "clip_ratio": 0.0, "completion_length": 893.890625, "epoch": 0.5308, "grad_norm": 0.4243070494297974, "kl": 4.25, "learning_rate": 1.0669704810289852e-05, "loss": 0.1617, "reward": 0.56640625, "reward_std": 0.162509486079216, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 1327 }, { "clip_ratio": 0.0, "completion_length": 807.75, "epoch": 0.5312, "grad_norm": 10.580963800887492, "kl": 3.75390625, "learning_rate": 1.0655772874699217e-05, "loss": 0.1715, "reward": 0.603515625, "reward_std": 0.1719711646437645, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.603515625, "step": 1328 }, { "clip_ratio": 0.0, "completion_length": 963.2265625, "epoch": 0.5316, "grad_norm": 6.563488207706025, "kl": 4.41015625, "learning_rate": 1.0641839660645806e-05, "loss": 0.1754, "reward": 0.626953125, "reward_std": 0.2538832426071167, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.595703125, "step": 1329 }, { "clip_ratio": 0.0, "completion_length": 987.21875, "epoch": 0.532, "grad_norm": 0.6029338863515471, "kl": 4.8359375, "learning_rate": 1.0627905195293135e-05, "loss": 0.2152, "reward": 0.58984375, "reward_std": 0.21073518320918083, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 1330 }, { "clip_ratio": 0.0, "completion_length": 837.5625, "epoch": 0.5324, "grad_norm": 0.4842628229612355, "kl": 4.25, "learning_rate": 1.0613969505807157e-05, "loss": 0.1887, "reward": 0.587890625, "reward_std": 0.19428642094135284, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.587890625, "step": 1331 }, { "clip_ratio": 0.0, "completion_length": 985.5234375, "epoch": 0.5328, "grad_norm": 1.5089502925323788, "kl": 4.453125, "learning_rate": 1.0600032619356208e-05, "loss": 0.174, "reward": 0.564453125, "reward_std": 0.2315107323229313, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564453125, "step": 1332 }, { "clip_ratio": 0.0, "completion_length": 884.6875, "epoch": 0.5332, "grad_norm": 0.8510960945984906, "kl": 4.1953125, "learning_rate": 1.0586094563110965e-05, "loss": 0.1676, "reward": 0.693359375, "reward_std": 0.1969766803085804, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.568359375, "step": 1333 }, { "clip_ratio": 0.0, "completion_length": 896.046875, "epoch": 0.5336, "grad_norm": 0.5630484097892435, "kl": 4.1640625, "learning_rate": 1.0572155364244383e-05, "loss": 0.159, "reward": 0.767578125, "reward_std": 0.2504921816289425, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.603515625, "step": 1334 }, { "clip_ratio": 0.0, "completion_length": 800.6796875, "epoch": 0.534, "grad_norm": 0.40304506065819184, "kl": 3.3984375, "learning_rate": 1.055821504993164e-05, "loss": 0.1322, "reward": 0.654296875, "reward_std": 0.1641817595809698, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.654296875, "step": 1335 }, { "clip_ratio": 0.0, "completion_length": 917.875, "epoch": 0.5344, "grad_norm": 0.8891925783810631, "kl": 4.44921875, "learning_rate": 1.0544273647350091e-05, "loss": 0.1966, "reward": 0.791015625, "reward_std": 0.18304334580898285, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.658203125, "step": 1336 }, { "clip_ratio": 0.0, "completion_length": 993.953125, "epoch": 0.5348, "grad_norm": 0.7407789396887853, "kl": 3.58203125, "learning_rate": 1.053033118367922e-05, "loss": 0.1465, "reward": 0.634765625, "reward_std": 0.21460697054862976, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.634765625, "step": 1337 }, { "clip_ratio": 0.0, "completion_length": 926.859375, "epoch": 0.5352, "grad_norm": 0.7385066910151785, "kl": 4.14453125, "learning_rate": 1.0516387686100566e-05, "loss": 0.1904, "reward": 0.7109375, "reward_std": 0.28137611970305443, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375, "step": 1338 }, { "clip_ratio": 0.0, "completion_length": 990.8203125, "epoch": 0.5356, "grad_norm": 0.37397964813275075, "kl": 3.984375, "learning_rate": 1.0502443181797696e-05, "loss": 0.1659, "reward": 0.767578125, "reward_std": 0.21704262495040894, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.642578125, "step": 1339 }, { "clip_ratio": 0.0, "completion_length": 892.2734375, "epoch": 0.536, "grad_norm": 0.6383577905234661, "kl": 3.890625, "learning_rate": 1.0488497697956134e-05, "loss": 0.1589, "reward": 0.787109375, "reward_std": 0.17004867270588875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.662109375, "step": 1340 }, { "clip_ratio": 0.0, "completion_length": 929.0078125, "epoch": 0.5364, "grad_norm": 1.36074967949177, "kl": 3.32421875, "learning_rate": 1.0474551261763315e-05, "loss": 0.141, "reward": 0.6640625, "reward_std": 0.1999126933515072, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6640625, "step": 1341 }, { "clip_ratio": 0.0, "completion_length": 905.7421875, "epoch": 0.5368, "grad_norm": 1.5740633927791228, "kl": 4.2734375, "learning_rate": 1.0460603900408523e-05, "loss": 0.1622, "reward": 0.65625, "reward_std": 0.17078421637415886, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.65625, "step": 1342 }, { "clip_ratio": 0.0, "completion_length": 1000.71875, "epoch": 0.5372, "grad_norm": 1.6022713482669733, "kl": 4.3671875, "learning_rate": 1.0446655641082864e-05, "loss": 0.1797, "reward": 0.787109375, "reward_std": 0.19779729470610619, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.662109375, "step": 1343 }, { "clip_ratio": 0.0, "completion_length": 951.7578125, "epoch": 0.5376, "grad_norm": 1.1246850894131057, "kl": 4.30078125, "learning_rate": 1.0432706510979172e-05, "loss": 0.1651, "reward": 0.68359375, "reward_std": 0.260277446359396, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.64453125, "step": 1344 }, { "clip_ratio": 0.0, "completion_length": 686.5859375, "epoch": 0.538, "grad_norm": 1.0136588960149877, "kl": 3.54296875, "learning_rate": 1.0418756537291996e-05, "loss": 0.1395, "reward": 0.712890625, "reward_std": 0.1914949230849743, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.673828125, "step": 1345 }, { "clip_ratio": 0.0, "completion_length": 884.9609375, "epoch": 0.5384, "grad_norm": 0.24949781938432472, "kl": 4.71875, "learning_rate": 1.0404805747217525e-05, "loss": 0.1878, "reward": 0.66015625, "reward_std": 0.17996172420680523, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.66015625, "step": 1346 }, { "clip_ratio": 0.0, "completion_length": 947.46875, "epoch": 0.5388, "grad_norm": 0.6180308664705375, "kl": 5.0859375, "learning_rate": 1.0390854167953537e-05, "loss": 0.2077, "reward": 0.80078125, "reward_std": 0.17835700511932373, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.67578125, "step": 1347 }, { "clip_ratio": 0.0, "completion_length": 790.6796875, "epoch": 0.5392, "grad_norm": 0.40370360269674893, "kl": 5.171875, "learning_rate": 1.0376901826699349e-05, "loss": 0.2035, "reward": 0.6875, "reward_std": 0.13980965316295624, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875, "step": 1348 }, { "clip_ratio": 0.0, "completion_length": 956.625, "epoch": 0.5396, "grad_norm": 0.6909391870544271, "kl": 5.2734375, "learning_rate": 1.036294875065576e-05, "loss": 0.2109, "reward": 0.814453125, "reward_std": 0.16111458465456963, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.689453125, "step": 1349 }, { "clip_ratio": 0.0, "completion_length": 831.5, "epoch": 0.54, "grad_norm": 0.4649290737893589, "kl": 4.6640625, "learning_rate": 1.0348994967025012e-05, "loss": 0.1863, "reward": 0.904296875, "reward_std": 0.2047283835709095, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.701171875, "step": 1350 }, { "clip_ratio": 0.0, "completion_length": 852.75, "epoch": 0.5404, "grad_norm": 0.5673571113479953, "kl": 5.1015625, "learning_rate": 1.0335040503010715e-05, "loss": 0.2038, "reward": 0.693359375, "reward_std": 0.16366009786725044, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.693359375, "step": 1351 }, { "clip_ratio": 0.0, "completion_length": 851.109375, "epoch": 0.5408, "grad_norm": 0.23023400148929932, "kl": 5.4453125, "learning_rate": 1.0321085385817818e-05, "loss": 0.2202, "reward": 0.70703125, "reward_std": 0.12708456441760063, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.70703125, "step": 1352 }, { "clip_ratio": 0.0, "completion_length": 708.375, "epoch": 0.5412, "grad_norm": 0.36472003906458206, "kl": 4.734375, "learning_rate": 1.030712964265253e-05, "loss": 0.1896, "reward": 0.771484375, "reward_std": 0.1552789956331253, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716796875, "step": 1353 }, { "clip_ratio": 0.0, "completion_length": 700.25, "epoch": 0.5416, "grad_norm": 0.3722122596915503, "kl": 4.48828125, "learning_rate": 1.0293173300722286e-05, "loss": 0.1796, "reward": 0.869140625, "reward_std": 0.20933087170124054, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.705078125, "step": 1354 }, { "clip_ratio": 0.0, "completion_length": 935.296875, "epoch": 0.542, "grad_norm": 0.36033930333640113, "kl": 5.9453125, "learning_rate": 1.0279216387235691e-05, "loss": 0.2392, "reward": 0.939453125, "reward_std": 0.15446127951145172, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.689453125, "step": 1355 }, { "clip_ratio": 0.0, "completion_length": 856.0, "epoch": 0.5424, "grad_norm": 0.32346784178037347, "kl": 5.40234375, "learning_rate": 1.026525892940246e-05, "loss": 0.2101, "reward": 0.81640625, "reward_std": 0.16738729551434517, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69140625, "step": 1356 }, { "clip_ratio": 0.0, "completion_length": 541.5546875, "epoch": 0.5428, "grad_norm": 0.4904344993002118, "kl": 3.93359375, "learning_rate": 1.0251300954433377e-05, "loss": 0.1576, "reward": 0.849609375, "reward_std": 0.06172315776348114, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.724609375, "step": 1357 }, { "clip_ratio": 0.0, "completion_length": 720.4140625, "epoch": 0.5432, "grad_norm": 0.4472468058228944, "kl": 4.84375, "learning_rate": 1.0237342489540221e-05, "loss": 0.1943, "reward": 0.8359375, "reward_std": 0.11027613468468189, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375, "step": 1358 }, { "clip_ratio": 0.0, "completion_length": 638.265625, "epoch": 0.5436, "grad_norm": 0.4222318197719325, "kl": 4.140625, "learning_rate": 1.0223383561935738e-05, "loss": 0.1608, "reward": 0.85546875, "reward_std": 0.078125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.73046875, "step": 1359 }, { "clip_ratio": 0.0, "completion_length": 570.875, "epoch": 0.544, "grad_norm": 0.4104832985390634, "kl": 3.796875, "learning_rate": 1.0209424198833571e-05, "loss": 0.1517, "reward": 0.8515625, "reward_std": 0.06711846217513084, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625, "step": 1360 }, { "clip_ratio": 0.0, "completion_length": 727.625, "epoch": 0.5444, "grad_norm": 0.6959936056309524, "kl": 4.8671875, "learning_rate": 1.0195464427448213e-05, "loss": 0.1948, "reward": 0.724609375, "reward_std": 0.0756682027131319, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.724609375, "step": 1361 }, { "clip_ratio": 0.0, "completion_length": 728.4296875, "epoch": 0.5448, "grad_norm": 0.6712418391416436, "kl": 4.5078125, "learning_rate": 1.0181504274994949e-05, "loss": 0.1822, "reward": 0.7109375, "reward_std": 0.09901386126875877, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375, "step": 1362 }, { "clip_ratio": 0.0, "completion_length": 656.234375, "epoch": 0.5452, "grad_norm": 0.3701543433185431, "kl": 4.328125, "learning_rate": 1.0167543768689816e-05, "loss": 0.1764, "reward": 0.845703125, "reward_std": 0.08388295583426952, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.720703125, "step": 1363 }, { "clip_ratio": 0.0, "completion_length": 873.8046875, "epoch": 0.5456, "grad_norm": 0.6588151633816544, "kl": 5.30078125, "learning_rate": 1.0153582935749531e-05, "loss": 0.2151, "reward": 0.857421875, "reward_std": 0.24457097053527832, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708984375, "step": 1364 }, { "clip_ratio": 0.0, "completion_length": 625.1875, "epoch": 0.546, "grad_norm": 0.7114012854562592, "kl": 4.40625, "learning_rate": 1.0139621803391454e-05, "loss": 0.169, "reward": 0.75, "reward_std": 0.128127783536911, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625, "step": 1365 }, { "clip_ratio": 0.0, "completion_length": 667.625, "epoch": 0.5464, "grad_norm": 0.463098045332295, "kl": 4.0078125, "learning_rate": 1.0125660398833528e-05, "loss": 0.1602, "reward": 0.732421875, "reward_std": 0.06536140665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.732421875, "step": 1366 }, { "clip_ratio": 0.0, "completion_length": 570.96875, "epoch": 0.5468, "grad_norm": 0.3958387745647534, "kl": 3.73046875, "learning_rate": 1.0111698749294223e-05, "loss": 0.1488, "reward": 0.8671875, "reward_std": 0.0667334571480751, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.75, "step": 1367 }, { "clip_ratio": 0.0, "completion_length": 596.75, "epoch": 0.5472, "grad_norm": 0.49296050918031553, "kl": 3.765625, "learning_rate": 1.0097736881992492e-05, "loss": 0.1508, "reward": 0.751953125, "reward_std": 0.07654347270727158, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.751953125, "step": 1368 }, { "clip_ratio": 0.0, "completion_length": 594.984375, "epoch": 0.5476, "grad_norm": 0.5902732344038504, "kl": 3.7734375, "learning_rate": 1.0083774824147707e-05, "loss": 0.1494, "reward": 0.767578125, "reward_std": 0.12321632355451584, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.767578125, "step": 1369 }, { "clip_ratio": 0.0, "completion_length": 799.25, "epoch": 0.548, "grad_norm": 0.5093397579069905, "kl": 4.609375, "learning_rate": 1.0069812602979617e-05, "loss": 0.1845, "reward": 0.830078125, "reward_std": 0.19572444632649422, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0078125, "rewards/tag_count_reward": 0.822265625, "step": 1370 }, { "clip_ratio": 0.0, "completion_length": 539.53125, "epoch": 0.5484, "grad_norm": 0.6888344851957043, "kl": 3.42578125, "learning_rate": 1.0055850245708283e-05, "loss": 0.1348, "reward": 1.095703125, "reward_std": 0.21617605350911617, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.892578125, "step": 1371 }, { "clip_ratio": 0.0, "completion_length": 950.5, "epoch": 0.5488, "grad_norm": 1531.6829472270113, "kl": 23.75390625, "learning_rate": 1.0041887779554041e-05, "loss": 0.9501, "reward": 0.783203125, "reward_std": 0.3191300109028816, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.728515625, "step": 1372 }, { "clip_ratio": 0.0, "completion_length": 728.25, "epoch": 0.5492, "grad_norm": 72.20880888245816, "kl": 4.5, "learning_rate": 1.0027925231737428e-05, "loss": 0.1798, "reward": 0.9921875, "reward_std": 0.22758449241518974, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.03125, "rewards/tag_count_reward": 0.8359375, "step": 1373 }, { "clip_ratio": 0.0, "completion_length": 596.7109375, "epoch": 0.5496, "grad_norm": 0.5739511001342293, "kl": 3.1640625, "learning_rate": 1.0013962629479145e-05, "loss": 0.1272, "reward": 0.9765625, "reward_std": 0.20066512003540993, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0546875, "rewards/tag_count_reward": 0.921875, "step": 1374 }, { "clip_ratio": 0.0, "completion_length": 652.5, "epoch": 0.55, "grad_norm": 0.4733587548545324, "kl": 3.48828125, "learning_rate": 1e-05, "loss": 0.1394, "reward": 0.986328125, "reward_std": 0.3445693254470825, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.1484375, "rewards/tag_count_reward": 0.837890625, "step": 1375 }, { "clip_ratio": 0.0, "completion_length": 591.5, "epoch": 0.5504, "grad_norm": 0.5980240660793723, "kl": 3.296875, "learning_rate": 9.986037370520856e-06, "loss": 0.1316, "reward": 0.982421875, "reward_std": 0.4577172100543976, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.1953125, "rewards/tag_count_reward": 0.654296875, "step": 1376 }, { "clip_ratio": 0.0, "completion_length": 639.75, "epoch": 0.5508, "grad_norm": 0.518227600781698, "kl": 3.14453125, "learning_rate": 9.972074768262576e-06, "loss": 0.1256, "reward": 1.130859375, "reward_std": 0.5826351344585419, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.4296875, "rewards/tag_count_reward": 0.646484375, "step": 1377 }, { "clip_ratio": 0.0, "completion_length": 645.5859375, "epoch": 0.5512, "grad_norm": 0.5713238659748845, "kl": 2.2421875, "learning_rate": 9.958112220445964e-06, "loss": 0.0891, "reward": 1.626953125, "reward_std": 0.42380233854055405, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.580078125, "step": 1378 }, { "clip_ratio": 0.0, "completion_length": 660.28125, "epoch": 0.5516, "grad_norm": 0.5164526275445533, "kl": 2.296875, "learning_rate": 9.944149754291719e-06, "loss": 0.0859, "reward": 1.55078125, "reward_std": 0.25708746910095215, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.48046875, "step": 1379 }, { "clip_ratio": 0.0, "completion_length": 768.375, "epoch": 0.552, "grad_norm": 0.41834238616854985, "kl": 2.49609375, "learning_rate": 9.930187397020385e-06, "loss": 0.0999, "reward": 1.369140625, "reward_std": 0.2220463901758194, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.423828125, "step": 1380 }, { "clip_ratio": 0.0, "completion_length": 728.625, "epoch": 0.5524, "grad_norm": 0.41375432918820443, "kl": 2.4140625, "learning_rate": 9.916225175852295e-06, "loss": 0.0966, "reward": 1.44921875, "reward_std": 0.2242263350635767, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.49609375, "step": 1381 }, { "clip_ratio": 0.0, "completion_length": 805.875, "epoch": 0.5528, "grad_norm": 0.7119083993627805, "kl": 2.08984375, "learning_rate": 9.902263118007513e-06, "loss": 0.0836, "reward": 1.498046875, "reward_std": 0.2972038798034191, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.591796875, "step": 1382 }, { "clip_ratio": 0.0, "completion_length": 940.453125, "epoch": 0.5532, "grad_norm": 0.38182885187890786, "kl": 2.06640625, "learning_rate": 9.88830125070578e-06, "loss": 0.0862, "reward": 1.7265625, "reward_std": 0.3453005403280258, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.703125, "step": 1383 }, { "clip_ratio": 0.0, "completion_length": 960.6171875, "epoch": 0.5536, "grad_norm": 0.39250699234648584, "kl": 1.794921875, "learning_rate": 9.874339601166474e-06, "loss": 0.0713, "reward": 1.78125, "reward_std": 0.3785530924797058, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.734375, "step": 1384 }, { "clip_ratio": 0.0, "completion_length": 988.0, "epoch": 0.554, "grad_norm": 0.5427411780293564, "kl": 1.787109375, "learning_rate": 9.860378196608549e-06, "loss": 0.0717, "reward": 1.572265625, "reward_std": 0.5180066823959351, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.712890625, "step": 1385 }, { "clip_ratio": 0.0, "completion_length": 992.625, "epoch": 0.5544, "grad_norm": 0.5658967302019318, "kl": 2.55078125, "learning_rate": 9.84641706425047e-06, "loss": 0.1022, "reward": 1.552734375, "reward_std": 0.4172213599085808, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.716796875, "step": 1386 }, { "clip_ratio": 0.0, "completion_length": 978.0, "epoch": 0.5548, "grad_norm": 0.5880181963826273, "kl": 1.96875, "learning_rate": 9.832456231310189e-06, "loss": 0.0787, "reward": 1.75, "reward_std": 0.3195689395070076, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.734375, "step": 1387 }, { "clip_ratio": 0.0, "completion_length": 1001.265625, "epoch": 0.5552, "grad_norm": 0.967614428410156, "kl": 2.373046875, "learning_rate": 9.818495725005053e-06, "loss": 0.0937, "reward": 1.765625, "reward_std": 0.3021479658782482, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.734375, "step": 1388 }, { "clip_ratio": 0.0, "completion_length": 1014.75, "epoch": 0.5556, "grad_norm": 4.402903893615541, "kl": 2.62109375, "learning_rate": 9.80453557255179e-06, "loss": 0.1051, "reward": 1.638671875, "reward_std": 0.30754194408655167, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.724609375, "step": 1389 }, { "clip_ratio": 0.0, "completion_length": 1009.8984375, "epoch": 0.556, "grad_norm": 2.2780980715894272, "kl": 2.74609375, "learning_rate": 9.790575801166432e-06, "loss": 0.1138, "reward": 1.517578125, "reward_std": 0.5146963745355606, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.650390625, "step": 1390 }, { "clip_ratio": 0.0, "completion_length": 1013.515625, "epoch": 0.5564, "grad_norm": 1.1689164914219685, "kl": 2.578125, "learning_rate": 9.776616438064265e-06, "loss": 0.1046, "reward": 1.478515625, "reward_std": 0.43723929673433304, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.689453125, "step": 1391 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5568, "grad_norm": 1.4072195988553369, "kl": 3.58984375, "learning_rate": 9.762657510459784e-06, "loss": 0.1434, "reward": 1.501953125, "reward_std": 0.590392492711544, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.7109375, "rewards/tag_count_reward": 0.650390625, "step": 1392 }, { "clip_ratio": 0.0, "completion_length": 977.75, "epoch": 0.5572, "grad_norm": 6.119038417306581, "kl": 3.8984375, "learning_rate": 9.748699045566626e-06, "loss": 0.1559, "reward": 1.373046875, "reward_std": 0.5106342285871506, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.677734375, "step": 1393 }, { "clip_ratio": 0.0, "completion_length": 951.875, "epoch": 0.5576, "grad_norm": 1.2702029807156565, "kl": 4.94921875, "learning_rate": 9.73474107059754e-06, "loss": 0.198, "reward": 1.48046875, "reward_std": 0.5029492378234863, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7734375, "rewards/tag_count_reward": 0.70703125, "step": 1394 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.558, "grad_norm": 1.3093257817968382, "kl": 5.8359375, "learning_rate": 9.720783612764314e-06, "loss": 0.2334, "reward": 1.2890625, "reward_std": 0.6140102744102478, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6171875, "rewards/tag_count_reward": 0.671875, "step": 1395 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5584, "grad_norm": 0.8222781171204252, "kl": 6.015625, "learning_rate": 9.706826699277719e-06, "loss": 0.2402, "reward": 1.40234375, "reward_std": 0.5355711579322815, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.5546875, "rewards/tag_count_reward": 0.72265625, "step": 1396 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5588, "grad_norm": 1.651703072161114, "kl": 6.484375, "learning_rate": 9.692870357347474e-06, "loss": 0.2595, "reward": 1.306640625, "reward_std": 0.5335954800248146, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.578125, "rewards/tag_count_reward": 0.728515625, "step": 1397 }, { "clip_ratio": 0.0, "completion_length": 925.375, "epoch": 0.5592, "grad_norm": 0.6513502990497276, "kl": 5.828125, "learning_rate": 9.678914614182185e-06, "loss": 0.2229, "reward": 1.220703125, "reward_std": 0.5661241412162781, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.4921875, "rewards/tag_count_reward": 0.728515625, "step": 1398 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5596, "grad_norm": 1.0804711395365283, "kl": 5.859375, "learning_rate": 9.664959496989286e-06, "loss": 0.2343, "reward": 1.330078125, "reward_std": 0.550861582159996, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.4609375, "rewards/tag_count_reward": 0.744140625, "step": 1399 }, { "clip_ratio": 0.0, "completion_length": 814.625, "epoch": 0.56, "grad_norm": 0.7672507053599259, "kl": 5.765625, "learning_rate": 9.651005032974994e-06, "loss": 0.2305, "reward": 1.322265625, "reward_std": 0.4840143248438835, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.5703125, "rewards/tag_count_reward": 0.751953125, "step": 1400 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5604, "grad_norm": 3.2645790268998542, "kl": 6.2265625, "learning_rate": 9.637051249344244e-06, "loss": 0.2494, "reward": 1.29296875, "reward_std": 0.5502727255225182, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.546875, "rewards/tag_count_reward": 0.74609375, "step": 1401 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5608, "grad_norm": 25.88639315846552, "kl": 6.7734375, "learning_rate": 9.623098173300655e-06, "loss": 0.2711, "reward": 1.4609375, "reward_std": 0.5535630434751511, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.734375, "step": 1402 }, { "clip_ratio": 0.0, "completion_length": 940.25, "epoch": 0.5612, "grad_norm": 25.161347820788198, "kl": 8.015625, "learning_rate": 9.609145832046465e-06, "loss": 0.3203, "reward": 1.46875, "reward_std": 0.5782418176531792, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.6015625, "rewards/tag_count_reward": 0.75, "step": 1403 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5616, "grad_norm": 9.447186321544065, "kl": 5.1640625, "learning_rate": 9.595194252782476e-06, "loss": 0.2069, "reward": 1.1015625, "reward_std": 0.607534259557724, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.453125, "rewards/tag_count_reward": 0.6484375, "step": 1404 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.562, "grad_norm": 0.7517941006421601, "kl": 3.71875, "learning_rate": 9.581243462708007e-06, "loss": 0.1487, "reward": 0.935546875, "reward_std": 0.6199385225772858, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.3515625, "rewards/tag_count_reward": 0.583984375, "step": 1405 }, { "clip_ratio": 0.0, "completion_length": 1018.2421875, "epoch": 0.5624, "grad_norm": 0.7655721258744123, "kl": 4.53125, "learning_rate": 9.567293489020831e-06, "loss": 0.18, "reward": 1.22265625, "reward_std": 0.6344556361436844, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.5078125, "rewards/tag_count_reward": 0.60546875, "step": 1406 }, { "clip_ratio": 0.0, "completion_length": 846.875, "epoch": 0.5628, "grad_norm": 0.6181637040896837, "kl": 5.53125, "learning_rate": 9.553344358917141e-06, "loss": 0.2211, "reward": 1.455078125, "reward_std": 0.43329034000635147, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.697265625, "step": 1407 }, { "clip_ratio": 0.0, "completion_length": 943.125, "epoch": 0.5632, "grad_norm": 1.0894649452368494, "kl": 5.46875, "learning_rate": 9.539396099591477e-06, "loss": 0.2185, "reward": 1.607421875, "reward_std": 0.6437253206968307, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.6171875, "rewards/tag_count_reward": 0.677734375, "step": 1408 }, { "clip_ratio": 0.0, "completion_length": 836.25, "epoch": 0.5636, "grad_norm": 0.8322023029710401, "kl": 5.6171875, "learning_rate": 9.525448738236691e-06, "loss": 0.225, "reward": 1.345703125, "reward_std": 0.4820963814854622, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.673828125, "step": 1409 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.564, "grad_norm": 1.6769685628848252, "kl": 5.5078125, "learning_rate": 9.511502302043867e-06, "loss": 0.2203, "reward": 1.408203125, "reward_std": 0.6146216541528702, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.6171875, "rewards/tag_count_reward": 0.666015625, "step": 1410 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5644, "grad_norm": 3.2660575861047927, "kl": 6.3515625, "learning_rate": 9.497556818202306e-06, "loss": 0.254, "reward": 1.49609375, "reward_std": 0.6185446828603745, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.69921875, "step": 1411 }, { "clip_ratio": 0.0, "completion_length": 918.5234375, "epoch": 0.5648, "grad_norm": 4.766918037718708, "kl": 6.59375, "learning_rate": 9.483612313899436e-06, "loss": 0.2522, "reward": 1.8203125, "reward_std": 0.4268338233232498, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.7421875, "step": 1412 }, { "clip_ratio": 0.0, "completion_length": 925.75, "epoch": 0.5652, "grad_norm": 2.749770510434662, "kl": 6.2109375, "learning_rate": 9.469668816320785e-06, "loss": 0.2491, "reward": 1.421875, "reward_std": 0.5355380102992058, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.71875, "step": 1413 }, { "clip_ratio": 0.0, "completion_length": 847.875, "epoch": 0.5656, "grad_norm": 26.521866478015465, "kl": 5.828125, "learning_rate": 9.45572635264991e-06, "loss": 0.2328, "reward": 1.787109375, "reward_std": 0.48686185479164124, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.724609375, "step": 1414 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.566, "grad_norm": 10.528290546731455, "kl": 5.6015625, "learning_rate": 9.441784950068362e-06, "loss": 0.2237, "reward": 1.30859375, "reward_std": 0.6487497091293335, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6171875, "rewards/tag_count_reward": 0.69140625, "step": 1415 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5664, "grad_norm": 135.82901325802743, "kl": 4.6015625, "learning_rate": 9.42784463575562e-06, "loss": 0.1839, "reward": 0.716796875, "reward_std": 0.5279682576656342, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.203125, "rewards/tag_count_reward": 0.513671875, "step": 1416 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5668, "grad_norm": 15.537538310311621, "kl": 1.78515625, "learning_rate": 9.413905436889035e-06, "loss": 0.0714, "reward": 0.671875, "reward_std": 0.34688250720500946, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0625, "rewards/tag_count_reward": 0.484375, "step": 1417 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5672, "grad_norm": 115.6545008829073, "kl": 4.078125, "learning_rate": 9.399967380643795e-06, "loss": 0.1634, "reward": 0.638671875, "reward_std": 0.4863635152578354, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.1328125, "rewards/tag_count_reward": 0.498046875, "step": 1418 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5676, "grad_norm": 8.465125368194489, "kl": 2.31640625, "learning_rate": 9.386030494192847e-06, "loss": 0.0926, "reward": 0.646484375, "reward_std": 0.4309869334101677, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.1328125, "rewards/tag_count_reward": 0.513671875, "step": 1419 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.568, "grad_norm": 3.1863942449760607, "kl": 3.42578125, "learning_rate": 9.372094804706867e-06, "loss": 0.1372, "reward": 0.9765625, "reward_std": 0.6932622939348221, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.3515625, "rewards/tag_count_reward": 0.6015625, "step": 1420 }, { "clip_ratio": 0.0, "completion_length": 1017.4765625, "epoch": 0.5684, "grad_norm": 15.355926513615136, "kl": 4.6484375, "learning_rate": 9.358160339354194e-06, "loss": 0.1799, "reward": 1.248046875, "reward_std": 0.6804347038269043, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.578125, "rewards/tag_count_reward": 0.654296875, "step": 1421 }, { "clip_ratio": 0.0, "completion_length": 936.2265625, "epoch": 0.5688, "grad_norm": 33.75128818002873, "kl": 6.2421875, "learning_rate": 9.344227125300788e-06, "loss": 0.2376, "reward": 1.412109375, "reward_std": 0.526820182800293, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.630859375, "step": 1422 }, { "clip_ratio": 0.0, "completion_length": 933.8515625, "epoch": 0.5692, "grad_norm": 15.862790200214997, "kl": 5.90625, "learning_rate": 9.330295189710153e-06, "loss": 0.2306, "reward": 1.373046875, "reward_std": 0.5718606263399124, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.75, "rewards/tag_count_reward": 0.623046875, "step": 1423 }, { "clip_ratio": 0.0, "completion_length": 926.25, "epoch": 0.5696, "grad_norm": 336.28329967683004, "kl": 5.296875, "learning_rate": 9.316364559743315e-06, "loss": 0.2118, "reward": 1.48046875, "reward_std": 0.5524060428142548, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.765625, "rewards/tag_count_reward": 0.58984375, "step": 1424 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.57, "grad_norm": 93.52455286476983, "kl": 7.4609375, "learning_rate": 9.302435262558748e-06, "loss": 0.2982, "reward": 1.455078125, "reward_std": 0.5953521132469177, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.595703125, "step": 1425 }, { "clip_ratio": 0.0, "completion_length": 995.84375, "epoch": 0.5704, "grad_norm": 30.996395514933305, "kl": 5.4765625, "learning_rate": 9.288507325312334e-06, "loss": 0.1897, "reward": 1.435546875, "reward_std": 0.6914304792881012, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.630859375, "step": 1426 }, { "clip_ratio": 0.0, "completion_length": 1018.5234375, "epoch": 0.5708, "grad_norm": 264.8298745791599, "kl": 3.51953125, "learning_rate": 9.274580775157294e-06, "loss": 0.1421, "reward": 1.080078125, "reward_std": 0.6630901843309402, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.5625, "rewards/tag_count_reward": 0.517578125, "step": 1427 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5712, "grad_norm": 67.52037396428398, "kl": 4.20703125, "learning_rate": 9.260655639244152e-06, "loss": 0.1681, "reward": 1.216796875, "reward_std": 0.6932563185691833, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.625, "rewards/tag_count_reward": 0.576171875, "step": 1428 }, { "clip_ratio": 0.0, "completion_length": 1010.8828125, "epoch": 0.5716, "grad_norm": 30.79563939702983, "kl": 5.515625, "learning_rate": 9.246731944720675e-06, "loss": 0.205, "reward": 1.45703125, "reward_std": 0.6285959333181381, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.7109375, "rewards/tag_count_reward": 0.62109375, "step": 1429 }, { "clip_ratio": 0.0, "completion_length": 1017.1953125, "epoch": 0.572, "grad_norm": 95.33708960128095, "kl": 4.19921875, "learning_rate": 9.232809718731815e-06, "loss": 0.1519, "reward": 1.46875, "reward_std": 0.6027989909052849, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.671875, "step": 1430 }, { "clip_ratio": 0.0, "completion_length": 977.2578125, "epoch": 0.5724, "grad_norm": 2.257195016409454, "kl": 6.125, "learning_rate": 9.218888988419668e-06, "loss": 0.1635, "reward": 1.51953125, "reward_std": 0.5209648311138153, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.765625, "rewards/tag_count_reward": 0.75390625, "step": 1431 }, { "clip_ratio": 0.0, "completion_length": 898.0, "epoch": 0.5728, "grad_norm": 3.0309645026023593, "kl": 5.96875, "learning_rate": 9.204969780923404e-06, "loss": 0.192, "reward": 1.521484375, "reward_std": 0.5350270420312881, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.771484375, "step": 1432 }, { "clip_ratio": 0.0, "completion_length": 972.7578125, "epoch": 0.5732, "grad_norm": 5.308764326058561, "kl": 5.8359375, "learning_rate": 9.191052123379234e-06, "loss": 0.178, "reward": 1.560546875, "reward_std": 0.546789214015007, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.6640625, "rewards/tag_count_reward": 0.771484375, "step": 1433 }, { "clip_ratio": 0.0, "completion_length": 967.75, "epoch": 0.5736, "grad_norm": 5.512595523367832, "kl": 5.5234375, "learning_rate": 9.177136042920344e-06, "loss": 0.1464, "reward": 1.458984375, "reward_std": 0.5773710757493973, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.703125, "rewards/tag_count_reward": 0.755859375, "step": 1434 }, { "clip_ratio": 0.0, "completion_length": 1005.4375, "epoch": 0.574, "grad_norm": 39.85133642388948, "kl": 6.46875, "learning_rate": 9.163221566676847e-06, "loss": 0.2448, "reward": 1.2421875, "reward_std": 0.5310853198170662, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.515625, "rewards/tag_count_reward": 0.7265625, "step": 1435 }, { "clip_ratio": 0.0, "completion_length": 917.828125, "epoch": 0.5744, "grad_norm": 3.3907311911439106, "kl": 6.03125, "learning_rate": 9.14930872177572e-06, "loss": 0.2077, "reward": 1.2421875, "reward_std": 0.5958608686923981, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.5390625, "rewards/tag_count_reward": 0.703125, "step": 1436 }, { "clip_ratio": 0.0, "completion_length": 818.2421875, "epoch": 0.5748, "grad_norm": 5.138410841002648, "kl": 5.6640625, "learning_rate": 9.135397535340773e-06, "loss": 0.2103, "reward": 1.32421875, "reward_std": 0.608442634344101, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.609375, "rewards/tag_count_reward": 0.71484375, "step": 1437 }, { "clip_ratio": 0.0, "completion_length": 840.3359375, "epoch": 0.5752, "grad_norm": 7.656414657549047, "kl": 5.953125, "learning_rate": 9.121488034492569e-06, "loss": 0.2034, "reward": 1.578125, "reward_std": 0.586449384689331, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.765625, "step": 1438 }, { "clip_ratio": 0.0, "completion_length": 937.5, "epoch": 0.5756, "grad_norm": 6.0347708915221805, "kl": 6.78125, "learning_rate": 9.107580246348395e-06, "loss": 0.2404, "reward": 1.578125, "reward_std": 0.46219296008348465, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.75, "step": 1439 }, { "clip_ratio": 0.0, "completion_length": 852.3671875, "epoch": 0.576, "grad_norm": 485.51441714612525, "kl": 7.34375, "learning_rate": 9.093674198022201e-06, "loss": 0.2664, "reward": 1.6796875, "reward_std": 0.481221504509449, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.7578125, "step": 1440 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5764, "grad_norm": 96.51175380597198, "kl": 6.0859375, "learning_rate": 9.07976991662453e-06, "loss": 0.2432, "reward": 1.291015625, "reward_std": 0.6728495061397552, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.6640625, "rewards/tag_count_reward": 0.619140625, "step": 1441 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5768, "grad_norm": 38.54070130883371, "kl": 6.8203125, "learning_rate": 9.065867429262497e-06, "loss": 0.2726, "reward": 1.259765625, "reward_std": 0.6568325012922287, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6640625, "rewards/tag_count_reward": 0.595703125, "step": 1442 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5772, "grad_norm": 13.910164738087083, "kl": 5.9921875, "learning_rate": 9.051966763039706e-06, "loss": 0.2399, "reward": 1.314453125, "reward_std": 0.7042272388935089, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.703125, "rewards/tag_count_reward": 0.611328125, "step": 1443 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5776, "grad_norm": 12.042344145754528, "kl": 4.70703125, "learning_rate": 9.038067945056229e-06, "loss": 0.1887, "reward": 1.296875, "reward_std": 0.7747727632522583, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.5625, "rewards/tag_count_reward": 0.5625, "step": 1444 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.578, "grad_norm": 2199758.8768707947, "kl": 10052.390625, "learning_rate": 9.024171002408507e-06, "loss": 402.2307, "reward": 1.30078125, "reward_std": 0.6668940335512161, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.62109375, "step": 1445 }, { "clip_ratio": 0.0, "completion_length": 1017.421875, "epoch": 0.5784, "grad_norm": 24541.72571382156, "kl": 36.04296875, "learning_rate": 9.01027596218935e-06, "loss": 1.5034, "reward": 0.982421875, "reward_std": 0.7479348480701447, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.390625, "rewards/tag_count_reward": 0.591796875, "step": 1446 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5788, "grad_norm": 128.8135123068292, "kl": 4.3359375, "learning_rate": 8.996382851487851e-06, "loss": 0.1734, "reward": 0.775390625, "reward_std": 0.5175662711262703, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.2265625, "rewards/tag_count_reward": 0.548828125, "step": 1447 }, { "clip_ratio": 0.0, "completion_length": 1017.171875, "epoch": 0.5792, "grad_norm": 9.992078761389013, "kl": 5.8984375, "learning_rate": 8.982491697389339e-06, "loss": 0.2298, "reward": 1.423828125, "reward_std": 0.6812544614076614, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.626953125, "step": 1448 }, { "clip_ratio": 0.0, "completion_length": 973.234375, "epoch": 0.5796, "grad_norm": 1.0910301736447714, "kl": 5.078125, "learning_rate": 8.968602526975329e-06, "loss": 0.154, "reward": 1.3515625, "reward_std": 0.7044045478105545, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.6796875, "step": 1449 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.58, "grad_norm": 5.020840790512588, "kl": 4.9140625, "learning_rate": 8.954715367323468e-06, "loss": 0.1965, "reward": 1.255859375, "reward_std": 0.7008561939001083, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6015625, "rewards/tag_count_reward": 0.654296875, "step": 1450 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5804, "grad_norm": 49.584887459054094, "kl": 2.357421875, "learning_rate": 8.940830245507483e-06, "loss": 0.0943, "reward": 0.986328125, "reward_std": 0.6417175680398941, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.3671875, "rewards/tag_count_reward": 0.619140625, "step": 1451 }, { "clip_ratio": 0.0, "completion_length": 1016.46875, "epoch": 0.5808, "grad_norm": 19.410097712386783, "kl": 5.7734375, "learning_rate": 8.926947188597133e-06, "loss": 0.2236, "reward": 1.203125, "reward_std": 0.7563407570123672, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.46875, "rewards/tag_count_reward": 0.6484375, "step": 1452 }, { "clip_ratio": 0.0, "completion_length": 1010.84375, "epoch": 0.5812, "grad_norm": 3.407664171406286, "kl": 6.0390625, "learning_rate": 8.913066223658152e-06, "loss": 0.2262, "reward": 1.5625, "reward_std": 0.5961131602525711, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.78125, "step": 1453 }, { "clip_ratio": 0.0, "completion_length": 1010.46875, "epoch": 0.5816, "grad_norm": 11.2450629439442, "kl": 5.546875, "learning_rate": 8.89918737775218e-06, "loss": 0.2143, "reward": 1.314453125, "reward_std": 0.7126972377300262, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6171875, "rewards/tag_count_reward": 0.697265625, "step": 1454 }, { "clip_ratio": 0.0, "completion_length": 1010.6015625, "epoch": 0.582, "grad_norm": 1.0756118206857648, "kl": 5.1015625, "learning_rate": 8.885310677936746e-06, "loss": 0.1933, "reward": 1.392578125, "reward_std": 0.7343569546937943, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.720703125, "step": 1455 }, { "clip_ratio": 0.0, "completion_length": 890.1171875, "epoch": 0.5824, "grad_norm": 0.7667913055091184, "kl": 5.125, "learning_rate": 8.871436151265183e-06, "loss": 0.1762, "reward": 1.84375, "reward_std": 0.530832551419735, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.8046875, "step": 1456 }, { "clip_ratio": 0.0, "completion_length": 934.9609375, "epoch": 0.5828, "grad_norm": 2.142255783145446, "kl": 5.953125, "learning_rate": 8.857563824786598e-06, "loss": 0.2209, "reward": 1.8671875, "reward_std": 0.5348035395145416, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.8125, "step": 1457 }, { "clip_ratio": 0.0, "completion_length": 902.125, "epoch": 0.5832, "grad_norm": 0.3781840784655161, "kl": 5.484375, "learning_rate": 8.843693725545787e-06, "loss": 0.1822, "reward": 1.59765625, "reward_std": 0.576813206076622, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.80859375, "step": 1458 }, { "clip_ratio": 0.0, "completion_length": 622.3359375, "epoch": 0.5836, "grad_norm": 27.948555044791714, "kl": 4.71484375, "learning_rate": 8.829825880583228e-06, "loss": 0.116, "reward": 1.669921875, "reward_std": 0.5206689238548279, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.841796875, "step": 1459 }, { "clip_ratio": 0.0, "completion_length": 905.6171875, "epoch": 0.584, "grad_norm": 0.3895756682994108, "kl": 6.078125, "learning_rate": 8.815960316934991e-06, "loss": 0.2204, "reward": 1.69140625, "reward_std": 0.5338086411356926, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.84765625, "step": 1460 }, { "clip_ratio": 0.0, "completion_length": 906.390625, "epoch": 0.5844, "grad_norm": 0.3834879388022923, "kl": 6.25, "learning_rate": 8.802097061632706e-06, "loss": 0.1936, "reward": 1.849609375, "reward_std": 0.507087804377079, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.865234375, "step": 1461 }, { "clip_ratio": 0.0, "completion_length": 793.046875, "epoch": 0.5848, "grad_norm": 0.7571042691847577, "kl": 5.78125, "learning_rate": 8.788236141703498e-06, "loss": 0.1817, "reward": 1.69140625, "reward_std": 0.5075175017118454, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.85546875, "step": 1462 }, { "clip_ratio": 0.0, "completion_length": 882.2421875, "epoch": 0.5852, "grad_norm": 0.8064250189330414, "kl": 6.203125, "learning_rate": 8.774377584169934e-06, "loss": 0.1825, "reward": 1.7109375, "reward_std": 0.5089640989899635, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.875, "step": 1463 }, { "clip_ratio": 0.0, "completion_length": 833.34375, "epoch": 0.5856, "grad_norm": 0.6039253520193163, "kl": 6.09375, "learning_rate": 8.760521416049983e-06, "loss": 0.1991, "reward": 1.791015625, "reward_std": 0.4567227438092232, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.916015625, "step": 1464 }, { "clip_ratio": 0.0, "completion_length": 610.671875, "epoch": 0.586, "grad_norm": 0.7351587344090895, "kl": 5.71875, "learning_rate": 8.746667664356957e-06, "loss": 0.2086, "reward": 2.10546875, "reward_std": 0.3397483117878437, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.94140625, "step": 1465 }, { "clip_ratio": 0.0, "completion_length": 807.53125, "epoch": 0.5864, "grad_norm": 0.5348650470808051, "kl": 6.171875, "learning_rate": 8.732816356099455e-06, "loss": 0.2303, "reward": 1.78515625, "reward_std": 0.4296867847442627, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.92578125, "step": 1466 }, { "clip_ratio": 0.0, "completion_length": 704.1484375, "epoch": 0.5868, "grad_norm": 30.809656831455165, "kl": 7.3828125, "learning_rate": 8.718967518281307e-06, "loss": 0.2879, "reward": 1.9453125, "reward_std": 0.4239741452038288, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.953125, "step": 1467 }, { "clip_ratio": 0.0, "completion_length": 912.0, "epoch": 0.5872, "grad_norm": 85.84892002998136, "kl": 5.1171875, "learning_rate": 8.705121177901532e-06, "loss": 0.2045, "reward": 1.005859375, "reward_std": 0.5380842834711075, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.2265625, "rewards/tag_count_reward": 0.779296875, "step": 1468 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5876, "grad_norm": 1.9858723834625764, "kl": 4.65625, "learning_rate": 8.69127736195428e-06, "loss": 0.1865, "reward": 1.419921875, "reward_std": 0.6338995695114136, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.453125, "rewards/tag_count_reward": 0.841796875, "step": 1469 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.588, "grad_norm": 1.5992000410772569, "kl": 4.609375, "learning_rate": 8.677436097428775e-06, "loss": 0.1841, "reward": 1.494140625, "reward_std": 0.6469445079565048, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.515625, "rewards/tag_count_reward": 0.853515625, "step": 1470 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5884, "grad_norm": 0.45876066892735595, "kl": 5.9765625, "learning_rate": 8.663597411309278e-06, "loss": 0.239, "reward": 1.68359375, "reward_std": 0.6403552740812302, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.71875, "rewards/tag_count_reward": 0.90234375, "step": 1471 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5888, "grad_norm": 1.9956664961322002, "kl": 6.8046875, "learning_rate": 8.649761330575009e-06, "loss": 0.2717, "reward": 1.60546875, "reward_std": 0.6340619772672653, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.71875, "rewards/tag_count_reward": 0.88671875, "step": 1472 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5892, "grad_norm": 2.053636183002416, "kl": 7.046875, "learning_rate": 8.635927882200117e-06, "loss": 0.2819, "reward": 1.8203125, "reward_std": 0.473195381462574, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.953125, "step": 1473 }, { "clip_ratio": 0.0, "completion_length": 817.25, "epoch": 0.5896, "grad_norm": 8.442885997696061, "kl": 7.1796875, "learning_rate": 8.62209709315362e-06, "loss": 0.2872, "reward": 1.84375, "reward_std": 0.4411155506968498, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.9453125, "step": 1474 }, { "clip_ratio": 0.0, "completion_length": 824.5, "epoch": 0.59, "grad_norm": 1.4375915205680743, "kl": 6.03125, "learning_rate": 8.60826899039935e-06, "loss": 0.2409, "reward": 1.958984375, "reward_std": 0.5474384427070618, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.919921875, "step": 1475 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5904, "grad_norm": 109.23809944442671, "kl": 3.94921875, "learning_rate": 8.594443600895892e-06, "loss": 0.158, "reward": 1.4453125, "reward_std": 0.6522954553365707, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6328125, "rewards/tag_count_reward": 0.8125, "step": 1476 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5908, "grad_norm": 99.8453227742556, "kl": 3.859375, "learning_rate": 8.580620951596556e-06, "loss": 0.1542, "reward": 1.412109375, "reward_std": 0.6909126341342926, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6171875, "rewards/tag_count_reward": 0.794921875, "step": 1477 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5912, "grad_norm": 62.579000581312414, "kl": 5.875, "learning_rate": 8.566801069449307e-06, "loss": 0.2354, "reward": 1.53125, "reward_std": 0.7288174480199814, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.625, "rewards/tag_count_reward": 0.78125, "step": 1478 }, { "clip_ratio": 0.0, "completion_length": 932.75, "epoch": 0.5916, "grad_norm": 55.03030702582512, "kl": 7.1640625, "learning_rate": 8.552983981396709e-06, "loss": 0.2865, "reward": 1.70703125, "reward_std": 0.5021977797150612, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.90234375, "step": 1479 }, { "clip_ratio": 0.0, "completion_length": 805.125, "epoch": 0.592, "grad_norm": 13.949810302397525, "kl": 6.1328125, "learning_rate": 8.539169714375885e-06, "loss": 0.2451, "reward": 1.73828125, "reward_std": 0.49959027022123337, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.91015625, "step": 1480 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5924, "grad_norm": 275.47429709890486, "kl": 6.5625, "learning_rate": 8.525358295318454e-06, "loss": 0.2625, "reward": 1.4765625, "reward_std": 0.7545635849237442, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.8046875, "step": 1481 }, { "clip_ratio": 0.0, "completion_length": 915.875, "epoch": 0.5928, "grad_norm": 8.548031863117561, "kl": 5.8828125, "learning_rate": 8.511549751150478e-06, "loss": 0.2354, "reward": 1.470703125, "reward_std": 0.693891853094101, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.791015625, "step": 1482 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5932, "grad_norm": 112.66088721946039, "kl": 5.5078125, "learning_rate": 8.49774410879243e-06, "loss": 0.2204, "reward": 1.26171875, "reward_std": 0.8508214801549911, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.546875, "rewards/tag_count_reward": 0.69140625, "step": 1483 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5936, "grad_norm": 8.769098736168454, "kl": 5.578125, "learning_rate": 8.483941395159114e-06, "loss": 0.2225, "reward": 1.390625, "reward_std": 0.820626750588417, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6328125, "rewards/tag_count_reward": 0.7578125, "step": 1484 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.594, "grad_norm": 1.0838909075981185, "kl": 4.62890625, "learning_rate": 8.47014163715962e-06, "loss": 0.1854, "reward": 1.50390625, "reward_std": 0.9129415899515152, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.53125, "rewards/tag_count_reward": 0.70703125, "step": 1485 }, { "clip_ratio": 0.0, "completion_length": 620.375, "epoch": 0.5944, "grad_norm": 4.624679083632154, "kl": 5.859375, "learning_rate": 8.45634486169729e-06, "loss": 0.2342, "reward": 1.869140625, "reward_std": 0.34751949459314346, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.955078125, "step": 1486 }, { "clip_ratio": 0.0, "completion_length": 605.375, "epoch": 0.5948, "grad_norm": 5.285178176505311, "kl": 6.15625, "learning_rate": 8.44255109566964e-06, "loss": 0.2464, "reward": 2.06640625, "reward_std": 0.18092159926891327, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.98046875, "step": 1487 }, { "clip_ratio": 0.0, "completion_length": 918.5, "epoch": 0.5952, "grad_norm": 13.403581173264916, "kl": 6.828125, "learning_rate": 8.428760365968327e-06, "loss": 0.2733, "reward": 1.85546875, "reward_std": 0.3892829865217209, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.92578125, "step": 1488 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5956, "grad_norm": 21.777825702468057, "kl": 4.63671875, "learning_rate": 8.414972699479076e-06, "loss": 0.1858, "reward": 1.744140625, "reward_std": 0.394255168735981, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.775390625, "step": 1489 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.596, "grad_norm": 11.211125681305813, "kl": 1.810546875, "learning_rate": 8.401188123081653e-06, "loss": 0.0725, "reward": 1.62890625, "reward_std": 0.6439783573150635, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.78515625, "step": 1490 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5964, "grad_norm": 10.289464407994338, "kl": 2.564453125, "learning_rate": 8.387406663649796e-06, "loss": 0.1026, "reward": 1.521484375, "reward_std": 0.5099193081259727, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.677734375, "step": 1491 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5968, "grad_norm": 21.929152339480453, "kl": 6.009765625, "learning_rate": 8.373628348051165e-06, "loss": 0.2403, "reward": 1.85546875, "reward_std": 0.3348463773727417, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.66796875, "step": 1492 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5972, "grad_norm": 27.686615936036755, "kl": 5.3671875, "learning_rate": 8.35985320314729e-06, "loss": 0.2146, "reward": 1.72265625, "reward_std": 0.4356834292411804, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.61328125, "step": 1493 }, { "clip_ratio": 0.0, "completion_length": 902.375, "epoch": 0.5976, "grad_norm": 16.40852450945125, "kl": 3.11328125, "learning_rate": 8.346081255793524e-06, "loss": 0.1248, "reward": 1.61328125, "reward_std": 0.2675826624035835, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.66015625, "step": 1494 }, { "clip_ratio": 0.0, "completion_length": 540.3671875, "epoch": 0.598, "grad_norm": 15.424661388697661, "kl": 3.0546875, "learning_rate": 8.332312532838978e-06, "loss": 0.1116, "reward": 1.9453125, "reward_std": 0.14674776792526245, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.7265625, "step": 1495 }, { "clip_ratio": 0.0, "completion_length": 342.625, "epoch": 0.5984, "grad_norm": 2.645566096976535, "kl": 3.80859375, "learning_rate": 8.318547061126485e-06, "loss": 0.152, "reward": 1.966796875, "reward_std": 0.17346668615937233, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.857421875, "step": 1496 }, { "clip_ratio": 0.0, "completion_length": 504.625, "epoch": 0.5988, "grad_norm": 1.091434317432056, "kl": 5.4921875, "learning_rate": 8.30478486749254e-06, "loss": 0.2201, "reward": 1.953125, "reward_std": 0.17759781330823898, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 1497 }, { "clip_ratio": 0.0, "completion_length": 520.875, "epoch": 0.5992, "grad_norm": 1.2087313942247175, "kl": 5.5078125, "learning_rate": 8.291025978767236e-06, "loss": 0.2207, "reward": 2.021484375, "reward_std": 0.2584594264626503, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.958984375, "step": 1498 }, { "clip_ratio": 0.0, "completion_length": 343.0, "epoch": 0.5996, "grad_norm": 0.44082800964933977, "kl": 4.66796875, "learning_rate": 8.277270421774234e-06, "loss": 0.1865, "reward": 1.978515625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 1499 }, { "clip_ratio": 0.0, "completion_length": 517.25, "epoch": 0.6, "grad_norm": 0.5022554206576282, "kl": 5.546875, "learning_rate": 8.263518223330698e-06, "loss": 0.2222, "reward": 2.09375, "reward_std": 0.2716270014643669, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9609375, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 412.125, "epoch": 0.6004, "grad_norm": 0.5320554938398091, "kl": 5.3984375, "learning_rate": 8.249769410247239e-06, "loss": 0.2163, "reward": 1.96484375, "reward_std": 0.140625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1501 }, { "clip_ratio": 0.0, "completion_length": 629.875, "epoch": 0.6008, "grad_norm": 0.862304448641457, "kl": 5.9296875, "learning_rate": 8.236024009327879e-06, "loss": 0.2374, "reward": 2.15625, "reward_std": 0.30612194538116455, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 1502 }, { "clip_ratio": 0.0, "completion_length": 241.875, "epoch": 0.6012, "grad_norm": 0.9624331871876929, "kl": 5.171875, "learning_rate": 8.222282047369972e-06, "loss": 0.2072, "reward": 1.978515625, "reward_std": 0.07838455587625504, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 1503 }, { "clip_ratio": 0.0, "completion_length": 310.625, "epoch": 0.6016, "grad_norm": 2.083982150697921, "kl": 5.6953125, "learning_rate": 8.208543551164178e-06, "loss": 0.2276, "reward": 2.02734375, "reward_std": 0.1656431294977665, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1504 }, { "clip_ratio": 0.0, "completion_length": 509.25, "epoch": 0.602, "grad_norm": 21.05693334766354, "kl": 5.6484375, "learning_rate": 8.194808547494401e-06, "loss": 0.226, "reward": 1.90625, "reward_std": 0.3160976693034172, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96875, "step": 1505 }, { "clip_ratio": 0.0, "completion_length": 828.625, "epoch": 0.6024, "grad_norm": 0.8963213567349052, "kl": 6.6796875, "learning_rate": 8.181077063137733e-06, "loss": 0.2668, "reward": 1.875, "reward_std": 0.3741498291492462, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.953125, "step": 1506 }, { "clip_ratio": 0.0, "completion_length": 818.875, "epoch": 0.6028, "grad_norm": 5.603834195693048, "kl": 7.3359375, "learning_rate": 8.167349124864406e-06, "loss": 0.2933, "reward": 1.822265625, "reward_std": 0.4368221387267113, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.931640625, "step": 1507 }, { "clip_ratio": 0.0, "completion_length": 714.0, "epoch": 0.6032, "grad_norm": 3.4708735063959955, "kl": 6.5703125, "learning_rate": 8.153624759437733e-06, "loss": 0.2633, "reward": 1.876953125, "reward_std": 0.31104468926787376, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.955078125, "step": 1508 }, { "clip_ratio": 0.0, "completion_length": 919.25, "epoch": 0.6036, "grad_norm": 2.0176673822911173, "kl": 6.90625, "learning_rate": 8.139903993614069e-06, "loss": 0.2764, "reward": 1.98046875, "reward_std": 0.5023251101374626, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94140625, "step": 1509 }, { "clip_ratio": 0.0, "completion_length": 920.0, "epoch": 0.604, "grad_norm": 2.661757855970237, "kl": 7.390625, "learning_rate": 8.126186854142752e-06, "loss": 0.2954, "reward": 1.9296875, "reward_std": 0.47456425428390503, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.9453125, "step": 1510 }, { "clip_ratio": 0.0, "completion_length": 820.125, "epoch": 0.6044, "grad_norm": 2.235985634062661, "kl": 6.6640625, "learning_rate": 8.112473367766051e-06, "loss": 0.2662, "reward": 1.97265625, "reward_std": 0.3840191289782524, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.95703125, "step": 1511 }, { "clip_ratio": 0.0, "completion_length": 638.125, "epoch": 0.6048, "grad_norm": 2.5442693543054893, "kl": 5.5, "learning_rate": 8.098763561219101e-06, "loss": 0.2202, "reward": 1.994140625, "reward_std": 0.2872113138437271, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.955078125, "step": 1512 }, { "clip_ratio": 0.0, "completion_length": 929.0, "epoch": 0.6052, "grad_norm": 2.283095406785078, "kl": 6.03125, "learning_rate": 8.08505746122987e-06, "loss": 0.2411, "reward": 1.97265625, "reward_std": 0.34544968605041504, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.94921875, "step": 1513 }, { "clip_ratio": 0.0, "completion_length": 837.5, "epoch": 0.6056, "grad_norm": 1.5763171673243725, "kl": 5.6484375, "learning_rate": 8.07135509451911e-06, "loss": 0.226, "reward": 1.880859375, "reward_std": 0.3366757184267044, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1514 }, { "clip_ratio": 0.0, "completion_length": 929.25, "epoch": 0.606, "grad_norm": 5.883834265520073, "kl": 6.0390625, "learning_rate": 8.057656487800283e-06, "loss": 0.2416, "reward": 2.119140625, "reward_std": 0.368885762989521, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 1515 }, { "clip_ratio": 0.0, "completion_length": 545.125, "epoch": 0.6064, "grad_norm": 0.627120379558695, "kl": 5.578125, "learning_rate": 8.04396166777952e-06, "loss": 0.2232, "reward": 1.9375, "reward_std": 0.188387930393219, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 1516 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6068, "grad_norm": 1.9146760249073473, "kl": 6.828125, "learning_rate": 8.030270661155575e-06, "loss": 0.2734, "reward": 1.970703125, "reward_std": 0.4731467068195343, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.939453125, "step": 1517 }, { "clip_ratio": 0.0, "completion_length": 653.125, "epoch": 0.6072, "grad_norm": 0.4360673768916708, "kl": 5.6484375, "learning_rate": 8.016583494619769e-06, "loss": 0.2261, "reward": 1.91796875, "reward_std": 0.2540379762649536, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1518 }, { "clip_ratio": 0.0, "completion_length": 658.125, "epoch": 0.6076, "grad_norm": 0.3950844873702678, "kl": 5.6015625, "learning_rate": 8.00290019485593e-06, "loss": 0.2238, "reward": 1.896484375, "reward_std": 0.2675948962569237, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.958984375, "step": 1519 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.608, "grad_norm": 6.379274293170597, "kl": 7.0390625, "learning_rate": 7.989220788540356e-06, "loss": 0.2816, "reward": 1.8046875, "reward_std": 0.5228014588356018, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.9296875, "step": 1520 }, { "clip_ratio": 0.0, "completion_length": 662.625, "epoch": 0.6084, "grad_norm": 0.7056585172283016, "kl": 5.65625, "learning_rate": 7.975545302341743e-06, "loss": 0.2262, "reward": 1.9375, "reward_std": 0.22029343992471695, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 1521 }, { "clip_ratio": 0.0, "completion_length": 593.875, "epoch": 0.6088, "grad_norm": 1.5168566547844184, "kl": 5.3359375, "learning_rate": 7.961873762921153e-06, "loss": 0.2136, "reward": 1.927734375, "reward_std": 0.18976997584104538, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.974609375, "step": 1522 }, { "clip_ratio": 0.0, "completion_length": 741.625, "epoch": 0.6092, "grad_norm": 1.1353013561628382, "kl": 6.03125, "learning_rate": 7.948206196931953e-06, "loss": 0.2413, "reward": 1.923828125, "reward_std": 0.29691219329833984, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 1523 }, { "clip_ratio": 0.0, "completion_length": 861.0, "epoch": 0.6096, "grad_norm": 0.515602051208302, "kl": 6.515625, "learning_rate": 7.934542631019767e-06, "loss": 0.2607, "reward": 1.82421875, "reward_std": 0.4151355251669884, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.94140625, "step": 1524 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.61, "grad_norm": 3.69468010082983, "kl": 6.6796875, "learning_rate": 7.92088309182241e-06, "loss": 0.267, "reward": 1.79296875, "reward_std": 0.48415693640708923, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.93359375, "step": 1525 }, { "clip_ratio": 0.0, "completion_length": 747.625, "epoch": 0.6104, "grad_norm": 8.131498211388633, "kl": 6.46875, "learning_rate": 7.907227605969849e-06, "loss": 0.2588, "reward": 1.884765625, "reward_std": 0.32052288949489594, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 1526 }, { "clip_ratio": 0.0, "completion_length": 841.5, "epoch": 0.6108, "grad_norm": 1.1217812339274928, "kl": 6.671875, "learning_rate": 7.89357620008416e-06, "loss": 0.2671, "reward": 1.80859375, "reward_std": 0.43722061440348625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.92578125, "step": 1527 }, { "clip_ratio": 0.0, "completion_length": 928.0, "epoch": 0.6112, "grad_norm": 0.9374020349718699, "kl": 6.2265625, "learning_rate": 7.879928900779457e-06, "loss": 0.2486, "reward": 1.970703125, "reward_std": 0.5771021917462349, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.916015625, "step": 1528 }, { "clip_ratio": 0.0, "completion_length": 923.375, "epoch": 0.6116, "grad_norm": 1.088472954724334, "kl": 6.765625, "learning_rate": 7.866285734661842e-06, "loss": 0.2705, "reward": 1.921875, "reward_std": 0.49129824340343475, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.921875, "step": 1529 }, { "clip_ratio": 0.0, "completion_length": 748.125, "epoch": 0.612, "grad_norm": 0.5300157750002871, "kl": 6.34375, "learning_rate": 7.852646728329368e-06, "loss": 0.2532, "reward": 2.078125, "reward_std": 0.3482198864221573, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96875, "step": 1530 }, { "clip_ratio": 0.0, "completion_length": 840.875, "epoch": 0.6124, "grad_norm": 0.3201680670466735, "kl": 6.0078125, "learning_rate": 7.83901190837198e-06, "loss": 0.2407, "reward": 1.861328125, "reward_std": 0.4077613055706024, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.947265625, "step": 1531 }, { "clip_ratio": 0.0, "completion_length": 571.25, "epoch": 0.6128, "grad_norm": 0.3191125319461573, "kl": 5.5625, "learning_rate": 7.825381301371452e-06, "loss": 0.2229, "reward": 2.041015625, "reward_std": 0.2109375, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1532 }, { "clip_ratio": 0.0, "completion_length": 697.75, "epoch": 0.6132, "grad_norm": 0.3939987411324822, "kl": 5.8203125, "learning_rate": 7.811754933901358e-06, "loss": 0.233, "reward": 1.90625, "reward_std": 0.2686697915196419, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 1533 }, { "clip_ratio": 0.0, "completion_length": 751.25, "epoch": 0.6136, "grad_norm": 0.27645958979350266, "kl": 6.546875, "learning_rate": 7.798132832526986e-06, "loss": 0.2616, "reward": 1.935546875, "reward_std": 0.2578125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1534 }, { "clip_ratio": 0.0, "completion_length": 642.0, "epoch": 0.614, "grad_norm": 0.6304923444640985, "kl": 5.984375, "learning_rate": 7.784515023805328e-06, "loss": 0.2393, "reward": 1.935546875, "reward_std": 0.22143208980560303, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1535 }, { "clip_ratio": 0.0, "completion_length": 605.375, "epoch": 0.6144, "grad_norm": 0.6097473561806409, "kl": 5.625, "learning_rate": 7.770901534284996e-06, "loss": 0.225, "reward": 2.07421875, "reward_std": 0.17117708921432495, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1536 }, { "clip_ratio": 0.0, "completion_length": 604.125, "epoch": 0.6148, "grad_norm": 0.4004861892444212, "kl": 5.515625, "learning_rate": 7.757292390506191e-06, "loss": 0.2206, "reward": 1.94921875, "reward_std": 0.223192036151886, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.97265625, "step": 1537 }, { "clip_ratio": 0.0, "completion_length": 564.625, "epoch": 0.6152, "grad_norm": 0.43306988352954534, "kl": 5.01171875, "learning_rate": 7.743687619000625e-06, "loss": 0.2002, "reward": 1.94140625, "reward_std": 0.234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.98046875, "step": 1538 }, { "clip_ratio": 0.0, "completion_length": 601.75, "epoch": 0.6156, "grad_norm": 0.2956942040059223, "kl": 6.09375, "learning_rate": 7.730087246291503e-06, "loss": 0.2441, "reward": 2.09765625, "reward_std": 0.2272624969482422, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1539 }, { "clip_ratio": 0.0, "completion_length": 517.375, "epoch": 0.616, "grad_norm": 0.14732243498188685, "kl": 5.21875, "learning_rate": 7.716491298893443e-06, "loss": 0.2091, "reward": 1.974609375, "reward_std": 0.09378718957304955, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1540 }, { "clip_ratio": 0.0, "completion_length": 513.0, "epoch": 0.6164, "grad_norm": 0.15514400338685372, "kl": 5.5234375, "learning_rate": 7.702899803312443e-06, "loss": 0.2209, "reward": 1.974609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1541 }, { "clip_ratio": 0.0, "completion_length": 848.75, "epoch": 0.6168, "grad_norm": 0.25479425305862263, "kl": 6.6640625, "learning_rate": 7.689312786045823e-06, "loss": 0.2663, "reward": 1.912109375, "reward_std": 0.32460808008909225, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1542 }, { "clip_ratio": 0.0, "completion_length": 467.5, "epoch": 0.6172, "grad_norm": 0.5024149412255, "kl": 5.6875, "learning_rate": 7.67573027358216e-06, "loss": 0.2275, "reward": 2.083984375, "reward_std": 0.1339605376124382, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1543 }, { "clip_ratio": 0.0, "completion_length": 472.546875, "epoch": 0.6176, "grad_norm": 0.2045667688918987, "kl": 5.265625, "learning_rate": 7.662152292401265e-06, "loss": 0.2041, "reward": 2.091796875, "reward_std": 0.1328125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1544 }, { "clip_ratio": 0.0, "completion_length": 514.75, "epoch": 0.618, "grad_norm": 0.6373549777300336, "kl": 5.4375, "learning_rate": 7.6485788689741e-06, "loss": 0.2179, "reward": 1.9453125, "reward_std": 0.14018996804952621, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1545 }, { "clip_ratio": 0.0, "completion_length": 730.875, "epoch": 0.6184, "grad_norm": 0.410251161913121, "kl": 6.484375, "learning_rate": 7.635010029762755e-06, "loss": 0.2596, "reward": 2.064453125, "reward_std": 0.2421875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1546 }, { "clip_ratio": 0.0, "completion_length": 650.625, "epoch": 0.6188, "grad_norm": 0.15917993330777677, "kl": 5.703125, "learning_rate": 7.621445801220372e-06, "loss": 0.2287, "reward": 1.955078125, "reward_std": 0.1796875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 1547 }, { "clip_ratio": 0.0, "completion_length": 553.0, "epoch": 0.6192, "grad_norm": 0.29519321642914526, "kl": 6.09375, "learning_rate": 7.6078862097911075e-06, "loss": 0.2438, "reward": 1.990234375, "reward_std": 0.14425812661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1548 }, { "clip_ratio": 0.0, "completion_length": 845.875, "epoch": 0.6196, "grad_norm": 0.34049224560757585, "kl": 6.4609375, "learning_rate": 7.594331281910082e-06, "loss": 0.2586, "reward": 1.890625, "reward_std": 0.35110122337937355, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.9609375, "step": 1549 }, { "clip_ratio": 0.0, "completion_length": 552.625, "epoch": 0.62, "grad_norm": 0.18437372976303837, "kl": 5.640625, "learning_rate": 7.580781044003324e-06, "loss": 0.2251, "reward": 1.935546875, "reward_std": 0.18266618251800537, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1550 }, { "clip_ratio": 0.0, "completion_length": 478.25, "epoch": 0.6204, "grad_norm": 0.2144833780148728, "kl": 5.578125, "learning_rate": 7.5672355224877115e-06, "loss": 0.2232, "reward": 1.98828125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 1551 }, { "clip_ratio": 0.0, "completion_length": 766.875, "epoch": 0.6208, "grad_norm": 0.9044295060247303, "kl": 6.359375, "learning_rate": 7.553694743770928e-06, "loss": 0.2546, "reward": 2.09765625, "reward_std": 0.4152570590376854, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.94921875, "step": 1552 }, { "clip_ratio": 0.0, "completion_length": 631.0, "epoch": 0.6212, "grad_norm": 0.3041454644843442, "kl": 6.0234375, "learning_rate": 7.54015873425142e-06, "loss": 0.2406, "reward": 2.095703125, "reward_std": 0.25401634722948074, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1553 }, { "clip_ratio": 0.0, "completion_length": 660.125, "epoch": 0.6216, "grad_norm": 0.2077242388303748, "kl": 5.78125, "learning_rate": 7.526627520318329e-06, "loss": 0.2314, "reward": 1.9765625, "reward_std": 0.3175232410430908, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 1554 }, { "clip_ratio": 0.0, "completion_length": 844.0859375, "epoch": 0.622, "grad_norm": 0.37689953840337037, "kl": 6.34375, "learning_rate": 7.513101128351454e-06, "loss": 0.24, "reward": 1.896484375, "reward_std": 0.3555157035589218, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.958984375, "step": 1555 }, { "clip_ratio": 0.0, "completion_length": 768.125, "epoch": 0.6224, "grad_norm": 0.22768123256981432, "kl": 5.9375, "learning_rate": 7.49957958472118e-06, "loss": 0.2376, "reward": 1.90234375, "reward_std": 0.29774222522974014, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96484375, "step": 1556 }, { "clip_ratio": 0.0, "completion_length": 648.375, "epoch": 0.6228, "grad_norm": 0.30347013388437805, "kl": 5.53125, "learning_rate": 7.486062915788453e-06, "loss": 0.2212, "reward": 1.94921875, "reward_std": 0.203125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1557 }, { "clip_ratio": 0.0, "completion_length": 824.5, "epoch": 0.6232, "grad_norm": 0.39783640181387236, "kl": 6.765625, "learning_rate": 7.472551147904708e-06, "loss": 0.2707, "reward": 1.916015625, "reward_std": 0.27844521403312683, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 1558 }, { "clip_ratio": 0.0, "completion_length": 587.5, "epoch": 0.6236, "grad_norm": 1.7484369411006109, "kl": 5.5390625, "learning_rate": 7.4590443074118325e-06, "loss": 0.2213, "reward": 2.013671875, "reward_std": 0.23290780186653137, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1559 }, { "clip_ratio": 0.0, "completion_length": 447.625, "epoch": 0.624, "grad_norm": 0.838206246849219, "kl": 4.53515625, "learning_rate": 7.445542420642097e-06, "loss": 0.1811, "reward": 1.966796875, "reward_std": 0.10585808008909225, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1560 }, { "clip_ratio": 0.0, "completion_length": 839.875, "epoch": 0.6244, "grad_norm": 0.40947925936578816, "kl": 6.125, "learning_rate": 7.432045513918122e-06, "loss": 0.245, "reward": 2.162109375, "reward_std": 0.3228645622730255, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1561 }, { "clip_ratio": 0.0, "completion_length": 855.375, "epoch": 0.6248, "grad_norm": 25.299898794616297, "kl": 6.390625, "learning_rate": 7.418553613552824e-06, "loss": 0.2554, "reward": 2.02734375, "reward_std": 0.3536948561668396, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.95703125, "step": 1562 }, { "clip_ratio": 0.0, "completion_length": 747.625, "epoch": 0.6252, "grad_norm": 0.1888415882355029, "kl": 6.1796875, "learning_rate": 7.405066745849347e-06, "loss": 0.2471, "reward": 1.990234375, "reward_std": 0.28975439071655273, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.974609375, "step": 1563 }, { "clip_ratio": 0.0, "completion_length": 635.5, "epoch": 0.6256, "grad_norm": 22.44151936938105, "kl": 6.1953125, "learning_rate": 7.391584937101034e-06, "loss": 0.2477, "reward": 1.935546875, "reward_std": 0.2578125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1564 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.626, "grad_norm": 45.67911982637413, "kl": 6.8671875, "learning_rate": 7.378108213591355e-06, "loss": 0.2749, "reward": 1.818359375, "reward_std": 0.49386974424123764, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.943359375, "step": 1565 }, { "clip_ratio": 0.0, "completion_length": 921.875, "epoch": 0.6264, "grad_norm": 0.8154284829509723, "kl": 6.609375, "learning_rate": 7.364636601593875e-06, "loss": 0.2647, "reward": 1.80078125, "reward_std": 0.42649824917316437, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.93359375, "step": 1566 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6268, "grad_norm": 32.40704852253096, "kl": 4.79296875, "learning_rate": 7.351170127372191e-06, "loss": 0.1918, "reward": 1.806640625, "reward_std": 0.46892525255680084, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.939453125, "step": 1567 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6272, "grad_norm": 2.5431984036450084, "kl": 6.390625, "learning_rate": 7.33770881717989e-06, "loss": 0.2553, "reward": 2.0, "reward_std": 0.579082190990448, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.90625, "step": 1568 }, { "clip_ratio": 0.0, "completion_length": 601.5, "epoch": 0.6276, "grad_norm": 10.214993314438788, "kl": 5.90625, "learning_rate": 7.324252697260475e-06, "loss": 0.2359, "reward": 2.0625, "reward_std": 0.25068528950214386, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9765625, "step": 1569 }, { "clip_ratio": 0.0, "completion_length": 705.0, "epoch": 0.628, "grad_norm": 0.6149278829622873, "kl": 6.3125, "learning_rate": 7.310801793847344e-06, "loss": 0.2522, "reward": 2.04296875, "reward_std": 0.24991613626480103, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.98046875, "step": 1570 }, { "clip_ratio": 0.0, "completion_length": 722.0, "epoch": 0.6284, "grad_norm": 0.1827564830279425, "kl": 6.203125, "learning_rate": 7.297356133163722e-06, "loss": 0.248, "reward": 1.875, "reward_std": 0.3267768993973732, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9609375, "step": 1571 }, { "clip_ratio": 0.0, "completion_length": 716.5, "epoch": 0.6288, "grad_norm": 0.3012604579591866, "kl": 6.203125, "learning_rate": 7.283915741422611e-06, "loss": 0.2482, "reward": 1.91796875, "reward_std": 0.2911948561668396, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1572 }, { "clip_ratio": 0.0, "completion_length": 550.25, "epoch": 0.6292, "grad_norm": 0.5127283481954164, "kl": 5.5859375, "learning_rate": 7.27048064482675e-06, "loss": 0.2238, "reward": 2.234375, "reward_std": 0.31659550219774246, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1573 }, { "clip_ratio": 0.0, "completion_length": 711.75, "epoch": 0.6296, "grad_norm": 0.2830680833813688, "kl": 6.4765625, "learning_rate": 7.257050869568536e-06, "loss": 0.2591, "reward": 1.9375, "reward_std": 0.2198980376124382, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 1574 }, { "clip_ratio": 0.0, "completion_length": 811.875, "epoch": 0.63, "grad_norm": 0.34740567625438795, "kl": 6.75, "learning_rate": 7.243626441830009e-06, "loss": 0.2691, "reward": 1.962890625, "reward_std": 0.36877574026584625, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1575 }, { "clip_ratio": 0.0, "completion_length": 646.625, "epoch": 0.6304, "grad_norm": 0.3250680084515875, "kl": 5.921875, "learning_rate": 7.2302073877827775e-06, "loss": 0.237, "reward": 2.013671875, "reward_std": 0.34426628798246384, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1576 }, { "clip_ratio": 0.0, "completion_length": 552.5, "epoch": 0.6308, "grad_norm": 0.23670634625507228, "kl": 5.6171875, "learning_rate": 7.216793733587976e-06, "loss": 0.2245, "reward": 1.951171875, "reward_std": 0.1666145622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1577 }, { "clip_ratio": 0.0, "completion_length": 449.5, "epoch": 0.6312, "grad_norm": 1.181948359284794, "kl": 5.1640625, "learning_rate": 7.203385505396203e-06, "loss": 0.2058, "reward": 2.078125, "reward_std": 0.17525971680879593, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1578 }, { "clip_ratio": 0.0, "completion_length": 833.125, "epoch": 0.6316, "grad_norm": 1.0605711453879787, "kl": 5.875, "learning_rate": 7.189982729347491e-06, "loss": 0.2348, "reward": 1.853515625, "reward_std": 0.40387725085020065, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 1579 }, { "clip_ratio": 0.0, "completion_length": 927.75, "epoch": 0.632, "grad_norm": 0.9050052002030526, "kl": 6.421875, "learning_rate": 7.176585431571235e-06, "loss": 0.257, "reward": 1.84765625, "reward_std": 0.4126622676849365, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.94921875, "step": 1580 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6324, "grad_norm": 5.899901534413347, "kl": 6.7890625, "learning_rate": 7.163193638186159e-06, "loss": 0.2713, "reward": 1.77734375, "reward_std": 0.5040597468614578, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.94140625, "step": 1581 }, { "clip_ratio": 0.0, "completion_length": 931.625, "epoch": 0.6328, "grad_norm": 0.5743850334989685, "kl": 6.5390625, "learning_rate": 7.149807375300239e-06, "loss": 0.2618, "reward": 1.91796875, "reward_std": 0.46796268224716187, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.93359375, "step": 1582 }, { "clip_ratio": 0.0, "completion_length": 841.75, "epoch": 0.6332, "grad_norm": 3.606931163015117, "kl": 6.390625, "learning_rate": 7.13642666901069e-06, "loss": 0.2556, "reward": 1.92578125, "reward_std": 0.35729893296957016, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.97265625, "step": 1583 }, { "clip_ratio": 0.0, "completion_length": 749.0, "epoch": 0.6336, "grad_norm": 0.18210402726945873, "kl": 6.3046875, "learning_rate": 7.123051545403874e-06, "loss": 0.2524, "reward": 2.046875, "reward_std": 0.3317646309733391, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 1584 }, { "clip_ratio": 0.0, "completion_length": 745.875, "epoch": 0.634, "grad_norm": 0.5523037105070827, "kl": 5.6328125, "learning_rate": 7.109682030555283e-06, "loss": 0.2256, "reward": 1.97265625, "reward_std": 0.34424538910388947, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.96484375, "step": 1585 }, { "clip_ratio": 0.0, "completion_length": 671.5, "epoch": 0.6344, "grad_norm": 1.4259837044751957, "kl": 5.890625, "learning_rate": 7.096318150529476e-06, "loss": 0.236, "reward": 2.01171875, "reward_std": 0.3236183822154999, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.95703125, "step": 1586 }, { "clip_ratio": 0.0, "completion_length": 662.875, "epoch": 0.6348, "grad_norm": 0.3048683905027931, "kl": 6.0390625, "learning_rate": 7.082959931380011e-06, "loss": 0.2417, "reward": 2.0703125, "reward_std": 0.20329221710562706, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1587 }, { "clip_ratio": 0.0, "completion_length": 662.0, "epoch": 0.6352, "grad_norm": 1.11218307925123, "kl": 5.65625, "learning_rate": 7.069607399149427e-06, "loss": 0.2264, "reward": 2.0546875, "reward_std": 0.21875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 1588 }, { "clip_ratio": 0.0, "completion_length": 755.25, "epoch": 0.6356, "grad_norm": 0.6142498740117138, "kl": 5.9609375, "learning_rate": 7.056260579869165e-06, "loss": 0.2384, "reward": 1.9140625, "reward_std": 0.2848476767539978, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96875, "step": 1589 }, { "clip_ratio": 0.0, "completion_length": 934.125, "epoch": 0.636, "grad_norm": 0.6040827840033536, "kl": 6.7109375, "learning_rate": 7.042919499559538e-06, "loss": 0.2682, "reward": 1.876953125, "reward_std": 0.3990654796361923, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.955078125, "step": 1590 }, { "clip_ratio": 0.0, "completion_length": 702.75, "epoch": 0.6364, "grad_norm": 36.45044654393839, "kl": 12.8671875, "learning_rate": 7.029584184229653e-06, "loss": 0.5148, "reward": 1.912109375, "reward_std": 0.25800251960754395, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1591 }, { "clip_ratio": 0.0, "completion_length": 613.25, "epoch": 0.6368, "grad_norm": 0.31466694591661676, "kl": 5.546875, "learning_rate": 7.016254659877398e-06, "loss": 0.2218, "reward": 1.947265625, "reward_std": 0.1808355376124382, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1592 }, { "clip_ratio": 0.0, "completion_length": 775.0390625, "epoch": 0.6372, "grad_norm": 0.5489122084867363, "kl": 6.109375, "learning_rate": 7.002930952489362e-06, "loss": 0.2332, "reward": 2.048828125, "reward_std": 0.2677573561668396, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 1593 }, { "clip_ratio": 0.0, "completion_length": 697.25, "epoch": 0.6376, "grad_norm": 0.465494246460181, "kl": 5.3671875, "learning_rate": 6.9896130880407965e-06, "loss": 0.2149, "reward": 2.029296875, "reward_std": 0.29242895543575287, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.966796875, "step": 1594 }, { "clip_ratio": 0.0, "completion_length": 506.140625, "epoch": 0.638, "grad_norm": 0.23946200181775962, "kl": 4.8359375, "learning_rate": 6.976301092495556e-06, "loss": 0.1891, "reward": 2.0390625, "reward_std": 0.19838428497314453, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1595 }, { "clip_ratio": 0.0, "completion_length": 851.125, "epoch": 0.6384, "grad_norm": 4.181197795209564, "kl": 6.515625, "learning_rate": 6.962994991806059e-06, "loss": 0.2606, "reward": 1.892578125, "reward_std": 0.36633190512657166, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 1596 }, { "clip_ratio": 0.0, "completion_length": 589.265625, "epoch": 0.6388, "grad_norm": 0.8203354107433822, "kl": 4.984375, "learning_rate": 6.949694811913226e-06, "loss": 0.1836, "reward": 1.912109375, "reward_std": 0.24651141837239265, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1597 }, { "clip_ratio": 0.0, "completion_length": 746.6015625, "epoch": 0.6392, "grad_norm": 0.34306238653942833, "kl": 5.765625, "learning_rate": 6.9364005787464406e-06, "loss": 0.208, "reward": 1.98046875, "reward_std": 0.3056868240237236, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1598 }, { "clip_ratio": 0.0, "completion_length": 934.1796875, "epoch": 0.6396, "grad_norm": 0.6748597930807981, "kl": 6.328125, "learning_rate": 6.923112318223497e-06, "loss": 0.2459, "reward": 1.998046875, "reward_std": 0.4123416692018509, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.951171875, "step": 1599 }, { "clip_ratio": 0.0, "completion_length": 788.0, "epoch": 0.64, "grad_norm": 0.648984327954412, "kl": 5.75, "learning_rate": 6.909830056250527e-06, "loss": 0.2299, "reward": 1.888671875, "reward_std": 0.38120902329683304, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.951171875, "step": 1600 }, { "clip_ratio": 0.0, "completion_length": 764.125, "epoch": 0.6404, "grad_norm": 0.20252466308772435, "kl": 5.90625, "learning_rate": 6.896553818721989e-06, "loss": 0.236, "reward": 1.921875, "reward_std": 0.28279343992471695, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1601 }, { "clip_ratio": 0.0, "completion_length": 775.375, "epoch": 0.6408, "grad_norm": 0.18047348257032994, "kl": 6.015625, "learning_rate": 6.883283631520582e-06, "loss": 0.2406, "reward": 1.92578125, "reward_std": 0.2891925275325775, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1602 }, { "clip_ratio": 0.0, "completion_length": 577.1484375, "epoch": 0.6412, "grad_norm": 0.5189413996185906, "kl": 5.640625, "learning_rate": 6.870019520517217e-06, "loss": 0.2171, "reward": 2.076171875, "reward_std": 0.18753719329833984, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1603 }, { "clip_ratio": 0.0, "completion_length": 487.125, "epoch": 0.6416, "grad_norm": 1.6404665096742195, "kl": 5.1953125, "learning_rate": 6.856761511570963e-06, "loss": 0.2074, "reward": 1.9765625, "reward_std": 0.242316335439682, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1604 }, { "clip_ratio": 0.0, "completion_length": 503.140625, "epoch": 0.642, "grad_norm": 1.259377463736923, "kl": 5.125, "learning_rate": 6.843509630528977e-06, "loss": 0.2005, "reward": 2.083984375, "reward_std": 0.15638002753257751, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1605 }, { "clip_ratio": 0.0, "completion_length": 595.875, "epoch": 0.6424, "grad_norm": 0.2533687550985217, "kl": 4.69140625, "learning_rate": 6.830263903226483e-06, "loss": 0.1877, "reward": 2.064453125, "reward_std": 0.18050189316272736, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1606 }, { "clip_ratio": 0.0, "completion_length": 861.5, "epoch": 0.6428, "grad_norm": 0.3240991762030482, "kl": 6.21875, "learning_rate": 6.8170243554867065e-06, "loss": 0.2488, "reward": 1.8359375, "reward_std": 0.41508013755083084, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.9375, "step": 1607 }, { "clip_ratio": 0.0, "completion_length": 672.5, "epoch": 0.6432, "grad_norm": 2.920615358804504, "kl": 5.4296875, "learning_rate": 6.803791013120822e-06, "loss": 0.2168, "reward": 1.939453125, "reward_std": 0.20755060762166977, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1608 }, { "clip_ratio": 0.0, "completion_length": 761.75, "epoch": 0.6436, "grad_norm": 0.21982763384687085, "kl": 5.9453125, "learning_rate": 6.790563901927907e-06, "loss": 0.2378, "reward": 1.91015625, "reward_std": 0.29688572883605957, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96484375, "step": 1609 }, { "clip_ratio": 0.0, "completion_length": 863.25, "epoch": 0.644, "grad_norm": 0.3245695076298472, "kl": 6.0390625, "learning_rate": 6.777343047694891e-06, "loss": 0.2353, "reward": 1.826171875, "reward_std": 0.4538600817322731, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.927734375, "step": 1610 }, { "clip_ratio": 0.0, "completion_length": 924.875, "epoch": 0.6444, "grad_norm": 0.5050519510660526, "kl": 6.53125, "learning_rate": 6.764128476196505e-06, "loss": 0.2611, "reward": 1.830078125, "reward_std": 0.476801335811615, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.931640625, "step": 1611 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6448, "grad_norm": 1.0713141289067258, "kl": 6.7890625, "learning_rate": 6.750920213195238e-06, "loss": 0.2716, "reward": 1.794921875, "reward_std": 0.5650844722986221, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.912109375, "step": 1612 }, { "clip_ratio": 0.0, "completion_length": 681.75, "epoch": 0.6452, "grad_norm": 0.9565139345627539, "kl": 5.6015625, "learning_rate": 6.737718284441267e-06, "loss": 0.2243, "reward": 2.048828125, "reward_std": 0.26353612542152405, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 1613 }, { "clip_ratio": 0.0, "completion_length": 757.1640625, "epoch": 0.6456, "grad_norm": 0.2530194205877888, "kl": 6.125, "learning_rate": 6.7245227156724324e-06, "loss": 0.231, "reward": 2.044921875, "reward_std": 0.29643067717552185, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1614 }, { "clip_ratio": 0.0, "completion_length": 565.5, "epoch": 0.646, "grad_norm": 0.19485620084532976, "kl": 5.3203125, "learning_rate": 6.711333532614168e-06, "loss": 0.2128, "reward": 2.06640625, "reward_std": 0.17688271403312683, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.98046875, "step": 1615 }, { "clip_ratio": 0.0, "completion_length": 659.25, "epoch": 0.6464, "grad_norm": 0.12187240638970243, "kl": 5.546875, "learning_rate": 6.698150760979463e-06, "loss": 0.2222, "reward": 1.9296875, "reward_std": 0.2443198561668396, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 1616 }, { "clip_ratio": 0.0, "completion_length": 587.75, "epoch": 0.6468, "grad_norm": 0.24879830460580754, "kl": 5.09375, "learning_rate": 6.684974426468809e-06, "loss": 0.2038, "reward": 1.935546875, "reward_std": 0.192073255777359, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1617 }, { "clip_ratio": 0.0, "completion_length": 672.875, "epoch": 0.6472, "grad_norm": 0.20133279201197188, "kl": 5.3515625, "learning_rate": 6.671804554770135e-06, "loss": 0.2142, "reward": 1.939453125, "reward_std": 0.21023958921432495, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1618 }, { "clip_ratio": 0.0, "completion_length": 582.5, "epoch": 0.6476, "grad_norm": 0.23208745037654263, "kl": 5.625, "learning_rate": 6.658641171558785e-06, "loss": 0.225, "reward": 2.072265625, "reward_std": 0.1740073561668396, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1619 }, { "clip_ratio": 0.0, "completion_length": 851.5, "epoch": 0.648, "grad_norm": 0.5194812283480206, "kl": 5.9140625, "learning_rate": 6.645484302497452e-06, "loss": 0.2364, "reward": 2.025390625, "reward_std": 0.355338454246521, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 1620 }, { "clip_ratio": 0.0, "completion_length": 421.375, "epoch": 0.6484, "grad_norm": 0.835544431184385, "kl": 4.484375, "learning_rate": 6.63233397323612e-06, "loss": 0.1794, "reward": 2.056640625, "reward_std": 0.11873093992471695, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1621 }, { "clip_ratio": 0.0, "completion_length": 601.875, "epoch": 0.6488, "grad_norm": 0.13807351180473262, "kl": 5.203125, "learning_rate": 6.6191902094120295e-06, "loss": 0.2082, "reward": 1.96484375, "reward_std": 0.140625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1622 }, { "clip_ratio": 0.0, "completion_length": 508.875, "epoch": 0.6492, "grad_norm": 0.1835227608277377, "kl": 4.828125, "learning_rate": 6.60605303664962e-06, "loss": 0.1929, "reward": 2.087890625, "reward_std": 0.12863312661647797, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 1623 }, { "clip_ratio": 0.0, "completion_length": 688.875, "epoch": 0.6496, "grad_norm": 0.4777938955994381, "kl": 5.5859375, "learning_rate": 6.5929224805604845e-06, "loss": 0.2236, "reward": 1.94921875, "reward_std": 0.203125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1624 }, { "clip_ratio": 0.0, "completion_length": 285.25, "epoch": 0.65, "grad_norm": 0.1620539717638674, "kl": 4.14453125, "learning_rate": 6.579798566743314e-06, "loss": 0.1658, "reward": 2.0078125, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1625 }, { "clip_ratio": 0.0, "completion_length": 617.625, "epoch": 0.6504, "grad_norm": 1.453964432469814, "kl": 5.21875, "learning_rate": 6.566681320783849e-06, "loss": 0.2092, "reward": 1.93359375, "reward_std": 0.1990194395184517, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 1626 }, { "clip_ratio": 0.0, "completion_length": 328.5, "epoch": 0.6508, "grad_norm": 0.2441232801781101, "kl": 4.40625, "learning_rate": 6.553570768254831e-06, "loss": 0.1761, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1627 }, { "clip_ratio": 0.0, "completion_length": 345.125, "epoch": 0.6512, "grad_norm": 0.06849556738143495, "kl": 4.4296875, "learning_rate": 6.540466934715953e-06, "loss": 0.1768, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1628 }, { "clip_ratio": 0.0, "completion_length": 585.125, "epoch": 0.6516, "grad_norm": 0.19826343905065627, "kl": 4.9609375, "learning_rate": 6.52736984571381e-06, "loss": 0.1984, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.994140625, "step": 1629 }, { "clip_ratio": 0.0, "completion_length": 704.625, "epoch": 0.652, "grad_norm": 0.16078291462125505, "kl": 5.375, "learning_rate": 6.5142795267818505e-06, "loss": 0.215, "reward": 1.92578125, "reward_std": 0.2711557447910309, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1630 }, { "clip_ratio": 0.0, "completion_length": 598.125, "epoch": 0.6524, "grad_norm": 0.09660973657091211, "kl": 5.6484375, "learning_rate": 6.501196003440313e-06, "loss": 0.2258, "reward": 1.95703125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1631 }, { "clip_ratio": 0.0, "completion_length": 764.125, "epoch": 0.6528, "grad_norm": 0.25236731818181773, "kl": 6.0625, "learning_rate": 6.488119301196201e-06, "loss": 0.2425, "reward": 2.10546875, "reward_std": 0.328125, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 1632 }, { "clip_ratio": 0.0, "completion_length": 526.0, "epoch": 0.6532, "grad_norm": 0.22750107341709114, "kl": 5.0859375, "learning_rate": 6.475049445543215e-06, "loss": 0.2033, "reward": 2.013671875, "reward_std": 0.22976026684045792, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1633 }, { "clip_ratio": 0.0, "completion_length": 589.375, "epoch": 0.6536, "grad_norm": 0.2839140074458256, "kl": 5.171875, "learning_rate": 6.461986461961706e-06, "loss": 0.2073, "reward": 2.40625, "reward_std": 0.2520497143268585, "rewards/accuracy_reward": 0.4453125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1634 }, { "clip_ratio": 0.0, "completion_length": 596.125, "epoch": 0.654, "grad_norm": 3.511945203805136, "kl": 5.3125, "learning_rate": 6.448930375918632e-06, "loss": 0.2129, "reward": 2.072265625, "reward_std": 0.2109375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1635 }, { "clip_ratio": 0.0, "completion_length": 356.875, "epoch": 0.6544, "grad_norm": 0.13627571832614385, "kl": 4.2421875, "learning_rate": 6.435881212867494e-06, "loss": 0.17, "reward": 2.0546875, "reward_std": 0.06404343992471695, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1636 }, { "clip_ratio": 0.0, "completion_length": 869.5, "epoch": 0.6548, "grad_norm": 0.23532961351299933, "kl": 6.203125, "learning_rate": 6.422838998248308e-06, "loss": 0.2482, "reward": 1.87890625, "reward_std": 0.38268523663282394, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.94921875, "step": 1637 }, { "clip_ratio": 0.0, "completion_length": 381.625, "epoch": 0.6552, "grad_norm": 0.23091303865192817, "kl": 3.8984375, "learning_rate": 6.409803757487539e-06, "loss": 0.1559, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1638 }, { "clip_ratio": 0.0, "completion_length": 540.875, "epoch": 0.6556, "grad_norm": 0.3465610634093949, "kl": 4.640625, "learning_rate": 6.396775515998055e-06, "loss": 0.1855, "reward": 1.978515625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 1639 }, { "clip_ratio": 0.0, "completion_length": 502.5, "epoch": 0.656, "grad_norm": 0.6540565141428509, "kl": 4.921875, "learning_rate": 6.383754299179079e-06, "loss": 0.1967, "reward": 2.111328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1640 }, { "clip_ratio": 0.0, "completion_length": 607.25, "epoch": 0.6564, "grad_norm": 0.16585636638719758, "kl": 4.8984375, "learning_rate": 6.370740132416138e-06, "loss": 0.1962, "reward": 1.958984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1641 }, { "clip_ratio": 0.0, "completion_length": 506.4296875, "epoch": 0.6568, "grad_norm": 0.21918137804711504, "kl": 4.55078125, "learning_rate": 6.357733041081018e-06, "loss": 0.1768, "reward": 2.080078125, "reward_std": 0.15273308008909225, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 1642 }, { "clip_ratio": 0.0, "completion_length": 630.0, "epoch": 0.6572, "grad_norm": 0.2076462420223773, "kl": 4.90625, "learning_rate": 6.344733050531713e-06, "loss": 0.1963, "reward": 1.982421875, "reward_std": 0.17550812661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1643 }, { "clip_ratio": 0.0, "completion_length": 618.875, "epoch": 0.6576, "grad_norm": 0.2258671051715339, "kl": 4.25, "learning_rate": 6.33174018611236e-06, "loss": 0.1701, "reward": 1.9453125, "reward_std": 0.17653940618038177, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1644 }, { "clip_ratio": 0.0, "completion_length": 801.5, "epoch": 0.658, "grad_norm": 0.7279887261980107, "kl": 5.296875, "learning_rate": 6.318754473153221e-06, "loss": 0.212, "reward": 2.044921875, "reward_std": 0.3396209254860878, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.966796875, "step": 1645 }, { "clip_ratio": 0.0, "completion_length": 652.125, "epoch": 0.6584, "grad_norm": 0.6897949012858614, "kl": 4.984375, "learning_rate": 6.305775936970606e-06, "loss": 0.1994, "reward": 2.158203125, "reward_std": 0.2857210859656334, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 1646 }, { "clip_ratio": 0.0, "completion_length": 560.375, "epoch": 0.6588, "grad_norm": 0.32495004746096473, "kl": 4.796875, "learning_rate": 6.292804602866833e-06, "loss": 0.1922, "reward": 2.072265625, "reward_std": 0.14617788791656494, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1647 }, { "clip_ratio": 0.0, "completion_length": 952.0, "epoch": 0.6592, "grad_norm": 0.530783297185507, "kl": 6.0, "learning_rate": 6.27984049613019e-06, "loss": 0.2398, "reward": 2.01953125, "reward_std": 0.38721735030412674, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1648 }, { "clip_ratio": 0.0, "completion_length": 692.25, "epoch": 0.6596, "grad_norm": 0.36902077851831083, "kl": 4.9375, "learning_rate": 6.2668836420348535e-06, "loss": 0.1972, "reward": 1.923828125, "reward_std": 0.2452743873000145, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 1649 }, { "clip_ratio": 0.0, "completion_length": 722.875, "epoch": 0.66, "grad_norm": 0.1826044900036637, "kl": 5.5234375, "learning_rate": 6.25393406584088e-06, "loss": 0.2211, "reward": 1.9296875, "reward_std": 0.22704888880252838, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 1650 }, { "clip_ratio": 0.0, "completion_length": 638.125, "epoch": 0.6604, "grad_norm": 0.2190412212469507, "kl": 5.0859375, "learning_rate": 6.240991792794133e-06, "loss": 0.2038, "reward": 2.00390625, "reward_std": 0.20809492468833923, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1651 }, { "clip_ratio": 0.0, "completion_length": 791.0, "epoch": 0.6608, "grad_norm": 0.3560108618714268, "kl": 5.3359375, "learning_rate": 6.228056848126236e-06, "loss": 0.2135, "reward": 1.8984375, "reward_std": 0.30385828018188477, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 1652 }, { "clip_ratio": 0.0, "completion_length": 689.546875, "epoch": 0.6612, "grad_norm": 0.4627090572724657, "kl": 5.3671875, "learning_rate": 6.2151292570545215e-06, "loss": 0.2141, "reward": 1.91796875, "reward_std": 0.2673792466521263, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.97265625, "step": 1653 }, { "clip_ratio": 0.0, "completion_length": 733.125, "epoch": 0.6616, "grad_norm": 0.5999704393212304, "kl": 5.265625, "learning_rate": 6.202209044781991e-06, "loss": 0.2104, "reward": 2.064453125, "reward_std": 0.3228185996413231, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.978515625, "step": 1654 }, { "clip_ratio": 0.0, "completion_length": 889.3984375, "epoch": 0.662, "grad_norm": 0.3390883549111842, "kl": 5.421875, "learning_rate": 6.18929623649726e-06, "loss": 0.2041, "reward": 1.892578125, "reward_std": 0.3424427658319473, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 1655 }, { "clip_ratio": 0.0, "completion_length": 788.375, "epoch": 0.6624, "grad_norm": 0.6057160737672499, "kl": 5.7578125, "learning_rate": 6.176390857374508e-06, "loss": 0.2305, "reward": 1.89453125, "reward_std": 0.3299889490008354, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.96484375, "step": 1656 }, { "clip_ratio": 0.0, "completion_length": 681.25, "epoch": 0.6628, "grad_norm": 0.4998133833533413, "kl": 4.921875, "learning_rate": 6.1634929325734385e-06, "loss": 0.1973, "reward": 2.076171875, "reward_std": 0.3159308657050133, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1657 }, { "clip_ratio": 0.0, "completion_length": 559.25, "epoch": 0.6632, "grad_norm": 0.609831006559274, "kl": 5.171875, "learning_rate": 6.150602487239207e-06, "loss": 0.2068, "reward": 2.013671875, "reward_std": 0.17061364650726318, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.990234375, "step": 1658 }, { "clip_ratio": 0.0, "completion_length": 873.75, "epoch": 0.6636, "grad_norm": 0.28716076808642343, "kl": 6.1875, "learning_rate": 6.137719546502401e-06, "loss": 0.2475, "reward": 2.029296875, "reward_std": 0.2938409671187401, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.966796875, "step": 1659 }, { "clip_ratio": 0.0, "completion_length": 860.5, "epoch": 0.664, "grad_norm": 0.6277292723901505, "kl": 6.15625, "learning_rate": 6.124844135478971e-06, "loss": 0.2464, "reward": 1.904296875, "reward_std": 0.32532021403312683, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.966796875, "step": 1660 }, { "clip_ratio": 0.0, "completion_length": 748.75, "epoch": 0.6644, "grad_norm": 0.2913887600418958, "kl": 5.46875, "learning_rate": 6.1119762792701935e-06, "loss": 0.219, "reward": 2.0625, "reward_std": 0.23582034558057785, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96875, "step": 1661 }, { "clip_ratio": 0.0, "completion_length": 481.0, "epoch": 0.6648, "grad_norm": 0.3597648188427227, "kl": 5.3203125, "learning_rate": 6.099116002962604e-06, "loss": 0.2126, "reward": 2.0390625, "reward_std": 0.12654343992471695, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 1662 }, { "clip_ratio": 0.0, "completion_length": 603.125, "epoch": 0.6652, "grad_norm": 0.6631165463889489, "kl": 5.03125, "learning_rate": 6.086263331627976e-06, "loss": 0.2012, "reward": 1.94921875, "reward_std": 0.1661948561668396, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1663 }, { "clip_ratio": 0.0, "completion_length": 937.875, "epoch": 0.6656, "grad_norm": 0.2286567832973995, "kl": 5.9375, "learning_rate": 6.073418290323251e-06, "loss": 0.2375, "reward": 1.888671875, "reward_std": 0.3454107195138931, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 1664 }, { "clip_ratio": 0.0, "completion_length": 745.5, "epoch": 0.666, "grad_norm": 3.062161333786037, "kl": 5.1875, "learning_rate": 6.06058090409049e-06, "loss": 0.2078, "reward": 1.923828125, "reward_std": 0.27273958921432495, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 1665 }, { "clip_ratio": 0.0, "completion_length": 774.25, "epoch": 0.6664, "grad_norm": 0.4942946081818986, "kl": 6.109375, "learning_rate": 6.047751197956838e-06, "loss": 0.2445, "reward": 2.15234375, "reward_std": 0.31635860353708267, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.95703125, "step": 1666 }, { "clip_ratio": 0.0, "completion_length": 948.75, "epoch": 0.6668, "grad_norm": 0.35094384249014693, "kl": 6.234375, "learning_rate": 6.0349291969344595e-06, "loss": 0.2493, "reward": 1.92578125, "reward_std": 0.3918303847312927, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.96484375, "step": 1667 }, { "clip_ratio": 0.0, "completion_length": 556.25, "epoch": 0.6672, "grad_norm": 0.47607893694332826, "kl": 4.703125, "learning_rate": 6.022114926020504e-06, "loss": 0.1879, "reward": 1.9296875, "reward_std": 0.1603279784321785, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 1668 }, { "clip_ratio": 0.0, "completion_length": 680.25, "epoch": 0.6676, "grad_norm": 0.21422411016011939, "kl": 5.0625, "learning_rate": 6.009308410197048e-06, "loss": 0.2023, "reward": 2.103515625, "reward_std": 0.19113312661647797, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1669 }, { "clip_ratio": 0.0, "completion_length": 759.125, "epoch": 0.668, "grad_norm": 0.4015930646916795, "kl": 5.5390625, "learning_rate": 5.996509674431053e-06, "loss": 0.2216, "reward": 2.017578125, "reward_std": 0.3045787587761879, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1670 }, { "clip_ratio": 0.0, "completion_length": 510.5, "epoch": 0.6684, "grad_norm": 0.15491282729819192, "kl": 4.4765625, "learning_rate": 5.983718743674302e-06, "loss": 0.1794, "reward": 1.990234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1671 }, { "clip_ratio": 0.0, "completion_length": 633.375, "epoch": 0.6688, "grad_norm": 0.24166771725973854, "kl": 5.1953125, "learning_rate": 5.970935642863375e-06, "loss": 0.2078, "reward": 2.005859375, "reward_std": 0.23715253919363022, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1672 }, { "clip_ratio": 0.0, "completion_length": 832.375, "epoch": 0.6692, "grad_norm": 0.536163459116259, "kl": 5.90625, "learning_rate": 5.958160396919577e-06, "loss": 0.2362, "reward": 1.904296875, "reward_std": 0.3129289820790291, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.958984375, "step": 1673 }, { "clip_ratio": 0.0, "completion_length": 808.875, "epoch": 0.6696, "grad_norm": 0.18436122712037276, "kl": 5.2578125, "learning_rate": 5.94539303074891e-06, "loss": 0.2101, "reward": 2.0078125, "reward_std": 0.4640495404601097, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.9453125, "step": 1674 }, { "clip_ratio": 0.0, "completion_length": 778.25, "epoch": 0.67, "grad_norm": 0.5208391196071741, "kl": 5.41796875, "learning_rate": 5.932633569242e-06, "loss": 0.2164, "reward": 1.888671875, "reward_std": 0.3208845257759094, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 1675 }, { "clip_ratio": 0.0, "completion_length": 669.75, "epoch": 0.6704, "grad_norm": 0.26885306799252945, "kl": 4.8984375, "learning_rate": 5.9198820372740726e-06, "loss": 0.1958, "reward": 2.025390625, "reward_std": 0.28534914553165436, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 1676 }, { "clip_ratio": 0.0, "completion_length": 803.5, "epoch": 0.6708, "grad_norm": 0.42083882624192825, "kl": 5.5703125, "learning_rate": 5.907138459704895e-06, "loss": 0.2223, "reward": 2.0703125, "reward_std": 0.21875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1677 }, { "clip_ratio": 0.0, "completion_length": 748.0, "epoch": 0.6712, "grad_norm": 0.32882288211059224, "kl": 5.2109375, "learning_rate": 5.894402861378721e-06, "loss": 0.2084, "reward": 1.90625, "reward_std": 0.2693582996726036, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 1678 }, { "clip_ratio": 0.0, "completion_length": 683.125, "epoch": 0.6716, "grad_norm": 0.2003774801994809, "kl": 5.109375, "learning_rate": 5.881675267124254e-06, "loss": 0.2042, "reward": 2.0546875, "reward_std": 0.21215169876813889, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1679 }, { "clip_ratio": 0.0, "completion_length": 799.125, "epoch": 0.672, "grad_norm": 0.19313698973157387, "kl": 4.921875, "learning_rate": 5.868955701754584e-06, "loss": 0.1971, "reward": 1.9140625, "reward_std": 0.27216219902038574, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9609375, "step": 1680 }, { "clip_ratio": 0.0, "completion_length": 753.5, "epoch": 0.6724, "grad_norm": 0.20024607543867123, "kl": 4.828125, "learning_rate": 5.85624419006716e-06, "loss": 0.1932, "reward": 1.919921875, "reward_std": 0.2498341202735901, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 1681 }, { "clip_ratio": 0.0, "completion_length": 720.75, "epoch": 0.6728, "grad_norm": 0.3257920781053084, "kl": 5.2734375, "learning_rate": 5.843540756843722e-06, "loss": 0.2114, "reward": 1.962890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1682 }, { "clip_ratio": 0.0, "completion_length": 583.0, "epoch": 0.6732, "grad_norm": 0.29398907046644895, "kl": 4.84375, "learning_rate": 5.830845426850268e-06, "loss": 0.1938, "reward": 1.955078125, "reward_std": 0.12298412621021271, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 1683 }, { "clip_ratio": 0.0, "completion_length": 530.25, "epoch": 0.6736, "grad_norm": 0.16848763637878644, "kl": 4.4375, "learning_rate": 5.818158224836987e-06, "loss": 0.1777, "reward": 2.005859375, "reward_std": 0.12551629543304443, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1684 }, { "clip_ratio": 0.0, "completion_length": 689.0, "epoch": 0.674, "grad_norm": 1.366512382454245, "kl": 5.671875, "learning_rate": 5.8054791755382286e-06, "loss": 0.227, "reward": 1.958984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1685 }, { "clip_ratio": 0.0, "completion_length": 499.1328125, "epoch": 0.6744, "grad_norm": 0.2420906466499369, "kl": 4.65234375, "learning_rate": 5.792808303672454e-06, "loss": 0.1745, "reward": 1.98828125, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 1686 }, { "clip_ratio": 0.0, "completion_length": 569.125, "epoch": 0.6748, "grad_norm": 0.1654305439044788, "kl": 4.90625, "learning_rate": 5.780145633942173e-06, "loss": 0.1963, "reward": 2.09375, "reward_std": 0.09804558008909225, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9921875, "step": 1687 }, { "clip_ratio": 0.0, "completion_length": 660.25, "epoch": 0.6752, "grad_norm": 0.08695150314664676, "kl": 5.140625, "learning_rate": 5.7674911910339094e-06, "loss": 0.2055, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1688 }, { "clip_ratio": 0.0, "completion_length": 676.5, "epoch": 0.6756, "grad_norm": 0.14744637091372342, "kl": 5.1875, "learning_rate": 5.754844999618144e-06, "loss": 0.2075, "reward": 1.974609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1689 }, { "clip_ratio": 0.0, "completion_length": 671.5, "epoch": 0.676, "grad_norm": 0.2998858559133107, "kl": 5.40625, "learning_rate": 5.742207084349274e-06, "loss": 0.2168, "reward": 2.09375, "reward_std": 0.16633247584104538, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1690 }, { "clip_ratio": 0.0, "completion_length": 677.75, "epoch": 0.6764, "grad_norm": 0.20352034720628415, "kl": 4.7734375, "learning_rate": 5.729577469865566e-06, "loss": 0.1907, "reward": 2.1875, "reward_std": 0.13434084504842758, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1691 }, { "clip_ratio": 0.0, "completion_length": 738.625, "epoch": 0.6768, "grad_norm": 0.2827275922321398, "kl": 4.890625, "learning_rate": 5.716956180789098e-06, "loss": 0.1958, "reward": 2.056640625, "reward_std": 0.22882488369941711, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1692 }, { "clip_ratio": 0.0, "completion_length": 714.0, "epoch": 0.6772, "grad_norm": 0.08641963398804814, "kl": 5.1875, "learning_rate": 5.704343241725719e-06, "loss": 0.2072, "reward": 2.08984375, "reward_std": 0.140625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1693 }, { "clip_ratio": 0.0, "completion_length": 699.375, "epoch": 0.6776, "grad_norm": 0.15591031627161944, "kl": 5.0859375, "learning_rate": 5.691738677265e-06, "loss": 0.2035, "reward": 2.02734375, "reward_std": 0.17322681099176407, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1694 }, { "clip_ratio": 0.0, "completion_length": 770.875, "epoch": 0.678, "grad_norm": 0.1331516123849137, "kl": 5.0625, "learning_rate": 5.679142511980176e-06, "loss": 0.2024, "reward": 1.9296875, "reward_std": 0.20966220647096634, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 1695 }, { "clip_ratio": 0.0, "completion_length": 819.125, "epoch": 0.6784, "grad_norm": 0.18972002196039026, "kl": 5.8515625, "learning_rate": 5.666554770428129e-06, "loss": 0.2336, "reward": 1.91796875, "reward_std": 0.2980230450630188, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1696 }, { "clip_ratio": 0.0, "completion_length": 682.875, "epoch": 0.6788, "grad_norm": 0.08963836302648977, "kl": 5.2734375, "learning_rate": 5.653975477149298e-06, "loss": 0.2108, "reward": 1.9765625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1697 }, { "clip_ratio": 0.0, "completion_length": 714.625, "epoch": 0.6792, "grad_norm": 0.19053305205074028, "kl": 5.421875, "learning_rate": 5.641404656667661e-06, "loss": 0.217, "reward": 2.1015625, "reward_std": 0.20109658688306808, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1698 }, { "clip_ratio": 0.0, "completion_length": 911.75, "epoch": 0.6796, "grad_norm": 0.36823754935940733, "kl": 6.0390625, "learning_rate": 5.628842333490674e-06, "loss": 0.2417, "reward": 1.91015625, "reward_std": 0.32742708921432495, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96484375, "step": 1699 }, { "clip_ratio": 0.0, "completion_length": 810.125, "epoch": 0.68, "grad_norm": 0.3321855870277993, "kl": 5.8984375, "learning_rate": 5.616288532109225e-06, "loss": 0.2354, "reward": 2.171875, "reward_std": 0.28774453699588776, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1700 }, { "clip_ratio": 0.0, "completion_length": 697.0, "epoch": 0.6804, "grad_norm": 2.7057362347263036, "kl": 4.98046875, "learning_rate": 5.603743276997607e-06, "loss": 0.1992, "reward": 1.94921875, "reward_std": 0.20141888409852982, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.98046875, "step": 1701 }, { "clip_ratio": 0.0, "completion_length": 649.375, "epoch": 0.6808, "grad_norm": 0.17684975436722078, "kl": 5.3203125, "learning_rate": 5.591206592613416e-06, "loss": 0.2129, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1702 }, { "clip_ratio": 0.0, "completion_length": 900.5, "epoch": 0.6812, "grad_norm": 0.5292847899921511, "kl": 5.75, "learning_rate": 5.5786785033975745e-06, "loss": 0.2303, "reward": 1.921875, "reward_std": 0.28554558008909225, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1703 }, { "clip_ratio": 0.0, "completion_length": 703.375, "epoch": 0.6816, "grad_norm": 0.3024504349459709, "kl": 5.2265625, "learning_rate": 5.5661590337742255e-06, "loss": 0.2089, "reward": 2.203125, "reward_std": 0.15555208921432495, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1704 }, { "clip_ratio": 0.0, "completion_length": 716.125, "epoch": 0.682, "grad_norm": 1.581623953303713, "kl": 5.1484375, "learning_rate": 5.553648208150728e-06, "loss": 0.2058, "reward": 1.95703125, "reward_std": 0.1336466670036316, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1705 }, { "clip_ratio": 0.0, "completion_length": 603.0, "epoch": 0.6824, "grad_norm": 0.08343967782146314, "kl": 4.67578125, "learning_rate": 5.5411460509175605e-06, "loss": 0.1873, "reward": 2.2265625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1706 }, { "clip_ratio": 0.0, "completion_length": 659.0, "epoch": 0.6828, "grad_norm": 0.17541523166047446, "kl": 4.796875, "learning_rate": 5.5286525864483285e-06, "loss": 0.1919, "reward": 2.2109375, "reward_std": 0.1261480376124382, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1707 }, { "clip_ratio": 0.0, "completion_length": 681.375, "epoch": 0.6832, "grad_norm": 0.1098151511375782, "kl": 4.171875, "learning_rate": 5.516167839099679e-06, "loss": 0.1668, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1708 }, { "clip_ratio": 0.0, "completion_length": 658.375, "epoch": 0.6836, "grad_norm": 0.3194171928621385, "kl": 4.7265625, "learning_rate": 5.50369183321126e-06, "loss": 0.189, "reward": 2.05078125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1709 }, { "clip_ratio": 0.0, "completion_length": 891.75, "epoch": 0.684, "grad_norm": 0.496035135676488, "kl": 5.35546875, "learning_rate": 5.491224593105695e-06, "loss": 0.2143, "reward": 2.1875, "reward_std": 0.33489149808883667, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 1710 }, { "clip_ratio": 0.0, "completion_length": 868.875, "epoch": 0.6844, "grad_norm": 0.2206620133000677, "kl": 5.6796875, "learning_rate": 5.478766143088492e-06, "loss": 0.2274, "reward": 1.9453125, "reward_std": 0.21875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1711 }, { "clip_ratio": 0.0, "completion_length": 799.125, "epoch": 0.6848, "grad_norm": 0.3104612730295353, "kl": 5.6015625, "learning_rate": 5.466316507448049e-06, "loss": 0.2244, "reward": 1.94921875, "reward_std": 0.203125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1712 }, { "clip_ratio": 0.0, "completion_length": 838.75, "epoch": 0.6852, "grad_norm": 0.40566245283027713, "kl": 5.0625, "learning_rate": 5.453875710455562e-06, "loss": 0.2022, "reward": 2.138671875, "reward_std": 0.3198489621281624, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 1713 }, { "clip_ratio": 0.0, "completion_length": 729.375, "epoch": 0.6856, "grad_norm": 0.21995602180502083, "kl": 5.328125, "learning_rate": 5.441443776365003e-06, "loss": 0.2133, "reward": 1.96875, "reward_std": 0.125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9921875, "step": 1714 }, { "clip_ratio": 0.0, "completion_length": 724.4921875, "epoch": 0.686, "grad_norm": 0.6305211609625866, "kl": 5.1953125, "learning_rate": 5.429020729413062e-06, "loss": 0.1965, "reward": 2.07421875, "reward_std": 0.16665175184607506, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1715 }, { "clip_ratio": 0.0, "completion_length": 904.25, "epoch": 0.6864, "grad_norm": 1.0012782957871578, "kl": 5.3828125, "learning_rate": 5.416606593819102e-06, "loss": 0.2151, "reward": 1.912109375, "reward_std": 0.24979425966739655, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1716 }, { "clip_ratio": 0.0, "completion_length": 787.5, "epoch": 0.6868, "grad_norm": 0.19662929730472203, "kl": 5.59375, "learning_rate": 5.404201393785123e-06, "loss": 0.2239, "reward": 2.033203125, "reward_std": 0.2254539132118225, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 1717 }, { "clip_ratio": 0.0, "completion_length": 877.625, "epoch": 0.6872, "grad_norm": 0.33140821052312924, "kl": 5.578125, "learning_rate": 5.391805153495693e-06, "loss": 0.2235, "reward": 2.162109375, "reward_std": 0.31690485030412674, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1718 }, { "clip_ratio": 0.0, "completion_length": 685.75, "epoch": 0.6876, "grad_norm": 0.12331741301477082, "kl": 4.90625, "learning_rate": 5.379417897117917e-06, "loss": 0.1964, "reward": 2.1015625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1719 }, { "clip_ratio": 0.0, "completion_length": 736.125, "epoch": 0.688, "grad_norm": 0.22972777960653962, "kl": 5.2421875, "learning_rate": 5.367039648801386e-06, "loss": 0.2097, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1720 }, { "clip_ratio": 0.0, "completion_length": 728.75, "epoch": 0.6884, "grad_norm": 0.12485493051917851, "kl": 4.4375, "learning_rate": 5.354670432678124e-06, "loss": 0.1778, "reward": 1.9609375, "reward_std": 0.12159235030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1721 }, { "clip_ratio": 0.0, "completion_length": 765.5, "epoch": 0.6888, "grad_norm": 0.1562353978926727, "kl": 5.203125, "learning_rate": 5.342310272862558e-06, "loss": 0.208, "reward": 1.943359375, "reward_std": 0.23571795225143433, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1722 }, { "clip_ratio": 0.0, "completion_length": 806.875, "epoch": 0.6892, "grad_norm": 0.29978259116539696, "kl": 4.8984375, "learning_rate": 5.3299591934514485e-06, "loss": 0.1963, "reward": 1.9375, "reward_std": 0.21620866656303406, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 1723 }, { "clip_ratio": 0.0, "completion_length": 745.75, "epoch": 0.6896, "grad_norm": 0.18832651238692413, "kl": 4.8515625, "learning_rate": 5.317617218523856e-06, "loss": 0.1937, "reward": 2.1015625, "reward_std": 0.21732433140277863, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 1724 }, { "clip_ratio": 0.0, "completion_length": 618.625, "epoch": 0.69, "grad_norm": 0.16267047050881212, "kl": 5.34375, "learning_rate": 5.305284372141095e-06, "loss": 0.214, "reward": 2.111328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1725 }, { "clip_ratio": 0.0, "completion_length": 799.5, "epoch": 0.6904, "grad_norm": 0.38866834787001525, "kl": 5.0859375, "learning_rate": 5.292960678346674e-06, "loss": 0.203, "reward": 1.931640625, "reward_std": 0.2690112367272377, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 1726 }, { "clip_ratio": 0.0, "completion_length": 654.125, "epoch": 0.6908, "grad_norm": 0.24998993046395743, "kl": 4.8046875, "learning_rate": 5.280646161166274e-06, "loss": 0.1917, "reward": 2.208984375, "reward_std": 0.21214888989925385, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1727 }, { "clip_ratio": 0.0, "completion_length": 879.625, "epoch": 0.6912, "grad_norm": 0.3511481866888458, "kl": 5.546875, "learning_rate": 5.26834084460767e-06, "loss": 0.2213, "reward": 1.92578125, "reward_std": 0.3923690617084503, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.94140625, "step": 1728 }, { "clip_ratio": 0.0, "completion_length": 728.0, "epoch": 0.6916, "grad_norm": 0.19669305540056034, "kl": 4.51171875, "learning_rate": 5.256044752660709e-06, "loss": 0.1802, "reward": 2.427734375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1729 }, { "clip_ratio": 0.0, "completion_length": 665.5, "epoch": 0.692, "grad_norm": 0.2291454519287811, "kl": 4.8984375, "learning_rate": 5.243757909297247e-06, "loss": 0.1962, "reward": 2.0234375, "reward_std": 0.22079972177743912, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1730 }, { "clip_ratio": 0.0, "completion_length": 620.75, "epoch": 0.6924, "grad_norm": 0.11863197952171742, "kl": 5.296875, "learning_rate": 5.23148033847112e-06, "loss": 0.2117, "reward": 1.9765625, "reward_std": 0.06505206227302551, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1731 }, { "clip_ratio": 0.0, "completion_length": 644.625, "epoch": 0.6928, "grad_norm": 0.1833739416330123, "kl": 4.3515625, "learning_rate": 5.219212064118079e-06, "loss": 0.1743, "reward": 2.01953125, "reward_std": 0.16409968957304955, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1732 }, { "clip_ratio": 0.0, "completion_length": 744.0, "epoch": 0.6932, "grad_norm": 0.14457385265246248, "kl": 5.046875, "learning_rate": 5.2069531101557505e-06, "loss": 0.2016, "reward": 1.9453125, "reward_std": 0.13586606830358505, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1733 }, { "clip_ratio": 0.0, "completion_length": 654.25, "epoch": 0.6936, "grad_norm": 0.16840716436007586, "kl": 4.19921875, "learning_rate": 5.194703500483593e-06, "loss": 0.1679, "reward": 1.98828125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 1734 }, { "clip_ratio": 0.0, "completion_length": 724.375, "epoch": 0.694, "grad_norm": 0.10155173613591788, "kl": 4.5625, "learning_rate": 5.1824632589828465e-06, "loss": 0.1822, "reward": 1.9609375, "reward_std": 0.1261480376124382, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1735 }, { "clip_ratio": 0.0, "completion_length": 743.25, "epoch": 0.6944, "grad_norm": 0.09039481376135297, "kl": 5.1640625, "learning_rate": 5.1702324095164955e-06, "loss": 0.2065, "reward": 2.087890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1736 }, { "clip_ratio": 0.0, "completion_length": 649.875, "epoch": 0.6948, "grad_norm": 0.2758358114751699, "kl": 4.9296875, "learning_rate": 5.158010975929193e-06, "loss": 0.197, "reward": 1.974609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1737 }, { "clip_ratio": 0.0, "completion_length": 793.75, "epoch": 0.6952, "grad_norm": 0.09418524093393975, "kl": 4.91015625, "learning_rate": 5.145798982047261e-06, "loss": 0.1966, "reward": 2.00390625, "reward_std": 0.21069541573524475, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1738 }, { "clip_ratio": 0.0, "completion_length": 757.875, "epoch": 0.6956, "grad_norm": 0.3147512799315921, "kl": 5.0625, "learning_rate": 5.133596451678603e-06, "loss": 0.202, "reward": 2.0546875, "reward_std": 0.33737194538116455, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 1739 }, { "clip_ratio": 0.0, "completion_length": 812.375, "epoch": 0.696, "grad_norm": 0.08146851479943404, "kl": 4.59375, "learning_rate": 5.121403408612672e-06, "loss": 0.1833, "reward": 2.091796875, "reward_std": 0.17327880859375, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1740 }, { "clip_ratio": 0.0, "completion_length": 748.375, "epoch": 0.6964, "grad_norm": 0.14337058410007, "kl": 4.859375, "learning_rate": 5.109219876620441e-06, "loss": 0.194, "reward": 2.109375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 1741 }, { "clip_ratio": 0.0, "completion_length": 867.015625, "epoch": 0.6968, "grad_norm": 0.9526340642919678, "kl": 4.7734375, "learning_rate": 5.0970458794543135e-06, "loss": 0.1825, "reward": 1.9609375, "reward_std": 0.3745502158999443, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1742 }, { "clip_ratio": 0.0, "completion_length": 706.25, "epoch": 0.6972, "grad_norm": 0.3466840290250106, "kl": 4.2890625, "learning_rate": 5.0848814408481305e-06, "loss": 0.1714, "reward": 2.025390625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 1743 }, { "clip_ratio": 0.0, "completion_length": 706.25, "epoch": 0.6976, "grad_norm": 0.20993120728867967, "kl": 4.54296875, "learning_rate": 5.072726584517086e-06, "loss": 0.1815, "reward": 1.96484375, "reward_std": 0.203125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1744 }, { "clip_ratio": 0.0, "completion_length": 701.5, "epoch": 0.698, "grad_norm": 0.17660670890191627, "kl": 4.32421875, "learning_rate": 5.060581334157693e-06, "loss": 0.173, "reward": 1.9765625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1745 }, { "clip_ratio": 0.0, "completion_length": 777.5, "epoch": 0.6984, "grad_norm": 68.64596361250324, "kl": 3.8046875, "learning_rate": 5.048445713447738e-06, "loss": 0.1525, "reward": 1.796875, "reward_std": 0.3143020272254944, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.90625, "step": 1746 }, { "clip_ratio": 0.0, "completion_length": 786.140625, "epoch": 0.6988, "grad_norm": 0.2726092611178039, "kl": 5.2421875, "learning_rate": 5.036319746046232e-06, "loss": 0.1999, "reward": 2.060546875, "reward_std": 0.2291145622730255, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1747 }, { "clip_ratio": 0.0, "completion_length": 825.1796875, "epoch": 0.6992, "grad_norm": 0.20492885913434686, "kl": 5.109375, "learning_rate": 5.024203455593375e-06, "loss": 0.1947, "reward": 2.0234375, "reward_std": 0.2186223417520523, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 1748 }, { "clip_ratio": 0.0, "completion_length": 632.375, "epoch": 0.6996, "grad_norm": 0.10047874632281308, "kl": 4.421875, "learning_rate": 5.012096865710494e-06, "loss": 0.1768, "reward": 2.095703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1749 }, { "clip_ratio": 0.0, "completion_length": 767.0, "epoch": 0.7, "grad_norm": 0.3043289640109741, "kl": 3.69921875, "learning_rate": 5.000000000000003e-06, "loss": 0.1481, "reward": 2.07421875, "reward_std": 0.16846735030412674, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 821.875, "epoch": 0.7004, "grad_norm": 0.41667215555998255, "kl": 4.6015625, "learning_rate": 4.98791288204536e-06, "loss": 0.1844, "reward": 1.943359375, "reward_std": 0.2265625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 1751 }, { "clip_ratio": 0.0, "completion_length": 867.4453125, "epoch": 0.7008, "grad_norm": 0.20550078397129215, "kl": 5.1796875, "learning_rate": 4.97583553541102e-06, "loss": 0.1835, "reward": 1.931640625, "reward_std": 0.32379651069641113, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 1752 }, { "clip_ratio": 0.0, "completion_length": 810.0, "epoch": 0.7012, "grad_norm": 0.24986128630255003, "kl": 4.87109375, "learning_rate": 4.9637679836423926e-06, "loss": 0.1949, "reward": 2.029296875, "reward_std": 0.22810593992471695, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1753 }, { "clip_ratio": 0.0, "completion_length": 780.125, "epoch": 0.7016, "grad_norm": 0.15261147345843293, "kl": 4.1796875, "learning_rate": 4.951710250265785e-06, "loss": 0.1671, "reward": 1.955078125, "reward_std": 0.1509895622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 1754 }, { "clip_ratio": 0.0, "completion_length": 683.75, "epoch": 0.702, "grad_norm": 0.4352252649391098, "kl": 4.5546875, "learning_rate": 4.939662358788364e-06, "loss": 0.1819, "reward": 2.173828125, "reward_std": 0.14998093992471695, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 1755 }, { "clip_ratio": 0.0, "completion_length": 861.125, "epoch": 0.7024, "grad_norm": 0.5826942401249476, "kl": 4.6953125, "learning_rate": 4.927624332698109e-06, "loss": 0.1878, "reward": 1.927734375, "reward_std": 0.24945375323295593, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.966796875, "step": 1756 }, { "clip_ratio": 0.0, "completion_length": 587.28125, "epoch": 0.7028, "grad_norm": 0.14022496319008357, "kl": 4.46875, "learning_rate": 4.915596195463773e-06, "loss": 0.1756, "reward": 1.96484375, "reward_std": 0.11586953699588776, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1757 }, { "clip_ratio": 0.0, "completion_length": 846.125, "epoch": 0.7032, "grad_norm": 0.19992812471087523, "kl": 5.1953125, "learning_rate": 4.903577970534823e-06, "loss": 0.208, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1758 }, { "clip_ratio": 0.0, "completion_length": 618.625, "epoch": 0.7036, "grad_norm": 0.4011535019324346, "kl": 4.8984375, "learning_rate": 4.891569681341403e-06, "loss": 0.1957, "reward": 2.06640625, "reward_std": 0.16527669876813889, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1759 }, { "clip_ratio": 0.0, "completion_length": 711.375, "epoch": 0.704, "grad_norm": 0.14134920618142208, "kl": 4.6796875, "learning_rate": 4.879571351294287e-06, "loss": 0.1873, "reward": 2.046875, "reward_std": 0.13468188047409058, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1760 }, { "clip_ratio": 0.0, "completion_length": 808.9296875, "epoch": 0.7044, "grad_norm": 0.25105243735316424, "kl": 4.921875, "learning_rate": 4.8675830037848295e-06, "loss": 0.1853, "reward": 1.96484375, "reward_std": 0.140625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1761 }, { "clip_ratio": 0.0, "completion_length": 630.75, "epoch": 0.7048, "grad_norm": 0.0530041210935677, "kl": 4.52734375, "learning_rate": 4.855604662184935e-06, "loss": 0.181, "reward": 2.111328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1762 }, { "clip_ratio": 0.0, "completion_length": 775.25, "epoch": 0.7052, "grad_norm": 0.20414968824818347, "kl": 4.7578125, "learning_rate": 4.843636349846991e-06, "loss": 0.1904, "reward": 2.171875, "reward_std": 0.2562146782875061, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1763 }, { "clip_ratio": 0.0, "completion_length": 612.625, "epoch": 0.7056, "grad_norm": 0.4173389976943057, "kl": 4.171875, "learning_rate": 4.831678090103832e-06, "loss": 0.1665, "reward": 2.013671875, "reward_std": 0.12621419876813889, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.990234375, "step": 1764 }, { "clip_ratio": 0.0, "completion_length": 729.0, "epoch": 0.706, "grad_norm": 0.4949221195501856, "kl": 4.3515625, "learning_rate": 4.8197299062687e-06, "loss": 0.1743, "reward": 1.94140625, "reward_std": 0.16010860353708267, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.97265625, "step": 1765 }, { "clip_ratio": 0.0, "completion_length": 686.75, "epoch": 0.7064, "grad_norm": 0.3139313621256145, "kl": 4.5234375, "learning_rate": 4.807791821635186e-06, "loss": 0.1807, "reward": 2.09765625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1766 }, { "clip_ratio": 0.0, "completion_length": 654.0, "epoch": 0.7068, "grad_norm": 0.08250796809104066, "kl": 4.6640625, "learning_rate": 4.795863859477207e-06, "loss": 0.1868, "reward": 1.974609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1767 }, { "clip_ratio": 0.0, "completion_length": 878.75, "epoch": 0.7072, "grad_norm": 0.23259022629364248, "kl": 5.546875, "learning_rate": 4.783946043048922e-06, "loss": 0.2219, "reward": 2.044921875, "reward_std": 0.2916145622730255, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 1768 }, { "clip_ratio": 0.0, "completion_length": 743.875, "epoch": 0.7076, "grad_norm": 0.3652929637567772, "kl": 4.609375, "learning_rate": 4.772038395584735e-06, "loss": 0.1847, "reward": 1.95703125, "reward_std": 0.13721734285354614, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1769 }, { "clip_ratio": 0.0, "completion_length": 516.75, "epoch": 0.708, "grad_norm": 0.050670542857151696, "kl": 3.8828125, "learning_rate": 4.76014094029921e-06, "loss": 0.1555, "reward": 2.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1770 }, { "clip_ratio": 0.0, "completion_length": 705.375, "epoch": 0.7084, "grad_norm": 0.220262762697073, "kl": 4.8984375, "learning_rate": 4.7482537003870425e-06, "loss": 0.1957, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1771 }, { "clip_ratio": 0.0, "completion_length": 709.25, "epoch": 0.7088, "grad_norm": 0.66191830026328, "kl": 4.93359375, "learning_rate": 4.736376699023023e-06, "loss": 0.1974, "reward": 2.072265625, "reward_std": 0.17898958921432495, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1772 }, { "clip_ratio": 0.0, "completion_length": 512.75, "epoch": 0.7092, "grad_norm": 0.13463841684463965, "kl": 3.84375, "learning_rate": 4.724509959361961e-06, "loss": 0.1533, "reward": 2.0078125, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1773 }, { "clip_ratio": 0.0, "completion_length": 628.25, "epoch": 0.7096, "grad_norm": 0.3925758298300971, "kl": 4.6640625, "learning_rate": 4.712653504538684e-06, "loss": 0.1868, "reward": 2.099609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1774 }, { "clip_ratio": 0.0, "completion_length": 671.5, "epoch": 0.71, "grad_norm": 0.14450270484616848, "kl": 4.5390625, "learning_rate": 4.700807357667953e-06, "loss": 0.1821, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1775 }, { "clip_ratio": 0.0, "completion_length": 805.25, "epoch": 0.7104, "grad_norm": 0.5549645959236602, "kl": 4.703125, "learning_rate": 4.688971541844436e-06, "loss": 0.1884, "reward": 1.953125, "reward_std": 0.3049037978053093, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1776 }, { "clip_ratio": 0.0, "completion_length": 716.375, "epoch": 0.7108, "grad_norm": 0.14787787509674588, "kl": 4.6171875, "learning_rate": 4.677146080142664e-06, "loss": 0.1847, "reward": 1.93359375, "reward_std": 0.20144016295671463, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 1777 }, { "clip_ratio": 0.0, "completion_length": 653.375, "epoch": 0.7112, "grad_norm": 0.10621414772619435, "kl": 4.4921875, "learning_rate": 4.6653309956169745e-06, "loss": 0.1802, "reward": 1.96875, "reward_std": 0.125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 1778 }, { "clip_ratio": 0.0, "completion_length": 569.5, "epoch": 0.7116, "grad_norm": 0.10877125782307326, "kl": 3.984375, "learning_rate": 4.6535263113014885e-06, "loss": 0.1594, "reward": 2.0859375, "reward_std": 0.059839196503162384, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1779 }, { "clip_ratio": 0.0, "completion_length": 763.0, "epoch": 0.712, "grad_norm": 0.15773241974864388, "kl": 4.9140625, "learning_rate": 4.641732050210032e-06, "loss": 0.1966, "reward": 1.916015625, "reward_std": 0.22657519578933716, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 1780 }, { "clip_ratio": 0.0, "completion_length": 742.125, "epoch": 0.7124, "grad_norm": 0.6436354038049266, "kl": 4.890625, "learning_rate": 4.629948235336133e-06, "loss": 0.1954, "reward": 1.92578125, "reward_std": 0.25726625323295593, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96484375, "step": 1781 }, { "clip_ratio": 0.0, "completion_length": 712.875, "epoch": 0.7128, "grad_norm": 0.08266596087025697, "kl": 4.16015625, "learning_rate": 4.618174889652928e-06, "loss": 0.1663, "reward": 2.21875, "reward_std": 0.125, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 1782 }, { "clip_ratio": 0.0, "completion_length": 804.125, "epoch": 0.7132, "grad_norm": 0.14905148658863937, "kl": 5.015625, "learning_rate": 4.606412036113166e-06, "loss": 0.2008, "reward": 1.9375, "reward_std": 0.22521330043673515, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 1783 }, { "clip_ratio": 0.0, "completion_length": 754.5, "epoch": 0.7136, "grad_norm": 0.4213416879868331, "kl": 4.703125, "learning_rate": 4.59465969764913e-06, "loss": 0.1882, "reward": 2.015625, "reward_std": 0.15358919650316238, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1784 }, { "clip_ratio": 0.0, "completion_length": 694.0, "epoch": 0.714, "grad_norm": 0.18654221195843523, "kl": 4.9296875, "learning_rate": 4.582917897172603e-06, "loss": 0.1968, "reward": 1.966796875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1785 }, { "clip_ratio": 0.0, "completion_length": 557.875, "epoch": 0.7144, "grad_norm": 0.44353908608626547, "kl": 4.6484375, "learning_rate": 4.571186657574828e-06, "loss": 0.1865, "reward": 2.115234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1786 }, { "clip_ratio": 0.0, "completion_length": 759.5, "epoch": 0.7148, "grad_norm": 0.1523731881450633, "kl": 4.04296875, "learning_rate": 4.559466001726451e-06, "loss": 0.1617, "reward": 2.072265625, "reward_std": 0.17627985030412674, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1787 }, { "clip_ratio": 0.0, "completion_length": 823.25, "epoch": 0.7152, "grad_norm": 0.3431663862038828, "kl": 5.0234375, "learning_rate": 4.5477559524775e-06, "loss": 0.2009, "reward": 2.078125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1788 }, { "clip_ratio": 0.0, "completion_length": 903.5, "epoch": 0.7156, "grad_norm": 0.2705350800751941, "kl": 4.6796875, "learning_rate": 4.53605653265731e-06, "loss": 0.1871, "reward": 1.904296875, "reward_std": 0.31122470647096634, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1789 }, { "clip_ratio": 0.0, "completion_length": 687.75, "epoch": 0.716, "grad_norm": 0.21479322681678775, "kl": 4.796875, "learning_rate": 4.524367765074499e-06, "loss": 0.1915, "reward": 1.986328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1790 }, { "clip_ratio": 0.0, "completion_length": 789.5, "epoch": 0.7164, "grad_norm": 0.4782116610120686, "kl": 4.3046875, "learning_rate": 4.512689672516918e-06, "loss": 0.1724, "reward": 2.046875, "reward_std": 0.23271455615758896, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1791 }, { "clip_ratio": 0.0, "completion_length": 814.25, "epoch": 0.7168, "grad_norm": 0.1503524447119106, "kl": 4.640625, "learning_rate": 4.501022277751602e-06, "loss": 0.1859, "reward": 1.953125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 1792 }, { "clip_ratio": 0.0, "completion_length": 851.0, "epoch": 0.7172, "grad_norm": 0.25433960431027486, "kl": 5.6328125, "learning_rate": 4.48936560352474e-06, "loss": 0.225, "reward": 2.064453125, "reward_std": 0.2421875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.970703125, "step": 1793 }, { "clip_ratio": 0.0, "completion_length": 836.375, "epoch": 0.7176, "grad_norm": 0.3300195335662055, "kl": 5.0703125, "learning_rate": 4.477719672561615e-06, "loss": 0.2029, "reward": 2.068359375, "reward_std": 0.31952521204948425, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 1794 }, { "clip_ratio": 0.0, "completion_length": 776.125, "epoch": 0.718, "grad_norm": 0.19715494320865187, "kl": 4.859375, "learning_rate": 4.46608450756656e-06, "loss": 0.1949, "reward": 2.111328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1795 }, { "clip_ratio": 0.0, "completion_length": 881.125, "epoch": 0.7184, "grad_norm": 0.4284980016338194, "kl": 5.3671875, "learning_rate": 4.4544601312229295e-06, "loss": 0.2146, "reward": 1.95703125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 1796 }, { "clip_ratio": 0.0, "completion_length": 751.0, "epoch": 0.7188, "grad_norm": 0.21322307062836246, "kl": 4.703125, "learning_rate": 4.442846566193034e-06, "loss": 0.1882, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1797 }, { "clip_ratio": 0.0, "completion_length": 803.1328125, "epoch": 0.7192, "grad_norm": 0.3106643428727401, "kl": 4.5078125, "learning_rate": 4.4312438351181246e-06, "loss": 0.1772, "reward": 1.923828125, "reward_std": 0.20758646726608276, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 1798 }, { "clip_ratio": 0.0, "completion_length": 813.5, "epoch": 0.7196, "grad_norm": 0.6173692320438593, "kl": 4.765625, "learning_rate": 4.419651960618302e-06, "loss": 0.1905, "reward": 1.908203125, "reward_std": 0.26215536147356033, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1799 }, { "clip_ratio": 0.0, "completion_length": 809.09375, "epoch": 0.72, "grad_norm": 0.2424601792398816, "kl": 4.671875, "learning_rate": 4.408070965292534e-06, "loss": 0.1768, "reward": 2.068359375, "reward_std": 0.20461758971214294, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1800 }, { "clip_ratio": 0.0, "completion_length": 827.375, "epoch": 0.7204, "grad_norm": 0.22701279688702378, "kl": 4.734375, "learning_rate": 4.3965008717185555e-06, "loss": 0.1892, "reward": 1.95703125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1801 }, { "clip_ratio": 0.0, "completion_length": 891.75, "epoch": 0.7208, "grad_norm": 0.1208142368176061, "kl": 4.4296875, "learning_rate": 4.384941702452856e-06, "loss": 0.1764, "reward": 1.904296875, "reward_std": 0.31617579609155655, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1802 }, { "clip_ratio": 0.0, "completion_length": 826.5, "epoch": 0.7212, "grad_norm": 0.18673242062379158, "kl": 4.828125, "learning_rate": 4.373393480030637e-06, "loss": 0.1928, "reward": 2.046875, "reward_std": 0.27289125323295593, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9609375, "step": 1803 }, { "clip_ratio": 0.0, "completion_length": 831.0546875, "epoch": 0.7216, "grad_norm": 0.18832439757103386, "kl": 4.4609375, "learning_rate": 4.361856226965733e-06, "loss": 0.1773, "reward": 2.072265625, "reward_std": 0.2109375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1804 }, { "clip_ratio": 0.0, "completion_length": 879.625, "epoch": 0.722, "grad_norm": 0.5278951665776279, "kl": 5.26953125, "learning_rate": 4.350329965750622e-06, "loss": 0.2104, "reward": 2.017578125, "reward_std": 0.3721490651369095, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 1805 }, { "clip_ratio": 0.0, "completion_length": 714.0, "epoch": 0.7224, "grad_norm": 0.5720916442106548, "kl": 4.15625, "learning_rate": 4.338814718856333e-06, "loss": 0.1661, "reward": 2.0, "reward_std": 0.21608919650316238, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1806 }, { "clip_ratio": 0.0, "completion_length": 657.140625, "epoch": 0.7228, "grad_norm": 0.3369596664159158, "kl": 4.09375, "learning_rate": 4.3273105087324375e-06, "loss": 0.1585, "reward": 1.978515625, "reward_std": 0.058983080089092255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 1807 }, { "clip_ratio": 0.0, "completion_length": 734.75, "epoch": 0.7232, "grad_norm": 0.11564833702068233, "kl": 4.5546875, "learning_rate": 4.315817357806974e-06, "loss": 0.1821, "reward": 1.95703125, "reward_std": 0.13226625323295593, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1808 }, { "clip_ratio": 0.0, "completion_length": 714.875, "epoch": 0.7236, "grad_norm": 0.3735381207238554, "kl": 4.49609375, "learning_rate": 4.304335288486426e-06, "loss": 0.1797, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1809 }, { "clip_ratio": 0.0, "completion_length": 615.0, "epoch": 0.724, "grad_norm": 0.08460331750856659, "kl": 4.4140625, "learning_rate": 4.292864323155684e-06, "loss": 0.1764, "reward": 1.986328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1810 }, { "clip_ratio": 0.0, "completion_length": 686.25, "epoch": 0.7244, "grad_norm": 0.07483930240521694, "kl": 3.92578125, "learning_rate": 4.281404484177974e-06, "loss": 0.1571, "reward": 1.98828125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 1811 }, { "clip_ratio": 0.0, "completion_length": 591.625, "epoch": 0.7248, "grad_norm": 0.21706572147280792, "kl": 4.171875, "learning_rate": 4.26995579389485e-06, "loss": 0.1673, "reward": 2.265625, "reward_std": 0.042695626616477966, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1812 }, { "clip_ratio": 0.0, "completion_length": 748.0, "epoch": 0.7252, "grad_norm": 0.34626725664017594, "kl": 4.2734375, "learning_rate": 4.258518274626103e-06, "loss": 0.1708, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1813 }, { "clip_ratio": 0.0, "completion_length": 637.375, "epoch": 0.7256, "grad_norm": 0.20119070509156345, "kl": 4.50390625, "learning_rate": 4.247091948669775e-06, "loss": 0.1805, "reward": 2.07421875, "reward_std": 0.10671419650316238, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 1814 }, { "clip_ratio": 0.0, "completion_length": 696.875, "epoch": 0.726, "grad_norm": 0.2595494028381985, "kl": 4.53125, "learning_rate": 4.235676838302069e-06, "loss": 0.1814, "reward": 2.02734375, "reward_std": 0.10671419650316238, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 1815 }, { "clip_ratio": 0.0, "completion_length": 784.375, "epoch": 0.7264, "grad_norm": 0.366811307746614, "kl": 4.34375, "learning_rate": 4.224272965777326e-06, "loss": 0.1735, "reward": 2.08984375, "reward_std": 0.140625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1816 }, { "clip_ratio": 0.0, "completion_length": 740.375, "epoch": 0.7268, "grad_norm": 0.16182087946919654, "kl": 4.4140625, "learning_rate": 4.21288035332798e-06, "loss": 0.1766, "reward": 2.068359375, "reward_std": 0.13435593992471695, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1817 }, { "clip_ratio": 0.0, "completion_length": 779.375, "epoch": 0.7272, "grad_norm": 0.6036740747424374, "kl": 4.6171875, "learning_rate": 4.201499023164508e-06, "loss": 0.1848, "reward": 2.072265625, "reward_std": 0.2109375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1818 }, { "clip_ratio": 0.0, "completion_length": 827.25, "epoch": 0.7276, "grad_norm": 0.13955610693419165, "kl": 4.3671875, "learning_rate": 4.190128997475402e-06, "loss": 0.1747, "reward": 1.99609375, "reward_std": 0.078125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 1819 }, { "clip_ratio": 0.0, "completion_length": 806.25, "epoch": 0.728, "grad_norm": 0.16329223769422826, "kl": 3.890625, "learning_rate": 4.178770298427107e-06, "loss": 0.1557, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.994140625, "step": 1820 }, { "clip_ratio": 0.0, "completion_length": 841.875, "epoch": 0.7284, "grad_norm": 0.30797496593090545, "kl": 4.171875, "learning_rate": 4.167422948163986e-06, "loss": 0.167, "reward": 2.021484375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1821 }, { "clip_ratio": 0.0, "completion_length": 812.5, "epoch": 0.7288, "grad_norm": 0.1662411081199423, "kl": 4.8046875, "learning_rate": 4.15608696880828e-06, "loss": 0.1923, "reward": 1.935546875, "reward_std": 0.18193094432353973, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1822 }, { "clip_ratio": 0.0, "completion_length": 792.25, "epoch": 0.7292, "grad_norm": 0.20100023108599258, "kl": 4.04296875, "learning_rate": 4.144762382460059e-06, "loss": 0.1614, "reward": 1.95703125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1823 }, { "clip_ratio": 0.0, "completion_length": 651.25, "epoch": 0.7296, "grad_norm": 0.1863813995819913, "kl": 3.390625, "learning_rate": 4.133449211197188e-06, "loss": 0.1355, "reward": 1.986328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1824 }, { "clip_ratio": 0.0, "completion_length": 718.25, "epoch": 0.73, "grad_norm": 2.4703769755981138, "kl": 4.37109375, "learning_rate": 4.12214747707527e-06, "loss": 0.1749, "reward": 2.212890625, "reward_std": 0.11377985030412674, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1825 }, { "clip_ratio": 0.0, "completion_length": 725.1875, "epoch": 0.7304, "grad_norm": 0.7025537058816537, "kl": 4.6640625, "learning_rate": 4.110857202127615e-06, "loss": 0.1741, "reward": 2.076171875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1826 }, { "clip_ratio": 0.0, "completion_length": 904.75, "epoch": 0.7308, "grad_norm": 0.2737896003242183, "kl": 5.33984375, "learning_rate": 4.099578408365192e-06, "loss": 0.2137, "reward": 1.904296875, "reward_std": 0.3624904975295067, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.958984375, "step": 1827 }, { "clip_ratio": 0.0, "completion_length": 741.1171875, "epoch": 0.7312, "grad_norm": 0.4435263435305774, "kl": 4.71875, "learning_rate": 4.08831111777658e-06, "loss": 0.1843, "reward": 2.0078125, "reward_std": 0.1461828052997589, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1828 }, { "clip_ratio": 0.0, "completion_length": 708.25, "epoch": 0.7316, "grad_norm": 0.34492668610025823, "kl": 4.21875, "learning_rate": 4.0770553523279535e-06, "loss": 0.1691, "reward": 2.224609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1829 }, { "clip_ratio": 0.0, "completion_length": 838.1796875, "epoch": 0.732, "grad_norm": 0.38034685774748, "kl": 5.0390625, "learning_rate": 4.065811133962987e-06, "loss": 0.1901, "reward": 1.943359375, "reward_std": 0.16950611770153046, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 1830 }, { "clip_ratio": 0.0, "completion_length": 637.0, "epoch": 0.7324, "grad_norm": 0.5469267359454458, "kl": 5.078125, "learning_rate": 4.05457848460287e-06, "loss": 0.2027, "reward": 1.98828125, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 1831 }, { "clip_ratio": 0.0, "completion_length": 843.375, "epoch": 0.7328, "grad_norm": 0.4734108060627742, "kl": 5.0546875, "learning_rate": 4.04335742614622e-06, "loss": 0.202, "reward": 2.1015625, "reward_std": 0.2778356857597828, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 1832 }, { "clip_ratio": 0.0, "completion_length": 862.0, "epoch": 0.7332, "grad_norm": 0.29723051462068634, "kl": 4.875, "learning_rate": 4.032147980469072e-06, "loss": 0.1953, "reward": 1.92578125, "reward_std": 0.26492708921432495, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1833 }, { "clip_ratio": 0.0, "completion_length": 698.375, "epoch": 0.7336, "grad_norm": 0.13624450642293573, "kl": 5.015625, "learning_rate": 4.020950169424815e-06, "loss": 0.2008, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1834 }, { "clip_ratio": 0.0, "completion_length": 795.5, "epoch": 0.734, "grad_norm": 0.1503286168230817, "kl": 4.6796875, "learning_rate": 4.009764014844143e-06, "loss": 0.1873, "reward": 2.05078125, "reward_std": 0.203125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1835 }, { "clip_ratio": 0.0, "completion_length": 816.5, "epoch": 0.7344, "grad_norm": 0.46850238399242833, "kl": 4.703125, "learning_rate": 3.998589538535046e-06, "loss": 0.188, "reward": 2.193359375, "reward_std": 0.23508089780807495, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1836 }, { "clip_ratio": 0.0, "completion_length": 876.5, "epoch": 0.7348, "grad_norm": 0.1933613143176049, "kl": 5.81640625, "learning_rate": 3.987426762282733e-06, "loss": 0.2323, "reward": 1.947265625, "reward_std": 0.34301088005304337, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.962890625, "step": 1837 }, { "clip_ratio": 0.0, "completion_length": 850.1484375, "epoch": 0.7352, "grad_norm": 0.1700879339120217, "kl": 4.9921875, "learning_rate": 3.976275707849616e-06, "loss": 0.1911, "reward": 2.037109375, "reward_std": 0.28680288046598434, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1838 }, { "clip_ratio": 0.0, "completion_length": 843.5, "epoch": 0.7356, "grad_norm": 1.1739168742318724, "kl": 5.0546875, "learning_rate": 3.965136396975235e-06, "loss": 0.202, "reward": 1.90234375, "reward_std": 0.2996155843138695, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.96484375, "step": 1839 }, { "clip_ratio": 0.0, "completion_length": 902.875, "epoch": 0.736, "grad_norm": 0.45772177338459413, "kl": 5.0234375, "learning_rate": 3.954008851376252e-06, "loss": 0.2011, "reward": 1.916015625, "reward_std": 0.275733582675457, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 1840 }, { "clip_ratio": 0.0, "completion_length": 832.5, "epoch": 0.7364, "grad_norm": 0.2321112691911793, "kl": 5.6796875, "learning_rate": 3.942893092746387e-06, "loss": 0.2275, "reward": 1.92578125, "reward_std": 0.27208830416202545, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1841 }, { "clip_ratio": 0.0, "completion_length": 675.125, "epoch": 0.7368, "grad_norm": 0.5058148372090417, "kl": 4.8671875, "learning_rate": 3.931789142756377e-06, "loss": 0.1956, "reward": 2.078125, "reward_std": 0.2530868798494339, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 1842 }, { "clip_ratio": 0.0, "completion_length": 761.25, "epoch": 0.7372, "grad_norm": 0.21671953453871554, "kl": 5.6640625, "learning_rate": 3.920697023053949e-06, "loss": 0.2266, "reward": 1.919921875, "reward_std": 0.2546844184398651, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 1843 }, { "clip_ratio": 0.0, "completion_length": 671.25, "epoch": 0.7376, "grad_norm": 0.3319431083887212, "kl": 5.1171875, "learning_rate": 3.9096167552637454e-06, "loss": 0.2049, "reward": 2.0, "reward_std": 0.0625, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 1.0, "step": 1844 }, { "clip_ratio": 0.0, "completion_length": 878.0, "epoch": 0.738, "grad_norm": 0.2877079290668115, "kl": 5.1171875, "learning_rate": 3.898548360987325e-06, "loss": 0.2045, "reward": 1.927734375, "reward_std": 0.26430703699588776, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.974609375, "step": 1845 }, { "clip_ratio": 0.0, "completion_length": 837.625, "epoch": 0.7384, "grad_norm": 0.22841497529912885, "kl": 5.1953125, "learning_rate": 3.887491861803085e-06, "loss": 0.2074, "reward": 1.953125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1846 }, { "clip_ratio": 0.0, "completion_length": 750.0078125, "epoch": 0.7388, "grad_norm": 0.3744350589342204, "kl": 4.2578125, "learning_rate": 3.876447279266238e-06, "loss": 0.1659, "reward": 2.095703125, "reward_std": 0.3105265200138092, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.978515625, "step": 1847 }, { "clip_ratio": 0.0, "completion_length": 755.625, "epoch": 0.7392, "grad_norm": 0.4958459989630482, "kl": 5.3515625, "learning_rate": 3.86541463490876e-06, "loss": 0.2139, "reward": 1.947265625, "reward_std": 0.18398308008909225, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.986328125, "step": 1848 }, { "clip_ratio": 0.0, "completion_length": 717.875, "epoch": 0.7396, "grad_norm": 0.45138812598477945, "kl": 5.0859375, "learning_rate": 3.854393950239356e-06, "loss": 0.2033, "reward": 2.091796875, "reward_std": 0.16152116656303406, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 1849 }, { "clip_ratio": 0.0, "completion_length": 630.0, "epoch": 0.74, "grad_norm": 0.09629792148584344, "kl": 5.078125, "learning_rate": 3.8433852467434175e-06, "loss": 0.2034, "reward": 1.994140625, "reward_std": 0.10220151394605637, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1850 }, { "clip_ratio": 0.0, "completion_length": 880.53125, "epoch": 0.7404, "grad_norm": 0.27751540522397955, "kl": 4.95703125, "learning_rate": 3.832388545882975e-06, "loss": 0.1913, "reward": 1.970703125, "reward_std": 0.3752254769206047, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 1851 }, { "clip_ratio": 0.0, "completion_length": 668.375, "epoch": 0.7408, "grad_norm": 1.3890215214509696, "kl": 4.74609375, "learning_rate": 3.821403869096658e-06, "loss": 0.1901, "reward": 1.96875, "reward_std": 0.125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 1852 }, { "clip_ratio": 0.0, "completion_length": 840.625, "epoch": 0.7412, "grad_norm": 0.27786152161953986, "kl": 5.375, "learning_rate": 3.810431237799657e-06, "loss": 0.2153, "reward": 1.92578125, "reward_std": 0.2667730376124382, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1853 }, { "clip_ratio": 0.0, "completion_length": 881.75, "epoch": 0.7416, "grad_norm": 0.5260126086642193, "kl": 5.25, "learning_rate": 3.7994706733836738e-06, "loss": 0.2098, "reward": 2.197265625, "reward_std": 0.2109375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.986328125, "step": 1854 }, { "clip_ratio": 0.0, "completion_length": 675.28125, "epoch": 0.742, "grad_norm": 0.24639807532454291, "kl": 4.71484375, "learning_rate": 3.7885221972168974e-06, "loss": 0.1813, "reward": 1.9453125, "reward_std": 0.19399453699588776, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.984375, "step": 1855 }, { "clip_ratio": 0.0, "completion_length": 662.875, "epoch": 0.7424, "grad_norm": 0.12766278992097654, "kl": 4.61328125, "learning_rate": 3.7775858306439374e-06, "loss": 0.1844, "reward": 2.103515625, "reward_std": 0.17949789762496948, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 1856 }, { "clip_ratio": 0.0, "completion_length": 584.0, "epoch": 0.7428, "grad_norm": 0.31734020088784637, "kl": 4.5546875, "learning_rate": 3.766661594985801e-06, "loss": 0.1825, "reward": 2.224609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1857 }, { "clip_ratio": 0.0, "completion_length": 669.6875, "epoch": 0.7432, "grad_norm": 0.2099497950973643, "kl": 4.8828125, "learning_rate": 3.7557495115398446e-06, "loss": 0.1868, "reward": 2.01953125, "reward_std": 0.20466843992471695, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1858 }, { "clip_ratio": 0.0, "completion_length": 607.25, "epoch": 0.7436, "grad_norm": 0.12113042963231997, "kl": 4.296875, "learning_rate": 3.7448496015797296e-06, "loss": 0.172, "reward": 2.111328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1859 }, { "clip_ratio": 0.0, "completion_length": 846.375, "epoch": 0.744, "grad_norm": 0.7224845464611345, "kl": 5.40625, "learning_rate": 3.7339618863553983e-06, "loss": 0.2167, "reward": 1.9296875, "reward_std": 0.22974662482738495, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9765625, "step": 1860 }, { "clip_ratio": 0.0, "completion_length": 846.5, "epoch": 0.7444, "grad_norm": 0.34520115237476184, "kl": 5.1875, "learning_rate": 3.723086387092997e-06, "loss": 0.2076, "reward": 1.919921875, "reward_std": 0.23412227630615234, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.974609375, "step": 1861 }, { "clip_ratio": 0.0, "completion_length": 821.0, "epoch": 0.7448, "grad_norm": 0.13097219475929708, "kl": 4.9453125, "learning_rate": 3.7122231249948747e-06, "loss": 0.198, "reward": 1.9140625, "reward_std": 0.27544331550598145, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9609375, "step": 1862 }, { "clip_ratio": 0.0, "completion_length": 716.125, "epoch": 0.7452, "grad_norm": 0.2798679444399969, "kl": 5.6171875, "learning_rate": 3.7013721212395128e-06, "loss": 0.2244, "reward": 1.951171875, "reward_std": 0.1652105376124382, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1863 }, { "clip_ratio": 0.0, "completion_length": 727.125, "epoch": 0.7456, "grad_norm": 0.16625321974283297, "kl": 5.078125, "learning_rate": 3.6905333969815038e-06, "loss": 0.2036, "reward": 1.947265625, "reward_std": 0.17627985030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1864 }, { "clip_ratio": 0.0, "completion_length": 644.125, "epoch": 0.746, "grad_norm": 0.987380174192636, "kl": 5.2109375, "learning_rate": 3.679706973351491e-06, "loss": 0.2086, "reward": 2.00390625, "reward_std": 0.2748293802142143, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1865 }, { "clip_ratio": 0.0, "completion_length": 653.4375, "epoch": 0.7464, "grad_norm": 7.881458910503114, "kl": 5.078125, "learning_rate": 3.6688928714561444e-06, "loss": 0.1996, "reward": 2.10546875, "reward_std": 0.18332062661647797, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1866 }, { "clip_ratio": 0.0, "completion_length": 777.875, "epoch": 0.7468, "grad_norm": 0.23612400861437327, "kl": 4.9453125, "learning_rate": 3.658091112378106e-06, "loss": 0.1979, "reward": 2.044921875, "reward_std": 0.2400214672088623, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 1867 }, { "clip_ratio": 0.0, "completion_length": 768.625, "epoch": 0.7472, "grad_norm": 0.28015689529323845, "kl": 5.4296875, "learning_rate": 3.6473017171759563e-06, "loss": 0.217, "reward": 1.970703125, "reward_std": 0.2743890583515167, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1868 }, { "clip_ratio": 0.0, "completion_length": 910.875, "epoch": 0.7476, "grad_norm": 0.3158941203301619, "kl": 5.6796875, "learning_rate": 3.636524706884181e-06, "loss": 0.227, "reward": 2.17578125, "reward_std": 0.26992058008909225, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1869 }, { "clip_ratio": 0.0, "completion_length": 814.0, "epoch": 0.748, "grad_norm": 0.3338271621282442, "kl": 5.15625, "learning_rate": 3.625760102513103e-06, "loss": 0.2061, "reward": 2.04296875, "reward_std": 0.25556667894124985, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1870 }, { "clip_ratio": 0.0, "completion_length": 761.375, "epoch": 0.7484, "grad_norm": 0.37683457077176347, "kl": 4.7578125, "learning_rate": 3.615007925048878e-06, "loss": 0.19, "reward": 2.107421875, "reward_std": 0.24336913973093033, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1871 }, { "clip_ratio": 0.0, "completion_length": 599.875, "epoch": 0.7488, "grad_norm": 0.23211305005375005, "kl": 4.3125, "learning_rate": 3.604268195453421e-06, "loss": 0.1722, "reward": 1.99609375, "reward_std": 0.015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.99609375, "step": 1872 }, { "clip_ratio": 0.0, "completion_length": 703.0, "epoch": 0.7492, "grad_norm": 0.23518275781891185, "kl": 5.25, "learning_rate": 3.5935409346643835e-06, "loss": 0.2098, "reward": 2.09765625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1873 }, { "clip_ratio": 0.0, "completion_length": 686.5, "epoch": 0.7496, "grad_norm": 0.41640688091507816, "kl": 4.484375, "learning_rate": 3.582826163595119e-06, "loss": 0.1793, "reward": 1.98046875, "reward_std": 0.078125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 1874 }, { "clip_ratio": 0.0, "completion_length": 783.75, "epoch": 0.75, "grad_norm": 0.5864898532200217, "kl": 4.71875, "learning_rate": 3.5721239031346067e-06, "loss": 0.1888, "reward": 2.046875, "reward_std": 0.24190182238817215, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1875 }, { "clip_ratio": 0.0, "completion_length": 854.0, "epoch": 0.7504, "grad_norm": 0.36656669607802866, "kl": 5.0703125, "learning_rate": 3.5614341741474633e-06, "loss": 0.2025, "reward": 1.94921875, "reward_std": 0.203125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1876 }, { "clip_ratio": 0.0, "completion_length": 873.5, "epoch": 0.7508, "grad_norm": 0.8075213014674045, "kl": 5.3515625, "learning_rate": 3.5507569974738575e-06, "loss": 0.214, "reward": 2.166015625, "reward_std": 0.34003299474716187, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1877 }, { "clip_ratio": 0.0, "completion_length": 833.125, "epoch": 0.7512, "grad_norm": 1.8940271227453154, "kl": 5.15625, "learning_rate": 3.540092393929494e-06, "loss": 0.206, "reward": 1.908203125, "reward_std": 0.2609727308154106, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1878 }, { "clip_ratio": 0.0, "completion_length": 634.5625, "epoch": 0.7516, "grad_norm": 0.36763144591908814, "kl": 4.4765625, "learning_rate": 3.5294403843055604e-06, "loss": 0.1778, "reward": 2.021484375, "reward_std": 0.25711458921432495, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 1879 }, { "clip_ratio": 0.0, "completion_length": 727.25, "epoch": 0.752, "grad_norm": 0.24267254802395943, "kl": 4.90625, "learning_rate": 3.5188009893686916e-06, "loss": 0.1966, "reward": 1.966796875, "reward_std": 0.1328125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1880 }, { "clip_ratio": 0.0, "completion_length": 758.875, "epoch": 0.7524, "grad_norm": 0.38229346586517693, "kl": 5.0625, "learning_rate": 3.50817422986094e-06, "loss": 0.2031, "reward": 1.95703125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1881 }, { "clip_ratio": 0.0, "completion_length": 900.375, "epoch": 0.7528, "grad_norm": 0.32034424244103954, "kl": 5.3671875, "learning_rate": 3.4975601264997094e-06, "loss": 0.2148, "reward": 1.923828125, "reward_std": 0.3046875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 1882 }, { "clip_ratio": 0.0, "completion_length": 791.125, "epoch": 0.7532, "grad_norm": 0.10603605018908083, "kl": 4.7265625, "learning_rate": 3.4869586999777492e-06, "loss": 0.1887, "reward": 1.943359375, "reward_std": 0.19277116656303406, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 1883 }, { "clip_ratio": 0.0, "completion_length": 698.375, "epoch": 0.7536, "grad_norm": 0.222180370954364, "kl": 4.6484375, "learning_rate": 3.476369970963072e-06, "loss": 0.1862, "reward": 2.083984375, "reward_std": 0.13710808008909225, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.990234375, "step": 1884 }, { "clip_ratio": 0.0, "completion_length": 783.875, "epoch": 0.754, "grad_norm": 1.2079470679598006, "kl": 5.4296875, "learning_rate": 3.4657939600989453e-06, "loss": 0.2169, "reward": 1.921875, "reward_std": 0.25185415148735046, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1885 }, { "clip_ratio": 0.0, "completion_length": 812.0, "epoch": 0.7544, "grad_norm": 0.422460495901576, "kl": 4.6015625, "learning_rate": 3.455230688003852e-06, "loss": 0.184, "reward": 2.03515625, "reward_std": 0.30097050219774246, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96484375, "step": 1886 }, { "clip_ratio": 0.0, "completion_length": 677.75, "epoch": 0.7548, "grad_norm": 0.0990030238844976, "kl": 4.11328125, "learning_rate": 3.4446801752714287e-06, "loss": 0.1645, "reward": 1.9765625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1887 }, { "clip_ratio": 0.0, "completion_length": 713.125, "epoch": 0.7552, "grad_norm": 0.24205807668986862, "kl": 4.7578125, "learning_rate": 3.4341424424704373e-06, "loss": 0.1902, "reward": 2.052734375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1888 }, { "clip_ratio": 0.0, "completion_length": 679.5, "epoch": 0.7556, "grad_norm": 0.22470911800327992, "kl": 4.1171875, "learning_rate": 3.4236175101447265e-06, "loss": 0.165, "reward": 2.171875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 1889 }, { "clip_ratio": 0.0, "completion_length": 720.25, "epoch": 0.756, "grad_norm": 0.2363082569809509, "kl": 5.015625, "learning_rate": 3.4131053988131947e-06, "loss": 0.2001, "reward": 2.099609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1890 }, { "clip_ratio": 0.0, "completion_length": 656.0, "epoch": 0.7564, "grad_norm": 0.3588120925623334, "kl": 4.8515625, "learning_rate": 3.4026061289697397e-06, "loss": 0.1941, "reward": 2.13671875, "reward_std": 0.2590266987681389, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1891 }, { "clip_ratio": 0.0, "completion_length": 696.25, "epoch": 0.7568, "grad_norm": 0.11821380493730473, "kl": 4.21875, "learning_rate": 3.3921197210832235e-06, "loss": 0.169, "reward": 1.92578125, "reward_std": 0.2230147123336792, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96484375, "step": 1892 }, { "clip_ratio": 0.0, "completion_length": 732.171875, "epoch": 0.7572, "grad_norm": 0.12676279091877984, "kl": 4.09375, "learning_rate": 3.381646195597437e-06, "loss": 0.1599, "reward": 2.18359375, "reward_std": 0.20173170417547226, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 1893 }, { "clip_ratio": 0.0, "completion_length": 821.625, "epoch": 0.7576, "grad_norm": 0.22330984737646972, "kl": 5.109375, "learning_rate": 3.3711855729310482e-06, "loss": 0.2045, "reward": 1.9921875, "reward_std": 0.28758031129837036, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1894 }, { "clip_ratio": 0.0, "completion_length": 723.125, "epoch": 0.758, "grad_norm": 0.18033067429607802, "kl": 4.25390625, "learning_rate": 3.360737873477584e-06, "loss": 0.1704, "reward": 2.095703125, "reward_std": 0.0802573561668396, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1895 }, { "clip_ratio": 0.0, "completion_length": 723.625, "epoch": 0.7584, "grad_norm": 0.8255764273172531, "kl": 4.3046875, "learning_rate": 3.3503031176053657e-06, "loss": 0.1718, "reward": 1.955078125, "reward_std": 0.14007875323295593, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.978515625, "step": 1896 }, { "clip_ratio": 0.0, "completion_length": 751.75, "epoch": 0.7588, "grad_norm": 0.15860515107366122, "kl": 4.4296875, "learning_rate": 3.3398813256574847e-06, "loss": 0.1775, "reward": 1.953125, "reward_std": 0.19313044100999832, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1897 }, { "clip_ratio": 0.0, "completion_length": 763.625, "epoch": 0.7592, "grad_norm": 0.173823628217027, "kl": 4.05859375, "learning_rate": 3.3294725179517573e-06, "loss": 0.1627, "reward": 1.96484375, "reward_std": 0.1105230376124382, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1898 }, { "clip_ratio": 0.0, "completion_length": 823.25, "epoch": 0.7596, "grad_norm": 0.09642978999538618, "kl": 4.703125, "learning_rate": 3.3190767147806825e-06, "loss": 0.1881, "reward": 2.001953125, "reward_std": 0.11435256898403168, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 1899 }, { "clip_ratio": 0.0, "completion_length": 801.625, "epoch": 0.76, "grad_norm": 0.5803373014195239, "kl": 4.30078125, "learning_rate": 3.308693936411421e-06, "loss": 0.1718, "reward": 1.94921875, "reward_std": 0.16846735030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1900 }, { "clip_ratio": 0.0, "completion_length": 777.125, "epoch": 0.7604, "grad_norm": 0.32979104782878205, "kl": 4.671875, "learning_rate": 3.2983242030857177e-06, "loss": 0.1872, "reward": 1.958984375, "reward_std": 0.13211458921432495, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1901 }, { "clip_ratio": 0.0, "completion_length": 731.4296875, "epoch": 0.7608, "grad_norm": 0.28710823936914126, "kl": 4.21484375, "learning_rate": 3.287967535019908e-06, "loss": 0.1607, "reward": 1.9609375, "reward_std": 0.1261480376124382, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1902 }, { "clip_ratio": 0.0, "completion_length": 877.875, "epoch": 0.7612, "grad_norm": 0.22188700751738297, "kl": 5.1171875, "learning_rate": 3.2776239524048426e-06, "loss": 0.2045, "reward": 1.9765625, "reward_std": 0.2905155047774315, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 1903 }, { "clip_ratio": 0.0, "completion_length": 694.3125, "epoch": 0.7616, "grad_norm": 0.15148831267189394, "kl": 3.9765625, "learning_rate": 3.2672934754058615e-06, "loss": 0.1487, "reward": 1.98828125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 1904 }, { "clip_ratio": 0.0, "completion_length": 876.125, "epoch": 0.762, "grad_norm": 0.5100151601643448, "kl": 5.078125, "learning_rate": 3.2569761241627694e-06, "loss": 0.203, "reward": 2.046875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1905 }, { "clip_ratio": 0.0, "completion_length": 662.25, "epoch": 0.7624, "grad_norm": 0.13225112372602038, "kl": 4.42578125, "learning_rate": 3.2466719187897555e-06, "loss": 0.1771, "reward": 1.9609375, "reward_std": 0.1275520622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1906 }, { "clip_ratio": 0.0, "completion_length": 692.75, "epoch": 0.7628, "grad_norm": 0.04839738727727729, "kl": 4.5625, "learning_rate": 3.2363808793754082e-06, "loss": 0.1822, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1907 }, { "clip_ratio": 0.0, "completion_length": 789.375, "epoch": 0.7632, "grad_norm": 0.1897092183279878, "kl": 4.328125, "learning_rate": 3.2261030259826287e-06, "loss": 0.173, "reward": 1.974609375, "reward_std": 0.1138886883854866, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1908 }, { "clip_ratio": 0.0, "completion_length": 863.625, "epoch": 0.7636, "grad_norm": 0.6228949567504728, "kl": 4.6328125, "learning_rate": 3.2158383786486204e-06, "loss": 0.1851, "reward": 1.9453125, "reward_std": 0.21875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1909 }, { "clip_ratio": 0.0, "completion_length": 819.125, "epoch": 0.764, "grad_norm": 0.12968578462741634, "kl": 5.140625, "learning_rate": 3.2055869573848374e-06, "loss": 0.2059, "reward": 1.93359375, "reward_std": 0.2579425275325775, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 1910 }, { "clip_ratio": 0.0, "completion_length": 719.875, "epoch": 0.7644, "grad_norm": 0.11489317599525285, "kl": 4.13671875, "learning_rate": 3.195348782176948e-06, "loss": 0.1657, "reward": 1.986328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1911 }, { "clip_ratio": 0.0, "completion_length": 714.25, "epoch": 0.7648, "grad_norm": 0.2969725735326738, "kl": 4.734375, "learning_rate": 3.1851238729848033e-06, "loss": 0.1896, "reward": 2.193359375, "reward_std": 0.15746419876813889, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1912 }, { "clip_ratio": 0.0, "completion_length": 708.125, "epoch": 0.7652, "grad_norm": 0.16006480321969588, "kl": 4.90625, "learning_rate": 3.174912249742382e-06, "loss": 0.1964, "reward": 1.9453125, "reward_std": 0.18495866656303406, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1913 }, { "clip_ratio": 0.0, "completion_length": 813.375, "epoch": 0.7656, "grad_norm": 0.1854781220446725, "kl": 4.6796875, "learning_rate": 3.164713932357776e-06, "loss": 0.1866, "reward": 2.005859375, "reward_std": 0.2640867307782173, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1914 }, { "clip_ratio": 0.0, "completion_length": 879.75, "epoch": 0.766, "grad_norm": 1.0065262451966153, "kl": 4.8828125, "learning_rate": 3.1545289407131128e-06, "loss": 0.1952, "reward": 1.974609375, "reward_std": 0.2890625, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1915 }, { "clip_ratio": 0.0, "completion_length": 706.75, "epoch": 0.7664, "grad_norm": 0.1758160714332407, "kl": 4.09765625, "learning_rate": 3.144357294664565e-06, "loss": 0.1639, "reward": 1.98046875, "reward_std": 0.078125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 1916 }, { "clip_ratio": 0.0, "completion_length": 893.625, "epoch": 0.7668, "grad_norm": 0.3014687518989877, "kl": 5.1875, "learning_rate": 3.134199014042274e-06, "loss": 0.2072, "reward": 1.892578125, "reward_std": 0.33070748299360275, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 1917 }, { "clip_ratio": 0.0, "completion_length": 828.875, "epoch": 0.7672, "grad_norm": 0.13503731573241967, "kl": 4.6484375, "learning_rate": 3.124054118650327e-06, "loss": 0.1861, "reward": 1.953125, "reward_std": 0.19313043355941772, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1918 }, { "clip_ratio": 0.0, "completion_length": 705.625, "epoch": 0.7676, "grad_norm": 0.11169091695816342, "kl": 4.125, "learning_rate": 3.113922628266718e-06, "loss": 0.1652, "reward": 2.205078125, "reward_std": 0.14696048200130463, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1919 }, { "clip_ratio": 0.0, "completion_length": 677.0, "epoch": 0.768, "grad_norm": 0.15867167010099886, "kl": 4.6328125, "learning_rate": 3.103804562643302e-06, "loss": 0.1849, "reward": 2.056640625, "reward_std": 0.14009655267000198, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1920 }, { "clip_ratio": 0.0, "completion_length": 667.5, "epoch": 0.7684, "grad_norm": 0.15709602683134435, "kl": 3.734375, "learning_rate": 3.0936999415057712e-06, "loss": 0.1494, "reward": 1.978515625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 1921 }, { "clip_ratio": 0.0, "completion_length": 564.5, "epoch": 0.7688, "grad_norm": 0.0460388979677555, "kl": 4.19921875, "learning_rate": 3.0836087845536e-06, "loss": 0.1683, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1922 }, { "clip_ratio": 0.0, "completion_length": 726.625, "epoch": 0.7692, "grad_norm": 0.29722594491373777, "kl": 3.515625, "learning_rate": 3.073531111460013e-06, "loss": 0.1405, "reward": 2.0859375, "reward_std": 0.1193198561668396, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1923 }, { "clip_ratio": 0.0, "completion_length": 762.375, "epoch": 0.7696, "grad_norm": 0.5058478901322465, "kl": 4.69140625, "learning_rate": 3.063466941871952e-06, "loss": 0.1875, "reward": 1.94140625, "reward_std": 0.18320196866989136, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.98046875, "step": 1924 }, { "clip_ratio": 0.0, "completion_length": 955.125, "epoch": 0.77, "grad_norm": 0.6270882904653112, "kl": 4.90625, "learning_rate": 3.0534162954100264e-06, "loss": 0.1962, "reward": 1.904296875, "reward_std": 0.3458823561668396, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1925 }, { "clip_ratio": 0.0, "completion_length": 821.5, "epoch": 0.7704, "grad_norm": 0.3041375003137973, "kl": 4.234375, "learning_rate": 3.043379191668492e-06, "loss": 0.1694, "reward": 2.0859375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1926 }, { "clip_ratio": 0.0, "completion_length": 824.875, "epoch": 0.7708, "grad_norm": 0.2216012872769585, "kl": 4.5546875, "learning_rate": 3.033355650215193e-06, "loss": 0.1817, "reward": 1.978515625, "reward_std": 0.34157584607601166, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1927 }, { "clip_ratio": 0.0, "completion_length": 575.625, "epoch": 0.7712, "grad_norm": 0.032388191136781264, "kl": 3.94921875, "learning_rate": 3.023345690591537e-06, "loss": 0.1576, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1928 }, { "clip_ratio": 0.0, "completion_length": 733.25, "epoch": 0.7716, "grad_norm": 2.1791627107367275, "kl": 3.9140625, "learning_rate": 3.013349332312451e-06, "loss": 0.1566, "reward": 2.05078125, "reward_std": 0.13706252723932266, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.99609375, "step": 1929 }, { "clip_ratio": 0.0, "completion_length": 699.75, "epoch": 0.772, "grad_norm": 9.264843078276192, "kl": 3.859375, "learning_rate": 3.003366594866345e-06, "loss": 0.1543, "reward": 1.984375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 1930 }, { "clip_ratio": 0.0, "completion_length": 707.375, "epoch": 0.7724, "grad_norm": 0.1214045064490477, "kl": 4.1484375, "learning_rate": 2.993397497715086e-06, "loss": 0.1661, "reward": 2.044921875, "reward_std": 0.10310593992471695, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1931 }, { "clip_ratio": 0.0, "completion_length": 809.75, "epoch": 0.7728, "grad_norm": 0.4240191856891181, "kl": 4.9296875, "learning_rate": 2.983442060293926e-06, "loss": 0.1971, "reward": 2.099609375, "reward_std": 0.2457016110420227, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1932 }, { "clip_ratio": 0.0, "completion_length": 617.625, "epoch": 0.7732, "grad_norm": 0.23830639704136664, "kl": 4.38671875, "learning_rate": 2.9735003020115095e-06, "loss": 0.1755, "reward": 2.076171875, "reward_std": 0.1666145622730255, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1933 }, { "clip_ratio": 0.0, "completion_length": 876.875, "epoch": 0.7736, "grad_norm": 0.1910831325885423, "kl": 4.9609375, "learning_rate": 2.963572242249799e-06, "loss": 0.1981, "reward": 1.931640625, "reward_std": 0.26559244096279144, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1934 }, { "clip_ratio": 0.0, "completion_length": 790.2734375, "epoch": 0.774, "grad_norm": 0.14970397475111147, "kl": 3.85546875, "learning_rate": 2.953657900364053e-06, "loss": 0.142, "reward": 2.07421875, "reward_std": 0.17117708921432495, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1935 }, { "clip_ratio": 0.0, "completion_length": 734.796875, "epoch": 0.7744, "grad_norm": 0.3053665534226952, "kl": 5.15625, "learning_rate": 2.9437572956827965e-06, "loss": 0.1982, "reward": 1.935546875, "reward_std": 0.22315485030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1936 }, { "clip_ratio": 0.0, "completion_length": 762.875, "epoch": 0.7748, "grad_norm": 0.19031642353237993, "kl": 3.8203125, "learning_rate": 2.9338704475077527e-06, "loss": 0.1526, "reward": 1.94921875, "reward_std": 0.2753315046429634, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.95703125, "step": 1937 }, { "clip_ratio": 0.0, "completion_length": 880.796875, "epoch": 0.7752, "grad_norm": 0.4480674392435946, "kl": 4.765625, "learning_rate": 2.9239973751138495e-06, "loss": 0.186, "reward": 1.912109375, "reward_std": 0.34069089591503143, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 1938 }, { "clip_ratio": 0.0, "completion_length": 802.75, "epoch": 0.7756, "grad_norm": 0.3494661211575079, "kl": 4.546875, "learning_rate": 2.9141380977491373e-06, "loss": 0.1817, "reward": 1.966796875, "reward_std": 0.40248046815395355, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.951171875, "step": 1939 }, { "clip_ratio": 0.0, "completion_length": 911.25, "epoch": 0.776, "grad_norm": 1.6012889420488814, "kl": 4.6171875, "learning_rate": 2.9042926346347932e-06, "loss": 0.1853, "reward": 2.017578125, "reward_std": 0.36130431294441223, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 1940 }, { "clip_ratio": 0.0, "completion_length": 747.5, "epoch": 0.7764, "grad_norm": 0.1120821233615595, "kl": 3.609375, "learning_rate": 2.8944610049650377e-06, "loss": 0.1443, "reward": 1.94921875, "reward_std": 0.12724344432353973, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1941 }, { "clip_ratio": 0.0, "completion_length": 929.75, "epoch": 0.7768, "grad_norm": 0.217326090856752, "kl": 5.328125, "learning_rate": 2.884643227907147e-06, "loss": 0.2133, "reward": 2.15625, "reward_std": 0.3448980450630188, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 1942 }, { "clip_ratio": 0.0, "completion_length": 785.625, "epoch": 0.7772, "grad_norm": 0.32658435709198647, "kl": 5.1796875, "learning_rate": 2.874839322601375e-06, "loss": 0.2072, "reward": 2.02734375, "reward_std": 0.27263741195201874, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 1943 }, { "clip_ratio": 0.0, "completion_length": 801.375, "epoch": 0.7776, "grad_norm": 0.21269734653495986, "kl": 5.4375, "learning_rate": 2.8650493081609344e-06, "loss": 0.2174, "reward": 1.970703125, "reward_std": 0.20149878412485123, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1944 }, { "clip_ratio": 0.0, "completion_length": 744.0, "epoch": 0.778, "grad_norm": 0.1897775883453442, "kl": 4.9140625, "learning_rate": 2.855273203671969e-06, "loss": 0.1964, "reward": 2.072265625, "reward_std": 0.1822395622730255, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1945 }, { "clip_ratio": 0.0, "completion_length": 804.875, "epoch": 0.7784, "grad_norm": 0.21501069975873457, "kl": 4.8359375, "learning_rate": 2.8455110281934804e-06, "loss": 0.1932, "reward": 2.099609375, "reward_std": 0.2948591262102127, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.974609375, "step": 1946 }, { "clip_ratio": 0.0, "completion_length": 845.625, "epoch": 0.7788, "grad_norm": 0.1328590738045276, "kl": 4.80078125, "learning_rate": 2.8357628007573412e-06, "loss": 0.1918, "reward": 2.064453125, "reward_std": 0.2120855450630188, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1947 }, { "clip_ratio": 0.0, "completion_length": 913.375, "epoch": 0.7792, "grad_norm": 0.1661074869420633, "kl": 4.7578125, "learning_rate": 2.8260285403682153e-06, "loss": 0.1903, "reward": 1.904296875, "reward_std": 0.3184482902288437, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1948 }, { "clip_ratio": 0.0, "completion_length": 677.75, "epoch": 0.7796, "grad_norm": 0.21008013973194148, "kl": 4.7890625, "learning_rate": 2.816308266003541e-06, "loss": 0.1914, "reward": 2.0703125, "reward_std": 0.22754128277301788, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1949 }, { "clip_ratio": 0.0, "completion_length": 853.25, "epoch": 0.78, "grad_norm": 0.15079536639779628, "kl": 4.765625, "learning_rate": 2.8066019966134907e-06, "loss": 0.1904, "reward": 2.041015625, "reward_std": 0.26434970647096634, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 1950 }, { "clip_ratio": 0.0, "completion_length": 746.125, "epoch": 0.7804, "grad_norm": 0.20357078873287707, "kl": 4.296875, "learning_rate": 2.796909751120931e-06, "loss": 0.1718, "reward": 1.982421875, "reward_std": 0.11273179948329926, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1951 }, { "clip_ratio": 0.0, "completion_length": 893.25, "epoch": 0.7808, "grad_norm": 0.5933580824738417, "kl": 4.6484375, "learning_rate": 2.7872315484213954e-06, "loss": 0.186, "reward": 2.138671875, "reward_std": 0.39373013377189636, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1952 }, { "clip_ratio": 0.0, "completion_length": 727.75, "epoch": 0.7812, "grad_norm": 0.4993826679772362, "kl": 4.90625, "learning_rate": 2.7775674073830337e-06, "loss": 0.1964, "reward": 1.939453125, "reward_std": 0.20257875323295593, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.970703125, "step": 1953 }, { "clip_ratio": 0.0, "completion_length": 657.5, "epoch": 0.7816, "grad_norm": 0.10204716233998912, "kl": 3.734375, "learning_rate": 2.7679173468465813e-06, "loss": 0.1496, "reward": 1.98828125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 1954 }, { "clip_ratio": 0.0, "completion_length": 811.5, "epoch": 0.782, "grad_norm": 0.1750016866545685, "kl": 4.5234375, "learning_rate": 2.7582813856253276e-06, "loss": 0.1808, "reward": 1.947265625, "reward_std": 0.2109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1955 }, { "clip_ratio": 0.0, "completion_length": 662.453125, "epoch": 0.7824, "grad_norm": 0.3461617731292701, "kl": 4.6640625, "learning_rate": 2.7486595425050667e-06, "loss": 0.1781, "reward": 2.07421875, "reward_std": 0.28559304773807526, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1956 }, { "clip_ratio": 0.0, "completion_length": 759.0, "epoch": 0.7828, "grad_norm": 0.2604465295207056, "kl": 4.765625, "learning_rate": 2.739051836244081e-06, "loss": 0.1905, "reward": 1.958984375, "reward_std": 0.13930703699588776, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.990234375, "step": 1957 }, { "clip_ratio": 0.0, "completion_length": 717.125, "epoch": 0.7832, "grad_norm": 0.5557622362256787, "kl": 4.2734375, "learning_rate": 2.7294582855730835e-06, "loss": 0.1708, "reward": 2.064453125, "reward_std": 0.15314127504825592, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 1958 }, { "clip_ratio": 0.0, "completion_length": 854.375, "epoch": 0.7836, "grad_norm": 0.16894889648775716, "kl": 5.015625, "learning_rate": 2.7198789091951903e-06, "loss": 0.2006, "reward": 1.92578125, "reward_std": 0.23575043678283691, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1959 }, { "clip_ratio": 0.0, "completion_length": 786.0, "epoch": 0.784, "grad_norm": 0.2901850855985791, "kl": 4.41796875, "learning_rate": 2.7103137257858867e-06, "loss": 0.1769, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1960 }, { "clip_ratio": 0.0, "completion_length": 755.25, "epoch": 0.7844, "grad_norm": 0.1391566061414827, "kl": 4.82421875, "learning_rate": 2.7007627539929847e-06, "loss": 0.193, "reward": 2.03515625, "reward_std": 0.23101628571748734, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1961 }, { "clip_ratio": 0.0, "completion_length": 799.75, "epoch": 0.7848, "grad_norm": 0.18651870870078785, "kl": 3.8125, "learning_rate": 2.6912260124366007e-06, "loss": 0.1524, "reward": 2.009765625, "reward_std": 0.2591850757598877, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1962 }, { "clip_ratio": 0.0, "completion_length": 780.0, "epoch": 0.7852, "grad_norm": 0.24989747118067593, "kl": 4.421875, "learning_rate": 2.6817035197090892e-06, "loss": 0.177, "reward": 2.033203125, "reward_std": 0.2270153984427452, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1963 }, { "clip_ratio": 0.0, "completion_length": 841.375, "epoch": 0.7856, "grad_norm": 0.2521358291104299, "kl": 4.265625, "learning_rate": 2.672195294375045e-06, "loss": 0.1707, "reward": 2.0234375, "reward_std": 0.2725568860769272, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 1964 }, { "clip_ratio": 0.0, "completion_length": 805.875, "epoch": 0.786, "grad_norm": 0.6796836955726092, "kl": 4.1953125, "learning_rate": 2.6627013549712355e-06, "loss": 0.168, "reward": 2.029296875, "reward_std": 0.28795240074396133, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1965 }, { "clip_ratio": 0.0, "completion_length": 782.5, "epoch": 0.7864, "grad_norm": 0.35203098279782585, "kl": 5.125, "learning_rate": 2.6532217200065856e-06, "loss": 0.2054, "reward": 2.087890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1966 }, { "clip_ratio": 0.0, "completion_length": 711.0, "epoch": 0.7868, "grad_norm": 0.20327468879828658, "kl": 4.234375, "learning_rate": 2.643756407962127e-06, "loss": 0.1694, "reward": 2.21875, "reward_std": 0.17033424973487854, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1967 }, { "clip_ratio": 0.0, "completion_length": 692.25, "epoch": 0.7872, "grad_norm": 0.26974651064350536, "kl": 4.3828125, "learning_rate": 2.634305437290968e-06, "loss": 0.1753, "reward": 2.078125, "reward_std": 0.11911680549383163, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1968 }, { "clip_ratio": 0.0, "completion_length": 794.796875, "epoch": 0.7876, "grad_norm": 0.29286455884554297, "kl": 4.5390625, "learning_rate": 2.624868826418262e-06, "loss": 0.1738, "reward": 1.974609375, "reward_std": 0.22047948837280273, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1969 }, { "clip_ratio": 0.0, "completion_length": 864.578125, "epoch": 0.788, "grad_norm": 0.3433997179090418, "kl": 5.0703125, "learning_rate": 2.615446593741161e-06, "loss": 0.2008, "reward": 2.05859375, "reward_std": 0.2782595455646515, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1970 }, { "clip_ratio": 0.0, "completion_length": 892.0, "epoch": 0.7884, "grad_norm": 0.18280294222797577, "kl": 5.1875, "learning_rate": 2.6060387576287983e-06, "loss": 0.2075, "reward": 1.931640625, "reward_std": 0.2109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.978515625, "step": 1971 }, { "clip_ratio": 0.0, "completion_length": 779.125, "epoch": 0.7888, "grad_norm": 0.2708029501109588, "kl": 4.5625, "learning_rate": 2.596645336422219e-06, "loss": 0.1825, "reward": 2.08203125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1972 }, { "clip_ratio": 0.0, "completion_length": 779.875, "epoch": 0.7892, "grad_norm": 0.37548089500460724, "kl": 4.93359375, "learning_rate": 2.5872663484343887e-06, "loss": 0.1975, "reward": 1.935546875, "reward_std": 0.1989101767539978, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.982421875, "step": 1973 }, { "clip_ratio": 0.0, "completion_length": 727.625, "epoch": 0.7896, "grad_norm": 0.24183422466902663, "kl": 4.9453125, "learning_rate": 2.577901811950121e-06, "loss": 0.1979, "reward": 1.974609375, "reward_std": 0.2457016110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1974 }, { "clip_ratio": 0.0, "completion_length": 630.875, "epoch": 0.79, "grad_norm": 0.1240151402751845, "kl": 4.40625, "learning_rate": 2.5685517452260566e-06, "loss": 0.1763, "reward": 2.1953125, "reward_std": 0.12233919650316238, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 1975 }, { "clip_ratio": 0.0, "completion_length": 772.5, "epoch": 0.7904, "grad_norm": 0.2798580879706794, "kl": 4.09375, "learning_rate": 2.5592161664906366e-06, "loss": 0.1636, "reward": 1.958984375, "reward_std": 0.14212024956941605, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.990234375, "step": 1976 }, { "clip_ratio": 0.0, "completion_length": 776.75, "epoch": 0.7908, "grad_norm": 0.11102500759589913, "kl": 4.640625, "learning_rate": 2.549895093944039e-06, "loss": 0.1861, "reward": 1.955078125, "reward_std": 0.1796875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.978515625, "step": 1977 }, { "clip_ratio": 0.0, "completion_length": 833.25, "epoch": 0.7912, "grad_norm": 0.12283972482555096, "kl": 4.71875, "learning_rate": 2.5405885457581793e-06, "loss": 0.1885, "reward": 2.07421875, "reward_std": 0.203125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1978 }, { "clip_ratio": 0.0, "completion_length": 826.5, "epoch": 0.7916, "grad_norm": 0.3930472761221155, "kl": 4.828125, "learning_rate": 2.5312965400766475e-06, "loss": 0.1929, "reward": 1.947265625, "reward_std": 0.2178204506635666, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1979 }, { "clip_ratio": 0.0, "completion_length": 650.375, "epoch": 0.792, "grad_norm": 0.4312484680509116, "kl": 4.765625, "learning_rate": 2.522019095014683e-06, "loss": 0.1909, "reward": 2.19921875, "reward_std": 0.15137271583080292, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1980 }, { "clip_ratio": 0.0, "completion_length": 908.75, "epoch": 0.7924, "grad_norm": 0.4006595119558388, "kl": 4.6640625, "learning_rate": 2.512756228659141e-06, "loss": 0.1864, "reward": 1.8984375, "reward_std": 0.35507817566394806, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.9609375, "step": 1981 }, { "clip_ratio": 0.0, "completion_length": 753.0, "epoch": 0.7928, "grad_norm": 0.28689765926822697, "kl": 4.5078125, "learning_rate": 2.5035079590684496e-06, "loss": 0.1805, "reward": 1.947265625, "reward_std": 0.17627985030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1982 }, { "clip_ratio": 0.0, "completion_length": 769.0, "epoch": 0.7932, "grad_norm": 0.17248669603611688, "kl": 4.7734375, "learning_rate": 2.494274304272589e-06, "loss": 0.1907, "reward": 1.9609375, "reward_std": 0.1900520622730255, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1983 }, { "clip_ratio": 0.0, "completion_length": 884.75, "epoch": 0.7936, "grad_norm": 0.22398351595342403, "kl": 5.203125, "learning_rate": 2.48505528227304e-06, "loss": 0.2079, "reward": 1.927734375, "reward_std": 0.2890625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.966796875, "step": 1984 }, { "clip_ratio": 0.0, "completion_length": 708.375, "epoch": 0.794, "grad_norm": 2.6624909574732833, "kl": 4.4609375, "learning_rate": 2.4758509110427576e-06, "loss": 0.1785, "reward": 2.0703125, "reward_std": 0.14943470060825348, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1985 }, { "clip_ratio": 0.0, "completion_length": 921.625, "epoch": 0.7944, "grad_norm": 0.4059279298933321, "kl": 5.3046875, "learning_rate": 2.4666612085261344e-06, "loss": 0.212, "reward": 1.91015625, "reward_std": 0.3224448561668396, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96484375, "step": 1986 }, { "clip_ratio": 0.0, "completion_length": 797.25, "epoch": 0.7948, "grad_norm": 0.44256969078155733, "kl": 4.3359375, "learning_rate": 2.4574861926389615e-06, "loss": 0.1735, "reward": 2.013671875, "reward_std": 0.22791431099176407, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1987 }, { "clip_ratio": 0.0, "completion_length": 564.625, "epoch": 0.7952, "grad_norm": 0.16781722008940966, "kl": 3.50390625, "learning_rate": 2.4483258812684096e-06, "loss": 0.1401, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1988 }, { "clip_ratio": 0.0, "completion_length": 832.75, "epoch": 0.7956, "grad_norm": 0.19029374367996796, "kl": 5.0546875, "learning_rate": 2.4391802922729703e-06, "loss": 0.2024, "reward": 1.97265625, "reward_std": 0.3160141110420227, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.97265625, "step": 1989 }, { "clip_ratio": 0.0, "completion_length": 767.875, "epoch": 0.796, "grad_norm": 0.7674990722776439, "kl": 4.78125, "learning_rate": 2.4300494434824373e-06, "loss": 0.1908, "reward": 2.193359375, "reward_std": 0.19190484285354614, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 1990 }, { "clip_ratio": 0.0, "completion_length": 736.625, "epoch": 0.7964, "grad_norm": 4.07615704558016, "kl": 4.4296875, "learning_rate": 2.420933352697865e-06, "loss": 0.1771, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1991 }, { "clip_ratio": 0.0, "completion_length": 713.5, "epoch": 0.7968, "grad_norm": 0.29775363351890277, "kl": 4.09375, "learning_rate": 2.411832037691545e-06, "loss": 0.164, "reward": 2.07421875, "reward_std": 0.1917647123336792, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.97265625, "step": 1992 }, { "clip_ratio": 0.0, "completion_length": 905.0, "epoch": 0.7972, "grad_norm": 0.20835173127662532, "kl": 4.6796875, "learning_rate": 2.4027455162069567e-06, "loss": 0.187, "reward": 2.080078125, "reward_std": 0.3760072663426399, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.962890625, "step": 1993 }, { "clip_ratio": 0.0, "completion_length": 856.1796875, "epoch": 0.7976, "grad_norm": 0.3270717667182192, "kl": 4.90625, "learning_rate": 2.3936738059587284e-06, "loss": 0.1858, "reward": 1.970703125, "reward_std": 0.33366432785987854, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.962890625, "step": 1994 }, { "clip_ratio": 0.0, "completion_length": 907.625, "epoch": 0.798, "grad_norm": 0.15822559880335735, "kl": 5.0, "learning_rate": 2.3846169246326345e-06, "loss": 0.2004, "reward": 1.9375, "reward_std": 0.25, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 1995 }, { "clip_ratio": 0.0, "completion_length": 655.625, "epoch": 0.7984, "grad_norm": 0.27914084115048265, "kl": 4.25390625, "learning_rate": 2.37557488988552e-06, "loss": 0.17, "reward": 2.15234375, "reward_std": 0.13402669876813889, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 1996 }, { "clip_ratio": 0.0, "completion_length": 739.625, "epoch": 0.7988, "grad_norm": 0.10617750382665714, "kl": 3.90234375, "learning_rate": 2.3665477193453037e-06, "loss": 0.1562, "reward": 2.126953125, "reward_std": 0.1761064976453781, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1997 }, { "clip_ratio": 0.0, "completion_length": 773.75, "epoch": 0.7992, "grad_norm": 0.1288838051923278, "kl": 4.5078125, "learning_rate": 2.35753543061091e-06, "loss": 0.1805, "reward": 1.95703125, "reward_std": 0.13226625323295593, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1998 }, { "clip_ratio": 0.0, "completion_length": 824.375, "epoch": 0.7996, "grad_norm": 0.8678811603702102, "kl": 4.3125, "learning_rate": 2.3485380412522586e-06, "loss": 0.1728, "reward": 2.072265625, "reward_std": 0.2109375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1999 }, { "clip_ratio": 0.0, "completion_length": 599.75, "epoch": 0.8, "grad_norm": 0.131525797388203, "kl": 3.921875, "learning_rate": 2.339555568810221e-06, "loss": 0.1571, "reward": 2.0078125, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 859.25, "epoch": 0.8004, "grad_norm": 0.19822470233414913, "kl": 4.70703125, "learning_rate": 2.3305880307965834e-06, "loss": 0.1885, "reward": 2.095703125, "reward_std": 0.2803569808602333, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2001 }, { "clip_ratio": 0.0, "completion_length": 771.625, "epoch": 0.8008, "grad_norm": 0.22660982316346684, "kl": 4.6328125, "learning_rate": 2.321635444694028e-06, "loss": 0.1852, "reward": 2.11328125, "reward_std": 0.23101628571748734, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2002 }, { "clip_ratio": 0.0, "completion_length": 678.125, "epoch": 0.8012, "grad_norm": 0.31017888365827867, "kl": 3.84375, "learning_rate": 2.3126978279560687e-06, "loss": 0.1539, "reward": 1.984375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2003 }, { "clip_ratio": 0.0, "completion_length": 565.125, "epoch": 0.8016, "grad_norm": 0.23165315656736957, "kl": 3.6796875, "learning_rate": 2.3037751980070557e-06, "loss": 0.1473, "reward": 1.978515625, "reward_std": 0.058983080089092255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 2004 }, { "clip_ratio": 0.0, "completion_length": 707.875, "epoch": 0.802, "grad_norm": 0.22191108912634655, "kl": 4.6484375, "learning_rate": 2.2948675722421086e-06, "loss": 0.1859, "reward": 2.115234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 2005 }, { "clip_ratio": 0.0, "completion_length": 831.0, "epoch": 0.8024, "grad_norm": 0.30697215122417326, "kl": 4.1953125, "learning_rate": 2.2859749680270983e-06, "loss": 0.1677, "reward": 2.02734375, "reward_std": 0.28957879543304443, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 2006 }, { "clip_ratio": 0.0, "completion_length": 716.875, "epoch": 0.8028, "grad_norm": 0.5180012832871069, "kl": 4.3203125, "learning_rate": 2.277097402698619e-06, "loss": 0.1729, "reward": 1.982421875, "reward_std": 0.0703125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.998046875, "step": 2007 }, { "clip_ratio": 0.0, "completion_length": 660.625, "epoch": 0.8032, "grad_norm": 0.11468716523185128, "kl": 3.8984375, "learning_rate": 2.2682348935639274e-06, "loss": 0.1561, "reward": 1.98828125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2008 }, { "clip_ratio": 0.0, "completion_length": 675.125, "epoch": 0.8036, "grad_norm": 0.14097893668867215, "kl": 3.88671875, "learning_rate": 2.259387457900948e-06, "loss": 0.1555, "reward": 1.974609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2009 }, { "clip_ratio": 0.0, "completion_length": 661.25, "epoch": 0.804, "grad_norm": 0.3436271897928858, "kl": 4.00390625, "learning_rate": 2.2505551129582047e-06, "loss": 0.16, "reward": 2.16015625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2010 }, { "clip_ratio": 0.0, "completion_length": 735.875, "epoch": 0.8044, "grad_norm": 0.1707910634617492, "kl": 4.203125, "learning_rate": 2.241737875954808e-06, "loss": 0.1682, "reward": 1.9765625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2011 }, { "clip_ratio": 0.0, "completion_length": 749.625, "epoch": 0.8048, "grad_norm": 0.2914207251045679, "kl": 4.0078125, "learning_rate": 2.2329357640804118e-06, "loss": 0.16, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2012 }, { "clip_ratio": 0.0, "completion_length": 724.375, "epoch": 0.8052, "grad_norm": 0.12337897591020607, "kl": 4.6015625, "learning_rate": 2.22414879449518e-06, "loss": 0.184, "reward": 1.97265625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2013 }, { "clip_ratio": 0.0, "completion_length": 731.75, "epoch": 0.8056, "grad_norm": 0.17255251728958482, "kl": 4.421875, "learning_rate": 2.215376984329767e-06, "loss": 0.1769, "reward": 2.0234375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2014 }, { "clip_ratio": 0.0, "completion_length": 737.0, "epoch": 0.806, "grad_norm": 0.11791437690806429, "kl": 4.5625, "learning_rate": 2.206620350685257e-06, "loss": 0.1822, "reward": 1.958984375, "reward_std": 0.12445375323295593, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2015 }, { "clip_ratio": 0.0, "completion_length": 842.875, "epoch": 0.8064, "grad_norm": 0.15339954271723857, "kl": 4.90625, "learning_rate": 2.1978789106331666e-06, "loss": 0.1962, "reward": 2.05078125, "reward_std": 0.2374618873000145, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2016 }, { "clip_ratio": 0.0, "completion_length": 893.25, "epoch": 0.8068, "grad_norm": 0.2978122346925805, "kl": 4.36328125, "learning_rate": 2.1891526812153674e-06, "loss": 0.175, "reward": 2.142578125, "reward_std": 0.23805078864097595, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2017 }, { "clip_ratio": 0.0, "completion_length": 718.875, "epoch": 0.8072, "grad_norm": 0.06492343429767629, "kl": 4.625, "learning_rate": 2.1804416794441e-06, "loss": 0.1847, "reward": 2.111328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2018 }, { "clip_ratio": 0.0, "completion_length": 819.125, "epoch": 0.8076, "grad_norm": 0.24531212655685472, "kl": 4.7890625, "learning_rate": 2.171745922301903e-06, "loss": 0.1914, "reward": 2.044921875, "reward_std": 0.2532804012298584, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 2019 }, { "clip_ratio": 0.0, "completion_length": 643.7734375, "epoch": 0.808, "grad_norm": 0.18982050824112698, "kl": 3.58984375, "learning_rate": 2.163065426741603e-06, "loss": 0.1344, "reward": 2.021484375, "reward_std": 0.09496419876813889, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 2020 }, { "clip_ratio": 0.0, "completion_length": 748.875, "epoch": 0.8084, "grad_norm": 0.29470078577253345, "kl": 4.140625, "learning_rate": 2.154400209686268e-06, "loss": 0.1657, "reward": 2.01953125, "reward_std": 0.26716843992471695, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2021 }, { "clip_ratio": 0.0, "completion_length": 796.375, "epoch": 0.8088, "grad_norm": 0.16435236036338396, "kl": 4.6328125, "learning_rate": 2.1457502880291815e-06, "loss": 0.1849, "reward": 1.958984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2022 }, { "clip_ratio": 0.0, "completion_length": 821.125, "epoch": 0.8092, "grad_norm": 0.19457653124565816, "kl": 3.83984375, "learning_rate": 2.1371156786338108e-06, "loss": 0.1537, "reward": 2.025390625, "reward_std": 0.22061938047409058, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.978515625, "step": 2023 }, { "clip_ratio": 0.0, "completion_length": 810.375, "epoch": 0.8096, "grad_norm": 0.2517997636364008, "kl": 4.05078125, "learning_rate": 2.128496398333768e-06, "loss": 0.162, "reward": 2.068359375, "reward_std": 0.14376502484083176, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2024 }, { "clip_ratio": 0.0, "completion_length": 635.125, "epoch": 0.81, "grad_norm": 0.09359691534389657, "kl": 3.59375, "learning_rate": 2.119892463932781e-06, "loss": 0.144, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2025 }, { "clip_ratio": 0.0, "completion_length": 848.75, "epoch": 0.8104, "grad_norm": 0.20861431766597213, "kl": 4.26953125, "learning_rate": 2.1113038922046603e-06, "loss": 0.1708, "reward": 1.9921875, "reward_std": 0.13644562661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2026 }, { "clip_ratio": 0.0, "completion_length": 832.125, "epoch": 0.8108, "grad_norm": 0.3726138773621635, "kl": 4.5234375, "learning_rate": 2.102730699893263e-06, "loss": 0.1811, "reward": 2.0390625, "reward_std": 0.2587667405605316, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 2027 }, { "clip_ratio": 0.0, "completion_length": 839.0, "epoch": 0.8112, "grad_norm": 0.49911562335446596, "kl": 4.66015625, "learning_rate": 2.09417290371247e-06, "loss": 0.1866, "reward": 2.0703125, "reward_std": 0.22877176105976105, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 2028 }, { "clip_ratio": 0.0, "completion_length": 793.875, "epoch": 0.8116, "grad_norm": 0.3171428145081578, "kl": 4.109375, "learning_rate": 2.0856305203461436e-06, "loss": 0.1642, "reward": 2.212890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2029 }, { "clip_ratio": 0.0, "completion_length": 761.875, "epoch": 0.812, "grad_norm": 0.3252553721054124, "kl": 4.8125, "learning_rate": 2.0771035664480944e-06, "loss": 0.1925, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2030 }, { "clip_ratio": 0.0, "completion_length": 743.5, "epoch": 0.8124, "grad_norm": 4.246064774917766, "kl": 4.1640625, "learning_rate": 2.0685920586420562e-06, "loss": 0.1667, "reward": 2.23828125, "reward_std": 0.2289872094988823, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2031 }, { "clip_ratio": 0.0, "completion_length": 758.5, "epoch": 0.8128, "grad_norm": 0.07929129946203921, "kl": 4.26953125, "learning_rate": 2.0600960135216463e-06, "loss": 0.1706, "reward": 1.9453125, "reward_std": 0.14987194538116455, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2032 }, { "clip_ratio": 0.0, "completion_length": 882.0390625, "epoch": 0.8132, "grad_norm": 0.5338428546092782, "kl": 4.82421875, "learning_rate": 2.051615447650347e-06, "loss": 0.1843, "reward": 1.904296875, "reward_std": 0.322608582675457, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.966796875, "step": 2033 }, { "clip_ratio": 0.0, "completion_length": 679.125, "epoch": 0.8136, "grad_norm": 0.20392365287756153, "kl": 4.265625, "learning_rate": 2.0431503775614457e-06, "loss": 0.1706, "reward": 2.2890625, "reward_std": 0.059839196503162384, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2034 }, { "clip_ratio": 0.0, "completion_length": 785.578125, "epoch": 0.814, "grad_norm": 0.17519793661950658, "kl": 3.19921875, "learning_rate": 2.0347008197580376e-06, "loss": 0.123, "reward": 1.908203125, "reward_std": 0.2310810387134552, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2035 }, { "clip_ratio": 0.0, "completion_length": 832.5, "epoch": 0.8144, "grad_norm": 0.24769513818256997, "kl": 4.44921875, "learning_rate": 2.026266790712965e-06, "loss": 0.1782, "reward": 2.109375, "reward_std": 0.24664128571748734, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2036 }, { "clip_ratio": 0.0, "completion_length": 895.0, "epoch": 0.8148, "grad_norm": 0.22499292817129063, "kl": 4.08203125, "learning_rate": 2.017848306868797e-06, "loss": 0.1636, "reward": 1.935546875, "reward_std": 0.30492153018713, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 2037 }, { "clip_ratio": 0.0, "completion_length": 591.25, "epoch": 0.8152, "grad_norm": 0.3513561759776618, "kl": 3.79296875, "learning_rate": 2.009445384637805e-06, "loss": 0.1518, "reward": 2.203125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2038 }, { "clip_ratio": 0.0, "completion_length": 819.125, "epoch": 0.8156, "grad_norm": 0.1885134168239755, "kl": 3.96875, "learning_rate": 2.0010580404019066e-06, "loss": 0.1589, "reward": 1.9609375, "reward_std": 0.1275520622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2039 }, { "clip_ratio": 0.0, "completion_length": 724.75, "epoch": 0.816, "grad_norm": 0.1412904875276072, "kl": 4.375, "learning_rate": 1.9926862905126663e-06, "loss": 0.1749, "reward": 1.98046875, "reward_std": 0.11945747584104538, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2040 }, { "clip_ratio": 0.0, "completion_length": 828.125, "epoch": 0.8164, "grad_norm": 0.25368223342029, "kl": 4.75, "learning_rate": 1.984330151291233e-06, "loss": 0.1897, "reward": 2.025390625, "reward_std": 0.21916363388299942, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2041 }, { "clip_ratio": 0.0, "completion_length": 822.875, "epoch": 0.8168, "grad_norm": 0.5160214377899434, "kl": 4.33984375, "learning_rate": 1.9759896390283362e-06, "loss": 0.1735, "reward": 1.9296875, "reward_std": 0.2443198561668396, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2042 }, { "clip_ratio": 0.0, "completion_length": 772.375, "epoch": 0.8172, "grad_norm": 0.17874119638826738, "kl": 4.62890625, "learning_rate": 1.9676647699842246e-06, "loss": 0.1852, "reward": 2.087890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2043 }, { "clip_ratio": 0.0, "completion_length": 676.625, "epoch": 0.8176, "grad_norm": 0.9987465069258681, "kl": 4.0234375, "learning_rate": 1.959355560388654e-06, "loss": 0.1611, "reward": 2.201171875, "reward_std": 0.20409732311964035, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2044 }, { "clip_ratio": 0.0, "completion_length": 696.375, "epoch": 0.818, "grad_norm": 0.7210518164793026, "kl": 3.859375, "learning_rate": 1.95106202644086e-06, "loss": 0.1545, "reward": 2.244140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2045 }, { "clip_ratio": 0.0, "completion_length": 778.71875, "epoch": 0.8184, "grad_norm": 4.339730747579831, "kl": 4.1171875, "learning_rate": 1.9427841843095063e-06, "loss": 0.1558, "reward": 1.94140625, "reward_std": 0.20742058008909225, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.98046875, "step": 2046 }, { "clip_ratio": 0.0, "completion_length": 743.875, "epoch": 0.8188, "grad_norm": 0.16700390938468043, "kl": 3.96484375, "learning_rate": 1.934522050132678e-06, "loss": 0.1586, "reward": 2.01171875, "reward_std": 0.21039338409900665, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2047 }, { "clip_ratio": 0.0, "completion_length": 707.625, "epoch": 0.8192, "grad_norm": 0.34316566879050137, "kl": 3.890625, "learning_rate": 1.9262756400178163e-06, "loss": 0.1555, "reward": 2.23828125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2048 }, { "clip_ratio": 0.0, "completion_length": 745.0, "epoch": 0.8196, "grad_norm": 0.2987022741735225, "kl": 3.9296875, "learning_rate": 1.918044970041729e-06, "loss": 0.1572, "reward": 1.9609375, "reward_std": 0.12430208921432495, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2049 }, { "clip_ratio": 0.0, "completion_length": 750.125, "epoch": 0.82, "grad_norm": 0.192853705339046, "kl": 4.265625, "learning_rate": 1.9098300562505266e-06, "loss": 0.1708, "reward": 1.96484375, "reward_std": 0.14643365889787674, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2050 }, { "clip_ratio": 0.0, "completion_length": 725.875, "epoch": 0.8204, "grad_norm": 0.08981885714874566, "kl": 4.171875, "learning_rate": 1.9016309146596024e-06, "loss": 0.1667, "reward": 1.984375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2051 }, { "clip_ratio": 0.0, "completion_length": 769.375, "epoch": 0.8208, "grad_norm": 0.3878749266532329, "kl": 5.0234375, "learning_rate": 1.8934475612536019e-06, "loss": 0.2007, "reward": 1.955078125, "reward_std": 0.20839616656303406, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2052 }, { "clip_ratio": 0.0, "completion_length": 729.0, "epoch": 0.8212, "grad_norm": 3.988748708280009, "kl": 4.1484375, "learning_rate": 1.8852800119863912e-06, "loss": 0.1659, "reward": 1.93359375, "reward_std": 0.2052055522799492, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 2053 }, { "clip_ratio": 0.0, "completion_length": 782.625, "epoch": 0.8216, "grad_norm": 0.22731298948736572, "kl": 4.54296875, "learning_rate": 1.8771282827810278e-06, "loss": 0.1816, "reward": 2.21484375, "reward_std": 0.140625, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 2054 }, { "clip_ratio": 0.0, "completion_length": 955.625, "epoch": 0.822, "grad_norm": 0.3517489216557336, "kl": 5.1640625, "learning_rate": 1.8689923895297247e-06, "loss": 0.2062, "reward": 1.951171875, "reward_std": 0.34945328533649445, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 2055 }, { "clip_ratio": 0.0, "completion_length": 804.625, "epoch": 0.8224, "grad_norm": 0.13671325411651847, "kl": 4.53125, "learning_rate": 1.8608723480938207e-06, "loss": 0.1811, "reward": 1.947265625, "reward_std": 0.1808355376124382, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2056 }, { "clip_ratio": 0.0, "completion_length": 857.75, "epoch": 0.8228, "grad_norm": 0.08883142304827836, "kl": 4.71875, "learning_rate": 1.8527681743037518e-06, "loss": 0.1888, "reward": 2.083984375, "reward_std": 0.17551349848508835, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2057 }, { "clip_ratio": 0.0, "completion_length": 794.75, "epoch": 0.8232, "grad_norm": 0.9404407655518411, "kl": 4.9765625, "learning_rate": 1.8446798839590186e-06, "loss": 0.1994, "reward": 2.0, "reward_std": 0.24586329609155655, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2058 }, { "clip_ratio": 0.0, "completion_length": 818.875, "epoch": 0.8236, "grad_norm": 0.4822917089058684, "kl": 4.65625, "learning_rate": 1.8366074928281608e-06, "loss": 0.1863, "reward": 1.955078125, "reward_std": 0.1796875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.978515625, "step": 2059 }, { "clip_ratio": 0.0, "completion_length": 886.875, "epoch": 0.824, "grad_norm": 0.6331837103917073, "kl": 4.84375, "learning_rate": 1.8285510166487154e-06, "loss": 0.1938, "reward": 1.943359375, "reward_std": 0.16237766295671463, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 2060 }, { "clip_ratio": 0.0, "completion_length": 909.625, "epoch": 0.8244, "grad_norm": 0.3568299018292939, "kl": 4.515625, "learning_rate": 1.820510471127196e-06, "loss": 0.1806, "reward": 2.05078125, "reward_std": 0.296875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96484375, "step": 2061 }, { "clip_ratio": 0.0, "completion_length": 664.0, "epoch": 0.8248, "grad_norm": 1.3354751529074786, "kl": 4.140625, "learning_rate": 1.812485871939056e-06, "loss": 0.1656, "reward": 1.95703125, "reward_std": 0.09599344432353973, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2062 }, { "clip_ratio": 0.0, "completion_length": 795.75, "epoch": 0.8252, "grad_norm": 1.0559109033780956, "kl": 4.5234375, "learning_rate": 1.804477234728661e-06, "loss": 0.181, "reward": 2.208984375, "reward_std": 0.1271323561668396, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2063 }, { "clip_ratio": 0.0, "completion_length": 866.375, "epoch": 0.8256, "grad_norm": 0.3014080410476618, "kl": 5.3828125, "learning_rate": 1.7964845751092663e-06, "loss": 0.2152, "reward": 2.05859375, "reward_std": 0.265625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 2064 }, { "clip_ratio": 0.0, "completion_length": 668.375, "epoch": 0.826, "grad_norm": 0.23926488679729407, "kl": 4.328125, "learning_rate": 1.7885079086629598e-06, "loss": 0.1735, "reward": 2.064453125, "reward_std": 0.19542870670557022, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 2065 }, { "clip_ratio": 0.0, "completion_length": 778.0, "epoch": 0.8264, "grad_norm": 0.16473327525309203, "kl": 3.98046875, "learning_rate": 1.7805472509406695e-06, "loss": 0.1593, "reward": 1.94140625, "reward_std": 0.1974448561668396, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.97265625, "step": 2066 }, { "clip_ratio": 0.0, "completion_length": 760.5, "epoch": 0.8268, "grad_norm": 0.47862673213360274, "kl": 4.046875, "learning_rate": 1.7726026174621004e-06, "loss": 0.1623, "reward": 2.1015625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2067 }, { "clip_ratio": 0.0, "completion_length": 718.25, "epoch": 0.8272, "grad_norm": 0.2434773596150226, "kl": 3.7421875, "learning_rate": 1.7646740237157256e-06, "loss": 0.1494, "reward": 2.13671875, "reward_std": 0.20379015058279037, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2068 }, { "clip_ratio": 0.0, "completion_length": 670.0, "epoch": 0.8276, "grad_norm": 0.22613092112175343, "kl": 3.63671875, "learning_rate": 1.7567614851587444e-06, "loss": 0.1455, "reward": 2.072265625, "reward_std": 0.14336346089839935, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2069 }, { "clip_ratio": 0.0, "completion_length": 649.625, "epoch": 0.828, "grad_norm": 0.37357643353257164, "kl": 4.0234375, "learning_rate": 1.7488650172170496e-06, "loss": 0.1611, "reward": 2.01171875, "reward_std": 0.07295385748147964, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2070 }, { "clip_ratio": 0.0, "completion_length": 800.0, "epoch": 0.8284, "grad_norm": 0.16993932177505455, "kl": 3.60546875, "learning_rate": 1.7409846352852144e-06, "loss": 0.1443, "reward": 2.119140625, "reward_std": 0.20259655267000198, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.978515625, "step": 2071 }, { "clip_ratio": 0.0, "completion_length": 911.125, "epoch": 0.8288, "grad_norm": 0.8637423982669769, "kl": 4.3828125, "learning_rate": 1.7331203547264452e-06, "loss": 0.1752, "reward": 2.05859375, "reward_std": 0.39042720943689346, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.95703125, "step": 2072 }, { "clip_ratio": 0.0, "completion_length": 859.375, "epoch": 0.8292, "grad_norm": 0.5436740942368313, "kl": 4.484375, "learning_rate": 1.7252721908725633e-06, "loss": 0.1792, "reward": 2.1640625, "reward_std": 0.23167353868484497, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9609375, "step": 2073 }, { "clip_ratio": 0.0, "completion_length": 735.375, "epoch": 0.8296, "grad_norm": 0.21503853338717813, "kl": 4.31640625, "learning_rate": 1.7174401590239587e-06, "loss": 0.1728, "reward": 2.138671875, "reward_std": 0.13687361031770706, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2074 }, { "clip_ratio": 0.0, "completion_length": 702.875, "epoch": 0.83, "grad_norm": 0.2328151523853601, "kl": 3.98828125, "learning_rate": 1.709624274449584e-06, "loss": 0.1591, "reward": 2.2265625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2075 }, { "clip_ratio": 0.0, "completion_length": 815.625, "epoch": 0.8304, "grad_norm": 0.5392322704536573, "kl": 3.69921875, "learning_rate": 1.7018245523869038e-06, "loss": 0.1481, "reward": 1.962890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2076 }, { "clip_ratio": 0.0, "completion_length": 770.625, "epoch": 0.8308, "grad_norm": 0.18916728579209352, "kl": 4.4296875, "learning_rate": 1.6940410080418723e-06, "loss": 0.1773, "reward": 2.0546875, "reward_std": 0.2443198561668396, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2077 }, { "clip_ratio": 0.0, "completion_length": 741.625, "epoch": 0.8312, "grad_norm": 0.269569255262724, "kl": 3.59765625, "learning_rate": 1.686273656588917e-06, "loss": 0.1439, "reward": 1.962890625, "reward_std": 0.12148308008909225, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2078 }, { "clip_ratio": 0.0, "completion_length": 742.5, "epoch": 0.8316, "grad_norm": 0.22388612435748823, "kl": 4.0390625, "learning_rate": 1.6785225131708749e-06, "loss": 0.1615, "reward": 1.9765625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2079 }, { "clip_ratio": 0.0, "completion_length": 696.8125, "epoch": 0.832, "grad_norm": 0.20885474286809527, "kl": 3.5625, "learning_rate": 1.6707875928990059e-06, "loss": 0.1404, "reward": 1.9765625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2080 }, { "clip_ratio": 0.0, "completion_length": 677.375, "epoch": 0.8324, "grad_norm": 0.3124859234268774, "kl": 4.265625, "learning_rate": 1.6630689108529286e-06, "loss": 0.1704, "reward": 2.390625, "reward_std": 0.11840169876813889, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2081 }, { "clip_ratio": 0.0, "completion_length": 708.25, "epoch": 0.8328, "grad_norm": 0.05839235819424436, "kl": 3.8671875, "learning_rate": 1.6553664820806102e-06, "loss": 0.1548, "reward": 1.984375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2082 }, { "clip_ratio": 0.0, "completion_length": 725.7734375, "epoch": 0.8332, "grad_norm": 0.331813820838429, "kl": 4.0859375, "learning_rate": 1.6476803215983295e-06, "loss": 0.1554, "reward": 2.095703125, "reward_std": 0.15273308008909225, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2083 }, { "clip_ratio": 0.0, "completion_length": 902.5, "epoch": 0.8336, "grad_norm": 0.37481555042362474, "kl": 4.7890625, "learning_rate": 1.6400104443906463e-06, "loss": 0.1916, "reward": 2.095703125, "reward_std": 0.2734375, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2084 }, { "clip_ratio": 0.0, "completion_length": 850.125, "epoch": 0.834, "grad_norm": 0.24075559679204536, "kl": 4.16796875, "learning_rate": 1.6323568654103838e-06, "loss": 0.1667, "reward": 1.96875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2085 }, { "clip_ratio": 0.0, "completion_length": 699.125, "epoch": 0.8344, "grad_norm": 0.16854854866939842, "kl": 4.2734375, "learning_rate": 1.6247195995785836e-06, "loss": 0.1711, "reward": 1.99609375, "reward_std": 0.078125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2086 }, { "clip_ratio": 0.0, "completion_length": 890.875, "epoch": 0.8348, "grad_norm": 0.35190108434303974, "kl": 4.390625, "learning_rate": 1.6170986617844864e-06, "loss": 0.1756, "reward": 2.08203125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2087 }, { "clip_ratio": 0.0, "completion_length": 777.25, "epoch": 0.8352, "grad_norm": 0.10538475666473593, "kl": 3.8828125, "learning_rate": 1.6094940668855008e-06, "loss": 0.1557, "reward": 2.142578125, "reward_std": 0.08382828533649445, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2088 }, { "clip_ratio": 0.0, "completion_length": 713.25, "epoch": 0.8356, "grad_norm": 0.2753322440793642, "kl": 3.390625, "learning_rate": 1.601905829707171e-06, "loss": 0.1357, "reward": 2.111328125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 2089 }, { "clip_ratio": 0.0, "completion_length": 791.375, "epoch": 0.836, "grad_norm": 2.8640203359018686, "kl": 3.8515625, "learning_rate": 1.5943339650431578e-06, "loss": 0.1535, "reward": 2.056640625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 2090 }, { "clip_ratio": 0.0, "completion_length": 718.875, "epoch": 0.8364, "grad_norm": 1.759504744083046, "kl": 3.96484375, "learning_rate": 1.5867784876551973e-06, "loss": 0.1588, "reward": 2.0859375, "reward_std": 0.16364938765764236, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2091 }, { "clip_ratio": 0.0, "completion_length": 846.25, "epoch": 0.8368, "grad_norm": 0.5594033328917418, "kl": 4.34765625, "learning_rate": 1.579239412273078e-06, "loss": 0.1739, "reward": 2.05859375, "reward_std": 0.3519627973437309, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2092 }, { "clip_ratio": 0.0, "completion_length": 776.7578125, "epoch": 0.8372, "grad_norm": 0.3698401705086603, "kl": 4.0078125, "learning_rate": 1.5717167535946142e-06, "loss": 0.1454, "reward": 1.94921875, "reward_std": 0.16933366656303406, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2093 }, { "clip_ratio": 0.0, "completion_length": 867.125, "epoch": 0.8376, "grad_norm": 0.12776379758608972, "kl": 4.21484375, "learning_rate": 1.5642105262856122e-06, "loss": 0.1684, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2094 }, { "clip_ratio": 0.0, "completion_length": 804.25, "epoch": 0.838, "grad_norm": 0.16333858879845578, "kl": 3.65625, "learning_rate": 1.5567207449798517e-06, "loss": 0.1466, "reward": 1.958984375, "reward_std": 0.12445375323295593, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2095 }, { "clip_ratio": 0.0, "completion_length": 732.125, "epoch": 0.8384, "grad_norm": 0.3399480386683632, "kl": 3.421875, "learning_rate": 1.5492474242790368e-06, "loss": 0.1367, "reward": 2.126953125, "reward_std": 0.11435256898403168, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 2096 }, { "clip_ratio": 0.0, "completion_length": 880.625, "epoch": 0.8388, "grad_norm": 0.3097886805432668, "kl": 4.0, "learning_rate": 1.5417905787527943e-06, "loss": 0.1602, "reward": 1.9375, "reward_std": 0.2198980376124382, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 2097 }, { "clip_ratio": 0.0, "completion_length": 971.0, "epoch": 0.8392, "grad_norm": 0.2262970100724011, "kl": 3.953125, "learning_rate": 1.5343502229386209e-06, "loss": 0.1581, "reward": 1.919921875, "reward_std": 0.3123341202735901, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2098 }, { "clip_ratio": 0.0, "completion_length": 850.125, "epoch": 0.8396, "grad_norm": 0.1998832633649392, "kl": 4.078125, "learning_rate": 1.526926371341878e-06, "loss": 0.163, "reward": 2.0234375, "reward_std": 0.26448965817689896, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 2099 }, { "clip_ratio": 0.0, "completion_length": 756.78125, "epoch": 0.84, "grad_norm": 0.2162692158536283, "kl": 4.12890625, "learning_rate": 1.5195190384357405e-06, "loss": 0.161, "reward": 1.947265625, "reward_std": 0.17714616656303406, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2100 }, { "clip_ratio": 0.0, "completion_length": 759.0, "epoch": 0.8404, "grad_norm": 0.20232959432077152, "kl": 3.3828125, "learning_rate": 1.5121282386611823e-06, "loss": 0.1352, "reward": 1.974609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2101 }, { "clip_ratio": 0.0, "completion_length": 819.125, "epoch": 0.8408, "grad_norm": 0.1464572921192024, "kl": 3.5859375, "learning_rate": 1.5047539864269477e-06, "loss": 0.143, "reward": 1.958984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2102 }, { "clip_ratio": 0.0, "completion_length": 776.125, "epoch": 0.8412, "grad_norm": 0.37142653243906026, "kl": 3.96875, "learning_rate": 1.4973962961095135e-06, "loss": 0.159, "reward": 2.064453125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2103 }, { "clip_ratio": 0.0, "completion_length": 689.125, "epoch": 0.8416, "grad_norm": 0.12971849529091317, "kl": 3.8984375, "learning_rate": 1.490055182053083e-06, "loss": 0.156, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2104 }, { "clip_ratio": 0.0, "completion_length": 867.375, "epoch": 0.842, "grad_norm": 0.28966613128655905, "kl": 4.08203125, "learning_rate": 1.4827306585695234e-06, "loss": 0.1632, "reward": 1.951171875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2105 }, { "clip_ratio": 0.0, "completion_length": 792.5, "epoch": 0.8424, "grad_norm": 0.22008290666517089, "kl": 3.9453125, "learning_rate": 1.4754227399383758e-06, "loss": 0.1579, "reward": 2.2109375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 2106 }, { "clip_ratio": 0.0, "completion_length": 718.25, "epoch": 0.8428, "grad_norm": 0.17069763110208902, "kl": 3.94921875, "learning_rate": 1.468131440406798e-06, "loss": 0.158, "reward": 2.0546875, "reward_std": 0.15788477659225464, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9921875, "step": 2107 }, { "clip_ratio": 0.0, "completion_length": 798.25, "epoch": 0.8432, "grad_norm": 0.22346678916969337, "kl": 3.9609375, "learning_rate": 1.4608567741895496e-06, "loss": 0.1587, "reward": 1.9921875, "reward_std": 0.1834537610411644, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2108 }, { "clip_ratio": 0.0, "completion_length": 685.75, "epoch": 0.8436, "grad_norm": 0.06601475162705205, "kl": 3.2890625, "learning_rate": 1.4535987554689712e-06, "loss": 0.1319, "reward": 1.984375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2109 }, { "clip_ratio": 0.0, "completion_length": 804.375, "epoch": 0.844, "grad_norm": 0.20557795241692936, "kl": 4.203125, "learning_rate": 1.446357398394934e-06, "loss": 0.1681, "reward": 2.080078125, "reward_std": 0.1796875, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2110 }, { "clip_ratio": 0.0, "completion_length": 689.375, "epoch": 0.8444, "grad_norm": 0.23416047498369888, "kl": 3.9453125, "learning_rate": 1.439132717084839e-06, "loss": 0.1579, "reward": 2.080078125, "reward_std": 0.2020099237561226, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.978515625, "step": 2111 }, { "clip_ratio": 0.0, "completion_length": 840.046875, "epoch": 0.8448, "grad_norm": 0.2954530979188524, "kl": 3.5546875, "learning_rate": 1.4319247256235713e-06, "loss": 0.1388, "reward": 1.9765625, "reward_std": 0.25668321549892426, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 2112 }, { "clip_ratio": 0.0, "completion_length": 793.875, "epoch": 0.8452, "grad_norm": 0.22413250161809617, "kl": 4.2890625, "learning_rate": 1.4247334380634792e-06, "loss": 0.1717, "reward": 2.080078125, "reward_std": 0.2421875, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2113 }, { "clip_ratio": 0.0, "completion_length": 821.0, "epoch": 0.8456, "grad_norm": 0.3177712428403385, "kl": 4.15234375, "learning_rate": 1.4175588684243447e-06, "loss": 0.1664, "reward": 1.955078125, "reward_std": 0.2052573561668396, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2114 }, { "clip_ratio": 0.0, "completion_length": 703.1171875, "epoch": 0.846, "grad_norm": 0.12233633727972149, "kl": 3.47265625, "learning_rate": 1.4104010306933558e-06, "loss": 0.1314, "reward": 2.0703125, "reward_std": 0.1209537610411644, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2115 }, { "clip_ratio": 0.0, "completion_length": 885.25, "epoch": 0.8464, "grad_norm": 0.09672069565174587, "kl": 4.51953125, "learning_rate": 1.40325993882509e-06, "loss": 0.1808, "reward": 1.9609375, "reward_std": 0.1261480376124382, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2116 }, { "clip_ratio": 0.0, "completion_length": 907.5, "epoch": 0.8468, "grad_norm": 1.4106384110377086, "kl": 4.390625, "learning_rate": 1.3961356067414667e-06, "loss": 0.1754, "reward": 1.876953125, "reward_std": 0.3213491216301918, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.962890625, "step": 2117 }, { "clip_ratio": 0.0, "completion_length": 750.375, "epoch": 0.8472, "grad_norm": 1.088948878142157, "kl": 3.70703125, "learning_rate": 1.3890280483317375e-06, "loss": 0.1481, "reward": 2.099609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2118 }, { "clip_ratio": 0.0, "completion_length": 866.5, "epoch": 0.8476, "grad_norm": 0.41714187739810044, "kl": 4.046875, "learning_rate": 1.381937277452451e-06, "loss": 0.162, "reward": 2.1328125, "reward_std": 0.3068819046020508, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9609375, "step": 2119 }, { "clip_ratio": 0.0, "completion_length": 815.5, "epoch": 0.848, "grad_norm": 0.1766344087863047, "kl": 3.98828125, "learning_rate": 1.3748633079274254e-06, "loss": 0.1598, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 2120 }, { "clip_ratio": 0.0, "completion_length": 781.75, "epoch": 0.8484, "grad_norm": 0.33091523442299714, "kl": 3.6484375, "learning_rate": 1.3678061535477305e-06, "loss": 0.1459, "reward": 2.080078125, "reward_std": 0.12548638880252838, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 2121 }, { "clip_ratio": 0.0, "completion_length": 698.25, "epoch": 0.8488, "grad_norm": 0.09267456658016374, "kl": 3.9375, "learning_rate": 1.3607658280716474e-06, "loss": 0.1571, "reward": 2.109375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2122 }, { "clip_ratio": 0.0, "completion_length": 858.375, "epoch": 0.8492, "grad_norm": 0.19595803247252142, "kl": 3.96875, "learning_rate": 1.3537423452246522e-06, "loss": 0.1586, "reward": 1.951171875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2123 }, { "clip_ratio": 0.0, "completion_length": 629.75, "epoch": 0.8496, "grad_norm": 0.4449096239713227, "kl": 4.0234375, "learning_rate": 1.3467357186993802e-06, "loss": 0.161, "reward": 2.099609375, "reward_std": 0.06961458921432495, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2124 }, { "clip_ratio": 0.0, "completion_length": 871.125, "epoch": 0.85, "grad_norm": 0.30858239339933824, "kl": 4.3359375, "learning_rate": 1.339745962155613e-06, "loss": 0.1734, "reward": 2.041015625, "reward_std": 0.25585293769836426, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.986328125, "step": 2125 }, { "clip_ratio": 0.0, "completion_length": 659.375, "epoch": 0.8504, "grad_norm": 0.11961927959643538, "kl": 3.44921875, "learning_rate": 1.3327730892202384e-06, "loss": 0.1379, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2126 }, { "clip_ratio": 0.0, "completion_length": 708.0, "epoch": 0.8508, "grad_norm": 0.04815653436136281, "kl": 3.890625, "learning_rate": 1.3258171134872267e-06, "loss": 0.1557, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2127 }, { "clip_ratio": 0.0, "completion_length": 783.8125, "epoch": 0.8512, "grad_norm": 2.6030466916383386, "kl": 3.95703125, "learning_rate": 1.3188780485176089e-06, "loss": 0.1558, "reward": 2.107421875, "reward_std": 0.18472521007061005, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2128 }, { "clip_ratio": 0.0, "completion_length": 896.875, "epoch": 0.8516, "grad_norm": 0.16163211948056108, "kl": 4.01171875, "learning_rate": 1.3119559078394462e-06, "loss": 0.1605, "reward": 1.935546875, "reward_std": 0.2578125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2129 }, { "clip_ratio": 0.0, "completion_length": 755.875, "epoch": 0.852, "grad_norm": 0.2480449003589986, "kl": 3.859375, "learning_rate": 1.30505070494781e-06, "loss": 0.1545, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 2130 }, { "clip_ratio": 0.0, "completion_length": 821.75, "epoch": 0.8524, "grad_norm": 0.2295162007587851, "kl": 4.5625, "learning_rate": 1.2981624533047432e-06, "loss": 0.1832, "reward": 2.06640625, "reward_std": 0.234375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.97265625, "step": 2131 }, { "clip_ratio": 0.0, "completion_length": 932.375, "epoch": 0.8528, "grad_norm": 0.5237609510784036, "kl": 5.046875, "learning_rate": 1.2912911663392468e-06, "loss": 0.2018, "reward": 1.91796875, "reward_std": 0.328125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2132 }, { "clip_ratio": 0.0, "completion_length": 849.25, "epoch": 0.8532, "grad_norm": 0.11250103216708067, "kl": 4.05859375, "learning_rate": 1.2844368574472454e-06, "loss": 0.1624, "reward": 2.01953125, "reward_std": 0.2101532220840454, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2133 }, { "clip_ratio": 0.0, "completion_length": 704.0, "epoch": 0.8536, "grad_norm": 0.4500146445257021, "kl": 3.79296875, "learning_rate": 1.277599539991563e-06, "loss": 0.1517, "reward": 1.92578125, "reward_std": 0.22853436321020126, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2134 }, { "clip_ratio": 0.0, "completion_length": 744.25, "epoch": 0.854, "grad_norm": 0.22756860891511235, "kl": 4.1171875, "learning_rate": 1.2707792273019049e-06, "loss": 0.165, "reward": 2.1015625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2135 }, { "clip_ratio": 0.0, "completion_length": 809.75, "epoch": 0.8544, "grad_norm": 0.3379894842798751, "kl": 3.76171875, "learning_rate": 1.2639759326748136e-06, "loss": 0.1505, "reward": 1.962890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2136 }, { "clip_ratio": 0.0, "completion_length": 811.0, "epoch": 0.8548, "grad_norm": 0.14203731537866382, "kl": 3.6484375, "learning_rate": 1.257189669373664e-06, "loss": 0.1457, "reward": 1.974609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2137 }, { "clip_ratio": 0.0, "completion_length": 755.125, "epoch": 0.8552, "grad_norm": 0.17747369648063338, "kl": 4.4921875, "learning_rate": 1.2504204506286244e-06, "loss": 0.1798, "reward": 2.064453125, "reward_std": 0.20257875323295593, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.970703125, "step": 2138 }, { "clip_ratio": 0.0, "completion_length": 803.375, "epoch": 0.8556, "grad_norm": 1.030400835264879, "kl": 4.234375, "learning_rate": 1.2436682896366282e-06, "loss": 0.1696, "reward": 2.068359375, "reward_std": 0.2265625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 2139 }, { "clip_ratio": 0.0, "completion_length": 748.875, "epoch": 0.856, "grad_norm": 0.7382064380327441, "kl": 3.89453125, "learning_rate": 1.2369331995613664e-06, "loss": 0.1561, "reward": 1.9453125, "reward_std": 0.1818198561668396, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2140 }, { "clip_ratio": 0.0, "completion_length": 838.625, "epoch": 0.8564, "grad_norm": 0.16444080287138924, "kl": 4.5859375, "learning_rate": 1.230215193533233e-06, "loss": 0.1836, "reward": 2.20703125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2141 }, { "clip_ratio": 0.0, "completion_length": 822.625, "epoch": 0.8568, "grad_norm": 0.16713931668632329, "kl": 4.19921875, "learning_rate": 1.223514284649331e-06, "loss": 0.1678, "reward": 1.927734375, "reward_std": 0.19691256433725357, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.974609375, "step": 2142 }, { "clip_ratio": 0.0, "completion_length": 890.125, "epoch": 0.8572, "grad_norm": 0.32270231656188153, "kl": 4.3125, "learning_rate": 1.2168304859734226e-06, "loss": 0.1725, "reward": 1.970703125, "reward_std": 0.2584804594516754, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.970703125, "step": 2143 }, { "clip_ratio": 0.0, "completion_length": 756.0, "epoch": 0.8576, "grad_norm": 0.21463262466014682, "kl": 3.70703125, "learning_rate": 1.210163810535917e-06, "loss": 0.1486, "reward": 2.119140625, "reward_std": 0.2590331584215164, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 2144 }, { "clip_ratio": 0.0, "completion_length": 761.375, "epoch": 0.858, "grad_norm": 0.2703349832284943, "kl": 4.05078125, "learning_rate": 1.2035142713338366e-06, "loss": 0.1621, "reward": 1.943359375, "reward_std": 0.15584102272987366, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 2145 }, { "clip_ratio": 0.0, "completion_length": 683.625, "epoch": 0.8584, "grad_norm": 0.16182753999692867, "kl": 3.8359375, "learning_rate": 1.196881881330798e-06, "loss": 0.1537, "reward": 1.984375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2146 }, { "clip_ratio": 0.0, "completion_length": 783.875, "epoch": 0.8588, "grad_norm": 0.20697461066186262, "kl": 5.2109375, "learning_rate": 1.1902666534569884e-06, "loss": 0.2081, "reward": 2.0703125, "reward_std": 0.1900520622730255, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.984375, "step": 2147 }, { "clip_ratio": 0.0, "completion_length": 784.125, "epoch": 0.8592, "grad_norm": 0.24577455272556914, "kl": 4.01953125, "learning_rate": 1.1836686006091313e-06, "loss": 0.1605, "reward": 2.029296875, "reward_std": 0.2076982855796814, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2148 }, { "clip_ratio": 0.0, "completion_length": 709.75, "epoch": 0.8596, "grad_norm": 0.1400652593959231, "kl": 4.09375, "learning_rate": 1.1770877356504684e-06, "loss": 0.164, "reward": 1.96875, "reward_std": 0.125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 2149 }, { "clip_ratio": 0.0, "completion_length": 833.5, "epoch": 0.86, "grad_norm": 0.2684490902253054, "kl": 4.828125, "learning_rate": 1.1705240714107301e-06, "loss": 0.1933, "reward": 1.927734375, "reward_std": 0.2521323561668396, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.966796875, "step": 2150 }, { "clip_ratio": 0.0, "completion_length": 649.5, "epoch": 0.8604, "grad_norm": 0.15728692638297878, "kl": 4.09375, "learning_rate": 1.1639776206861197e-06, "loss": 0.1637, "reward": 2.048828125, "reward_std": 0.1623838096857071, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2151 }, { "clip_ratio": 0.0, "completion_length": 798.0, "epoch": 0.8608, "grad_norm": 0.40342531553850275, "kl": 4.1484375, "learning_rate": 1.1574483962392768e-06, "loss": 0.1661, "reward": 2.013671875, "reward_std": 0.23015566170215607, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2152 }, { "clip_ratio": 0.0, "completion_length": 821.25, "epoch": 0.8612, "grad_norm": 0.2522897681898484, "kl": 4.22265625, "learning_rate": 1.1509364107992582e-06, "loss": 0.169, "reward": 1.94140625, "reward_std": 0.19476625323295593, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.97265625, "step": 2153 }, { "clip_ratio": 0.0, "completion_length": 824.125, "epoch": 0.8616, "grad_norm": 0.20258214840206493, "kl": 4.2890625, "learning_rate": 1.1444416770615118e-06, "loss": 0.1714, "reward": 1.96875, "reward_std": 0.23595499992370605, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2154 }, { "clip_ratio": 0.0, "completion_length": 738.75, "epoch": 0.862, "grad_norm": 0.24426904506107885, "kl": 4.515625, "learning_rate": 1.1379642076878528e-06, "loss": 0.1805, "reward": 2.09375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2155 }, { "clip_ratio": 0.0, "completion_length": 765.375, "epoch": 0.8624, "grad_norm": 0.23991742622764647, "kl": 4.0078125, "learning_rate": 1.1315040153064416e-06, "loss": 0.1604, "reward": 2.078125, "reward_std": 0.1505698561668396, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 2156 }, { "clip_ratio": 0.0, "completion_length": 588.5, "epoch": 0.8628, "grad_norm": 0.3005805825598906, "kl": 3.16796875, "learning_rate": 1.1250611125117527e-06, "loss": 0.1267, "reward": 2.140625, "reward_std": 0.042695626616477966, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2157 }, { "clip_ratio": 0.0, "completion_length": 889.0, "epoch": 0.8632, "grad_norm": 0.09501409863700264, "kl": 4.65625, "learning_rate": 1.1186355118645552e-06, "loss": 0.1867, "reward": 2.0390625, "reward_std": 0.2753010094165802, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9609375, "step": 2158 }, { "clip_ratio": 0.0, "completion_length": 698.375, "epoch": 0.8636, "grad_norm": 0.19487013562746516, "kl": 3.46875, "learning_rate": 1.1122272258918864e-06, "loss": 0.1386, "reward": 1.958984375, "reward_std": 0.13211458921432495, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2159 }, { "clip_ratio": 0.0, "completion_length": 704.5, "epoch": 0.864, "grad_norm": 0.08063837986073627, "kl": 3.72265625, "learning_rate": 1.1058362670870248e-06, "loss": 0.1489, "reward": 1.98828125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2160 }, { "clip_ratio": 0.0, "completion_length": 601.75, "epoch": 0.8644, "grad_norm": 0.13320282948230278, "kl": 3.8125, "learning_rate": 1.0994626479094749e-06, "loss": 0.1525, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2161 }, { "clip_ratio": 0.0, "completion_length": 614.75, "epoch": 0.8648, "grad_norm": 0.2653073672987709, "kl": 3.53515625, "learning_rate": 1.093106380784934e-06, "loss": 0.1412, "reward": 1.990234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 2162 }, { "clip_ratio": 0.0, "completion_length": 698.875, "epoch": 0.8652, "grad_norm": 0.2915093947785712, "kl": 3.9453125, "learning_rate": 1.0867674781052683e-06, "loss": 0.158, "reward": 1.9609375, "reward_std": 0.12430208921432495, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2163 }, { "clip_ratio": 0.0, "completion_length": 697.953125, "epoch": 0.8656, "grad_norm": 0.183572975670322, "kl": 3.73828125, "learning_rate": 1.0804459522284927e-06, "loss": 0.1432, "reward": 2.138671875, "reward_std": 0.23015566170215607, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2164 }, { "clip_ratio": 0.0, "completion_length": 765.25, "epoch": 0.866, "grad_norm": 0.23517027745019325, "kl": 4.0859375, "learning_rate": 1.0741418154787443e-06, "loss": 0.1635, "reward": 1.98046875, "reward_std": 0.1529511883854866, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 2165 }, { "clip_ratio": 0.0, "completion_length": 960.625, "epoch": 0.8664, "grad_norm": 0.8153141222274998, "kl": 5.05859375, "learning_rate": 1.0678550801462662e-06, "loss": 0.2021, "reward": 2.052734375, "reward_std": 0.3441469967365265, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2166 }, { "clip_ratio": 0.0, "completion_length": 685.75, "epoch": 0.8668, "grad_norm": 0.21018903168792005, "kl": 3.78125, "learning_rate": 1.0615857584873624e-06, "loss": 0.1509, "reward": 2.189453125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2167 }, { "clip_ratio": 0.0, "completion_length": 806.5, "epoch": 0.8672, "grad_norm": 0.2561680888256301, "kl": 4.0078125, "learning_rate": 1.0553338627244026e-06, "loss": 0.1603, "reward": 2.08984375, "reward_std": 0.2478339672088623, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.97265625, "step": 2168 }, { "clip_ratio": 0.0, "completion_length": 730.0, "epoch": 0.8676, "grad_norm": 2.255384188425045, "kl": 4.953125, "learning_rate": 1.0490994050457748e-06, "loss": 0.198, "reward": 1.94921875, "reward_std": 0.16933366656303406, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2169 }, { "clip_ratio": 0.0, "completion_length": 749.5, "epoch": 0.868, "grad_norm": 0.2779595730083843, "kl": 3.97265625, "learning_rate": 1.042882397605871e-06, "loss": 0.1587, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2170 }, { "clip_ratio": 0.0, "completion_length": 736.25, "epoch": 0.8684, "grad_norm": 0.24212449234779274, "kl": 3.69140625, "learning_rate": 1.0366828525250728e-06, "loss": 0.1477, "reward": 2.05078125, "reward_std": 0.1700107902288437, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2171 }, { "clip_ratio": 0.0, "completion_length": 874.75, "epoch": 0.8688, "grad_norm": 0.44208993419825315, "kl": 4.2421875, "learning_rate": 1.0305007818897006e-06, "loss": 0.1696, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2172 }, { "clip_ratio": 0.0, "completion_length": 768.25, "epoch": 0.8692, "grad_norm": 0.3877304798589207, "kl": 4.6328125, "learning_rate": 1.024336197752025e-06, "loss": 0.1852, "reward": 1.943359375, "reward_std": 0.19277116656303406, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 2173 }, { "clip_ratio": 0.0, "completion_length": 707.625, "epoch": 0.8696, "grad_norm": 0.5303593376449643, "kl": 4.23046875, "learning_rate": 1.0181891121302145e-06, "loss": 0.1693, "reward": 2.05078125, "reward_std": 0.08903051167726517, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2174 }, { "clip_ratio": 0.0, "completion_length": 554.8671875, "epoch": 0.87, "grad_norm": 0.3311327777004188, "kl": 3.68359375, "learning_rate": 1.012059537008332e-06, "loss": 0.1379, "reward": 1.990234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 2175 }, { "clip_ratio": 0.0, "completion_length": 706.25, "epoch": 0.8704, "grad_norm": 0.10754272257610029, "kl": 3.86328125, "learning_rate": 1.0059474843362893e-06, "loss": 0.1546, "reward": 2.22265625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2176 }, { "clip_ratio": 0.0, "completion_length": 734.75, "epoch": 0.8708, "grad_norm": 0.13946554863398536, "kl": 4.2734375, "learning_rate": 9.99852966029854e-07, "loss": 0.1707, "reward": 2.0234375, "reward_std": 0.12233919650316238, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2177 }, { "clip_ratio": 0.0, "completion_length": 842.25, "epoch": 0.8712, "grad_norm": 0.2664409203618155, "kl": 4.453125, "learning_rate": 9.93775993970597e-07, "loss": 0.1781, "reward": 1.951171875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2178 }, { "clip_ratio": 0.0, "completion_length": 653.75, "epoch": 0.8716, "grad_norm": 0.1548720872913265, "kl": 3.703125, "learning_rate": 9.877165800058874e-07, "loss": 0.1483, "reward": 1.984375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2179 }, { "clip_ratio": 0.0, "completion_length": 784.75, "epoch": 0.872, "grad_norm": 0.08425152580737741, "kl": 4.2734375, "learning_rate": 9.816747359488632e-07, "loss": 0.1708, "reward": 1.970703125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 2180 }, { "clip_ratio": 0.0, "completion_length": 681.75, "epoch": 0.8724, "grad_norm": 0.5954853870341453, "kl": 3.78515625, "learning_rate": 9.756504735784067e-07, "loss": 0.1513, "reward": 1.98828125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2181 }, { "clip_ratio": 0.0, "completion_length": 634.375, "epoch": 0.8728, "grad_norm": 0.07876061456752466, "kl": 4.2265625, "learning_rate": 9.696438046391288e-07, "loss": 0.169, "reward": 1.974609375, "reward_std": 0.06961458921432495, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2182 }, { "clip_ratio": 0.0, "completion_length": 853.96875, "epoch": 0.8732, "grad_norm": 0.4388361588480049, "kl": 4.51171875, "learning_rate": 9.636547408413355e-07, "loss": 0.1667, "reward": 1.923828125, "reward_std": 0.2759895622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 2183 }, { "clip_ratio": 0.0, "completion_length": 669.875, "epoch": 0.8736, "grad_norm": 0.554998115028257, "kl": 3.671875, "learning_rate": 9.576832938610137e-07, "loss": 0.1469, "reward": 2.091796875, "reward_std": 0.09320375323295593, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.982421875, "step": 2184 }, { "clip_ratio": 0.0, "completion_length": 768.75, "epoch": 0.874, "grad_norm": 0.21617334901228455, "kl": 4.5703125, "learning_rate": 9.517294753398066e-07, "loss": 0.1828, "reward": 2.216796875, "reward_std": 0.2965635135769844, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 2185 }, { "clip_ratio": 0.0, "completion_length": 813.25, "epoch": 0.8744, "grad_norm": 0.40064959639795317, "kl": 4.8828125, "learning_rate": 9.457932968849826e-07, "loss": 0.1946, "reward": 1.962890625, "reward_std": 0.2638786733150482, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2186 }, { "clip_ratio": 0.0, "completion_length": 809.875, "epoch": 0.8748, "grad_norm": 0.9743535422432226, "kl": 4.59375, "learning_rate": 9.398747700694322e-07, "loss": 0.1837, "reward": 2.09375, "reward_std": 0.125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 2187 }, { "clip_ratio": 0.0, "completion_length": 733.625, "epoch": 0.8752, "grad_norm": 0.2762705863724687, "kl": 4.484375, "learning_rate": 9.339739064316233e-07, "loss": 0.1793, "reward": 1.9765625, "reward_std": 0.13508247584104538, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 2188 }, { "clip_ratio": 0.0, "completion_length": 727.875, "epoch": 0.8756, "grad_norm": 0.6383349311031566, "kl": 4.640625, "learning_rate": 9.280907174755916e-07, "loss": 0.1859, "reward": 2.125, "reward_std": 0.18090169876813889, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 2189 }, { "clip_ratio": 0.0, "completion_length": 792.875, "epoch": 0.876, "grad_norm": 0.2320810138079594, "kl": 4.0859375, "learning_rate": 9.222252146709143e-07, "loss": 0.1637, "reward": 1.9453125, "reward_std": 0.14984101057052612, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2190 }, { "clip_ratio": 0.0, "completion_length": 566.625, "epoch": 0.8764, "grad_norm": 2.0839767406894425, "kl": 3.5390625, "learning_rate": 9.16377409452689e-07, "loss": 0.1415, "reward": 2.154296875, "reward_std": 0.06371419876813889, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.998046875, "step": 2191 }, { "clip_ratio": 0.0, "completion_length": 790.0, "epoch": 0.8768, "grad_norm": 0.28354493069353637, "kl": 4.0234375, "learning_rate": 9.105473132215126e-07, "loss": 0.1612, "reward": 2.001953125, "reward_std": 0.2106628566980362, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2192 }, { "clip_ratio": 0.0, "completion_length": 601.875, "epoch": 0.8772, "grad_norm": 1.1399928202592802, "kl": 3.47265625, "learning_rate": 9.047349373434566e-07, "loss": 0.1388, "reward": 1.998046875, "reward_std": 0.0078125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.998046875, "step": 2193 }, { "clip_ratio": 0.0, "completion_length": 739.0, "epoch": 0.8776, "grad_norm": 1.021695742003637, "kl": 4.21875, "learning_rate": 8.989402931500434e-07, "loss": 0.1688, "reward": 1.974609375, "reward_std": 0.1591114066541195, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.982421875, "step": 2194 }, { "clip_ratio": 0.0, "completion_length": 724.625, "epoch": 0.878, "grad_norm": 0.5767916264380362, "kl": 3.7734375, "learning_rate": 8.931633919382299e-07, "loss": 0.1508, "reward": 1.94921875, "reward_std": 0.15529775619506836, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.97265625, "step": 2195 }, { "clip_ratio": 0.0, "completion_length": 835.125, "epoch": 0.8784, "grad_norm": 0.6977472749329887, "kl": 3.74609375, "learning_rate": 8.874042449703779e-07, "loss": 0.1499, "reward": 2.23046875, "reward_std": 0.171875, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98046875, "step": 2196 }, { "clip_ratio": 0.0, "completion_length": 842.25, "epoch": 0.8788, "grad_norm": 3.9337759913135537, "kl": 3.33203125, "learning_rate": 8.816628634742441e-07, "loss": 0.1331, "reward": 2.17578125, "reward_std": 0.2140132561326027, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2197 }, { "clip_ratio": 0.0, "completion_length": 927.5, "epoch": 0.8792, "grad_norm": 7.216785325169615, "kl": 3.73046875, "learning_rate": 8.759392586429394e-07, "loss": 0.1493, "reward": 1.98828125, "reward_std": 0.19346429780125618, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.97265625, "step": 2198 }, { "clip_ratio": 0.0, "completion_length": 989.375, "epoch": 0.8796, "grad_norm": 3.975225146008592, "kl": 3.671875, "learning_rate": 8.702334416349279e-07, "loss": 0.1468, "reward": 1.91796875, "reward_std": 0.26608753576874733, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.94921875, "step": 2199 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.88, "grad_norm": 6.336570795673538, "kl": 3.26953125, "learning_rate": 8.645454235739903e-07, "loss": 0.1309, "reward": 2.154296875, "reward_std": 0.133885832503438, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.966796875, "step": 2200 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8804, "grad_norm": 3.5139124736340106, "kl": 2.8515625, "learning_rate": 8.58875215549212e-07, "loss": 0.1141, "reward": 2.078125, "reward_std": 0.18703142181038857, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.953125, "step": 2201 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8808, "grad_norm": 6.991196041704374, "kl": 2.72265625, "learning_rate": 8.532228286149502e-07, "loss": 0.1089, "reward": 2.0390625, "reward_std": 0.2142604999244213, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9453125, "step": 2202 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8812, "grad_norm": 2.037901719666874, "kl": 2.33984375, "learning_rate": 8.475882737908248e-07, "loss": 0.0935, "reward": 2.064453125, "reward_std": 0.17969895713031292, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.947265625, "step": 2203 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8816, "grad_norm": 7.344265787222103, "kl": 2.04296875, "learning_rate": 8.419715620616875e-07, "loss": 0.0816, "reward": 1.90234375, "reward_std": 0.2202620431780815, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.92578125, "step": 2204 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.882, "grad_norm": 4.432024973298271, "kl": 2.34375, "learning_rate": 8.363727043776037e-07, "loss": 0.0938, "reward": 1.96875, "reward_std": 0.1386595331132412, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.953125, "step": 2205 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8824, "grad_norm": 2.471161832585205, "kl": 2.19921875, "learning_rate": 8.307917116538378e-07, "loss": 0.0881, "reward": 1.9453125, "reward_std": 0.2708318643271923, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.8984375, "step": 2206 }, { "clip_ratio": 0.0, "completion_length": 996.125, "epoch": 0.8828, "grad_norm": 2.374521799748917, "kl": 2.20703125, "learning_rate": 8.252285947708139e-07, "loss": 0.0884, "reward": 1.931640625, "reward_std": 0.3021981194615364, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.916015625, "step": 2207 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8832, "grad_norm": 7.224477318323515, "kl": 2.275390625, "learning_rate": 8.196833645741187e-07, "loss": 0.0911, "reward": 1.984375, "reward_std": 0.37516891211271286, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.90625, "step": 2208 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8836, "grad_norm": 4.298791638478525, "kl": 2.45703125, "learning_rate": 8.141560318744601e-07, "loss": 0.0983, "reward": 1.90234375, "reward_std": 0.31733763962984085, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.90234375, "step": 2209 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.884, "grad_norm": 6.933959354115516, "kl": 2.62109375, "learning_rate": 8.086466074476562e-07, "loss": 0.1047, "reward": 1.98046875, "reward_std": 0.22803045809268951, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.91015625, "step": 2210 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8844, "grad_norm": 4.2674180611759205, "kl": 3.08203125, "learning_rate": 8.031551020346129e-07, "loss": 0.1232, "reward": 2.2734375, "reward_std": 0.207103431224823, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.921875, "step": 2211 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8848, "grad_norm": 4.62455633956363, "kl": 2.5390625, "learning_rate": 7.976815263412963e-07, "loss": 0.1012, "reward": 1.84375, "reward_std": 0.4154632091522217, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.9140625, "step": 2212 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8852, "grad_norm": 2.9731062425658967, "kl": 2.75, "learning_rate": 7.922258910387282e-07, "loss": 0.1099, "reward": 1.955078125, "reward_std": 0.33771007508039474, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.939453125, "step": 2213 }, { "clip_ratio": 0.0, "completion_length": 1017.7578125, "epoch": 0.8856, "grad_norm": 2.098924658993973, "kl": 3.6484375, "learning_rate": 7.867882067629473e-07, "loss": 0.1305, "reward": 2.0859375, "reward_std": 0.36132121086120605, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9296875, "step": 2214 }, { "clip_ratio": 0.0, "completion_length": 1018.875, "epoch": 0.886, "grad_norm": 6.455455876563587, "kl": 3.1171875, "learning_rate": 7.81368484114996e-07, "loss": 0.1264, "reward": 1.98046875, "reward_std": 0.4158381149172783, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.92578125, "step": 2215 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8864, "grad_norm": 1.7471662215113621, "kl": 3.21875, "learning_rate": 7.759667336609011e-07, "loss": 0.1288, "reward": 1.97265625, "reward_std": 0.26412031427025795, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.96484375, "step": 2216 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8868, "grad_norm": 1.8455688446335912, "kl": 3.28125, "learning_rate": 7.7058296593165e-07, "loss": 0.1314, "reward": 2.087890625, "reward_std": 0.3372665047645569, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.931640625, "step": 2217 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8872, "grad_norm": 7.578855901321196, "kl": 3.58984375, "learning_rate": 7.652171914231777e-07, "loss": 0.1434, "reward": 1.94140625, "reward_std": 0.2558402828872204, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.95703125, "step": 2218 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8876, "grad_norm": 6.600686014314426, "kl": 3.51171875, "learning_rate": 7.598694205963331e-07, "loss": 0.1403, "reward": 2.048828125, "reward_std": 0.3139517083764076, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.923828125, "step": 2219 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.888, "grad_norm": 2.5696650662168254, "kl": 3.60546875, "learning_rate": 7.545396638768698e-07, "loss": 0.1444, "reward": 1.880859375, "reward_std": 0.34148215875029564, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.927734375, "step": 2220 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8884, "grad_norm": 5.997415805455163, "kl": 3.6640625, "learning_rate": 7.492279316554207e-07, "loss": 0.1464, "reward": 1.9140625, "reward_std": 0.2765871249139309, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9453125, "step": 2221 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8888, "grad_norm": 8.823479146536537, "kl": 3.43359375, "learning_rate": 7.439342342874789e-07, "loss": 0.1375, "reward": 1.92578125, "reward_std": 0.2244468592107296, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.94921875, "step": 2222 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8892, "grad_norm": 5.059943315711173, "kl": 3.671875, "learning_rate": 7.386585820933812e-07, "loss": 0.1471, "reward": 2.140625, "reward_std": 0.24549544230103493, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.953125, "step": 2223 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8896, "grad_norm": 1.5977694163375722, "kl": 2.9609375, "learning_rate": 7.334009853582791e-07, "loss": 0.1185, "reward": 1.935546875, "reward_std": 0.20697950199246407, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.958984375, "step": 2224 }, { "clip_ratio": 0.0, "completion_length": 1019.3203125, "epoch": 0.89, "grad_norm": 2.8908075725036557, "kl": 3.75, "learning_rate": 7.281614543321269e-07, "loss": 0.1332, "reward": 1.9921875, "reward_std": 0.3652817904949188, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9453125, "step": 2225 }, { "clip_ratio": 0.0, "completion_length": 1017.9609375, "epoch": 0.8904, "grad_norm": 0.7514181743486459, "kl": 2.9453125, "learning_rate": 7.22939999229657e-07, "loss": 0.0963, "reward": 1.9140625, "reward_std": 0.2844337522983551, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.953125, "step": 2226 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8908, "grad_norm": 1.281710967701381, "kl": 3.5234375, "learning_rate": 7.177366302303667e-07, "loss": 0.1411, "reward": 2.12109375, "reward_std": 0.26758297719061375, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.94921875, "step": 2227 }, { "clip_ratio": 0.0, "completion_length": 1021.484375, "epoch": 0.8912, "grad_norm": 12.63833695622528, "kl": 3.37109375, "learning_rate": 7.125513574784904e-07, "loss": 0.1261, "reward": 1.912109375, "reward_std": 0.3261212036013603, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.951171875, "step": 2228 }, { "clip_ratio": 0.0, "completion_length": 957.265625, "epoch": 0.8916, "grad_norm": 14.257302035909763, "kl": 3.6640625, "learning_rate": 7.073841910829771e-07, "loss": 0.1308, "reward": 2.017578125, "reward_std": 0.3478933088481426, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.962890625, "step": 2229 }, { "clip_ratio": 0.0, "completion_length": 1007.8046875, "epoch": 0.892, "grad_norm": 1.2441951219017076, "kl": 3.59765625, "learning_rate": 7.022351411174866e-07, "loss": 0.1133, "reward": 1.884765625, "reward_std": 0.31470103189349174, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.947265625, "step": 2230 }, { "clip_ratio": 0.0, "completion_length": 1018.953125, "epoch": 0.8924, "grad_norm": 1.3033984301743333, "kl": 4.00390625, "learning_rate": 6.971042176203535e-07, "loss": 0.1425, "reward": 1.958984375, "reward_std": 0.23584988340735435, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.958984375, "step": 2231 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8928, "grad_norm": 1.163163191487747, "kl": 3.2265625, "learning_rate": 6.919914305945774e-07, "loss": 0.1289, "reward": 1.931640625, "reward_std": 0.22430991753935814, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.955078125, "step": 2232 }, { "clip_ratio": 0.0, "completion_length": 1018.9296875, "epoch": 0.8932, "grad_norm": 0.5411577805024327, "kl": 4.3671875, "learning_rate": 6.868967900077972e-07, "loss": 0.164, "reward": 2.0390625, "reward_std": 0.2692459896206856, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9609375, "step": 2233 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8936, "grad_norm": 0.5666160199953963, "kl": 3.54296875, "learning_rate": 6.818203057922756e-07, "loss": 0.1418, "reward": 1.93359375, "reward_std": 0.24010245129466057, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.96484375, "step": 2234 }, { "clip_ratio": 0.0, "completion_length": 1019.6015625, "epoch": 0.894, "grad_norm": 0.6811356607830032, "kl": 3.578125, "learning_rate": 6.767619878448783e-07, "loss": 0.1373, "reward": 1.923828125, "reward_std": 0.20309529453516006, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.962890625, "step": 2235 }, { "clip_ratio": 0.0, "completion_length": 994.9921875, "epoch": 0.8944, "grad_norm": 32.12366276912937, "kl": 10.2578125, "learning_rate": 6.717218460270536e-07, "loss": 0.3937, "reward": 1.96875, "reward_std": 0.20082489028573036, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 2236 }, { "clip_ratio": 0.0, "completion_length": 949.9375, "epoch": 0.8948, "grad_norm": 0.49877201361411005, "kl": 4.0859375, "learning_rate": 6.666998901648203e-07, "loss": 0.1553, "reward": 1.935546875, "reward_std": 0.15151171013712883, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2237 }, { "clip_ratio": 0.0, "completion_length": 961.5, "epoch": 0.8952, "grad_norm": 0.33588123405363224, "kl": 4.19140625, "learning_rate": 6.616961300487323e-07, "loss": 0.1673, "reward": 2.033203125, "reward_std": 0.18813154846429825, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.978515625, "step": 2238 }, { "clip_ratio": 0.0, "completion_length": 879.5, "epoch": 0.8956, "grad_norm": 0.5014789143151379, "kl": 3.28515625, "learning_rate": 6.567105754338798e-07, "loss": 0.1315, "reward": 1.9453125, "reward_std": 0.1751183159649372, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.96875, "step": 2239 }, { "clip_ratio": 0.0, "completion_length": 1020.1953125, "epoch": 0.896, "grad_norm": 217416.13668354755, "kl": 6243.3828125, "learning_rate": 6.517432360398556e-07, "loss": 135.1741, "reward": 1.994140625, "reward_std": 0.0234375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.994140625, "step": 2240 }, { "clip_ratio": 0.0, "completion_length": 1013.6015625, "epoch": 0.8964, "grad_norm": 1.8924861167194138, "kl": 4.28515625, "learning_rate": 6.467941215507434e-07, "loss": 0.1483, "reward": 2.03125, "reward_std": 0.2854636311531067, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 2241 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8968, "grad_norm": 0.4173044563180689, "kl": 3.98828125, "learning_rate": 6.418632416150927e-07, "loss": 0.1597, "reward": 1.951171875, "reward_std": 0.30908436328172684, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.958984375, "step": 2242 }, { "clip_ratio": 0.0, "completion_length": 850.6953125, "epoch": 0.8972, "grad_norm": 0.21326644109539866, "kl": 3.296875, "learning_rate": 6.369506058459063e-07, "loss": 0.1236, "reward": 2.1015625, "reward_std": 0.2822566404938698, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2243 }, { "clip_ratio": 0.0, "completion_length": 740.0, "epoch": 0.8976, "grad_norm": 13.98019260629389, "kl": 5.0078125, "learning_rate": 6.320562238206218e-07, "loss": 0.1303, "reward": 1.9140625, "reward_std": 0.22373561561107635, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9765625, "step": 2244 }, { "clip_ratio": 0.0, "completion_length": 836.1171875, "epoch": 0.898, "grad_norm": 0.2658287365710063, "kl": 4.0234375, "learning_rate": 6.271801050810856e-07, "loss": 0.1455, "reward": 1.955078125, "reward_std": 0.3335120677947998, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 2245 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8984, "grad_norm": 0.2260987939932687, "kl": 3.9375, "learning_rate": 6.223222591335409e-07, "loss": 0.1578, "reward": 1.974609375, "reward_std": 0.1421625204384327, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.982421875, "step": 2246 }, { "clip_ratio": 0.0, "completion_length": 1017.921875, "epoch": 0.8988, "grad_norm": 0.5742700515752817, "kl": 4.9375, "learning_rate": 6.174826954486069e-07, "loss": 0.2002, "reward": 1.97265625, "reward_std": 0.13226625323295593, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98046875, "step": 2247 }, { "clip_ratio": 0.0, "completion_length": 1010.40625, "epoch": 0.8992, "grad_norm": 0.5241222967776276, "kl": 3.42578125, "learning_rate": 6.126614234612593e-07, "loss": 0.1155, "reward": 1.876953125, "reward_std": 0.3698892444372177, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.955078125, "step": 2248 }, { "clip_ratio": 0.0, "completion_length": 983.9921875, "epoch": 0.8996, "grad_norm": 0.37529431527927454, "kl": 3.98828125, "learning_rate": 6.078584525708175e-07, "loss": 0.1605, "reward": 2.146484375, "reward_std": 0.2605506554245949, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 2249 }, { "clip_ratio": 0.0, "completion_length": 979.5390625, "epoch": 0.9, "grad_norm": 0.31683147806245615, "kl": 4.0703125, "learning_rate": 6.030737921409169e-07, "loss": 0.1559, "reward": 2.0703125, "reward_std": 0.1886480376124382, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 1018.8984375, "epoch": 0.9004, "grad_norm": 0.34105401113777445, "kl": 4.0703125, "learning_rate": 5.98307451499498e-07, "loss": 0.1443, "reward": 2.14453125, "reward_std": 0.39559492468833923, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.95703125, "step": 2251 }, { "clip_ratio": 0.0, "completion_length": 957.0859375, "epoch": 0.9008, "grad_norm": 0.2780126787036105, "kl": 3.3515625, "learning_rate": 5.935594399387856e-07, "loss": 0.1206, "reward": 2.013671875, "reward_std": 0.35446175932884216, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.951171875, "step": 2252 }, { "clip_ratio": 0.0, "completion_length": 903.53125, "epoch": 0.9012, "grad_norm": 0.5600523598037549, "kl": 3.953125, "learning_rate": 5.888297667152731e-07, "loss": 0.1419, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2253 }, { "clip_ratio": 0.0, "completion_length": 761.875, "epoch": 0.9016, "grad_norm": 0.21788609015504184, "kl": 3.51171875, "learning_rate": 5.841184410496992e-07, "loss": 0.1406, "reward": 2.03125, "reward_std": 0.17181096598505974, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.984375, "step": 2254 }, { "clip_ratio": 0.0, "completion_length": 933.03125, "epoch": 0.902, "grad_norm": 0.40182738548952684, "kl": 3.7734375, "learning_rate": 5.794254721270331e-07, "loss": 0.1339, "reward": 1.9453125, "reward_std": 0.21381107345223427, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 2255 }, { "clip_ratio": 0.0, "completion_length": 959.28125, "epoch": 0.9024, "grad_norm": 0.24204109815650385, "kl": 3.890625, "learning_rate": 5.747508690964599e-07, "loss": 0.155, "reward": 1.939453125, "reward_std": 0.2745855376124382, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2256 }, { "clip_ratio": 0.0, "completion_length": 980.171875, "epoch": 0.9028, "grad_norm": 0.5334228417964885, "kl": 3.875, "learning_rate": 5.700946410713548e-07, "loss": 0.1464, "reward": 1.955078125, "reward_std": 0.17984195053577423, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2257 }, { "clip_ratio": 0.0, "completion_length": 821.640625, "epoch": 0.9032, "grad_norm": 0.9658683754333569, "kl": 3.32421875, "learning_rate": 5.654567971292757e-07, "loss": 0.1341, "reward": 2.0, "reward_std": 0.1441391110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2258 }, { "clip_ratio": 0.0, "completion_length": 939.8046875, "epoch": 0.9036, "grad_norm": 0.336432726188985, "kl": 3.8984375, "learning_rate": 5.608373463119354e-07, "loss": 0.1557, "reward": 1.9765625, "reward_std": 0.24611737579107285, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 2259 }, { "clip_ratio": 0.0, "completion_length": 871.578125, "epoch": 0.904, "grad_norm": 0.30168581341721873, "kl": 4.03125, "learning_rate": 5.562362976251901e-07, "loss": 0.1675, "reward": 2.19921875, "reward_std": 0.25222480297088623, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.96484375, "step": 2260 }, { "clip_ratio": 0.0, "completion_length": 937.7734375, "epoch": 0.9044, "grad_norm": 0.455072084915952, "kl": 3.71484375, "learning_rate": 5.516536600390188e-07, "loss": 0.1494, "reward": 2.068359375, "reward_std": 0.3103600740432739, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2261 }, { "clip_ratio": 0.0, "completion_length": 954.0390625, "epoch": 0.9048, "grad_norm": 0.44033566900030774, "kl": 4.08984375, "learning_rate": 5.470894424875062e-07, "loss": 0.138, "reward": 1.908203125, "reward_std": 0.25560563057661057, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2262 }, { "clip_ratio": 0.0, "completion_length": 803.171875, "epoch": 0.9052, "grad_norm": 0.33634317300624184, "kl": 3.85546875, "learning_rate": 5.425436538688322e-07, "loss": 0.1473, "reward": 1.955078125, "reward_std": 0.1923220381140709, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2263 }, { "clip_ratio": 0.0, "completion_length": 858.265625, "epoch": 0.9056, "grad_norm": 3.435478249132522, "kl": 3.65625, "learning_rate": 5.380163030452412e-07, "loss": 0.1491, "reward": 2.091796875, "reward_std": 0.1328125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.982421875, "step": 2264 }, { "clip_ratio": 0.0, "completion_length": 905.5234375, "epoch": 0.906, "grad_norm": 0.1547743822633455, "kl": 3.31640625, "learning_rate": 5.335073988430373e-07, "loss": 0.1241, "reward": 2.0859375, "reward_std": 0.08518769592046738, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2265 }, { "clip_ratio": 0.0, "completion_length": 881.0390625, "epoch": 0.9064, "grad_norm": 0.5861881658521817, "kl": 3.24609375, "learning_rate": 5.290169500525577e-07, "loss": 0.1081, "reward": 2.005859375, "reward_std": 0.3383176773786545, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.974609375, "step": 2266 }, { "clip_ratio": 0.0, "completion_length": 807.484375, "epoch": 0.9068, "grad_norm": 0.37629759242742833, "kl": 3.453125, "learning_rate": 5.245449654281632e-07, "loss": 0.1453, "reward": 2.1640625, "reward_std": 0.2437850758433342, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.984375, "step": 2267 }, { "clip_ratio": 0.0, "completion_length": 834.7734375, "epoch": 0.9072, "grad_norm": 3.0244737747772543, "kl": 3.6484375, "learning_rate": 5.200914536882184e-07, "loss": 0.1478, "reward": 2.111328125, "reward_std": 0.10934244096279144, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.986328125, "step": 2268 }, { "clip_ratio": 0.0, "completion_length": 882.3515625, "epoch": 0.9076, "grad_norm": 0.3819919664442608, "kl": 3.75390625, "learning_rate": 5.156564235150686e-07, "loss": 0.1507, "reward": 1.953125, "reward_std": 0.2198980376124382, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2269 }, { "clip_ratio": 0.0, "completion_length": 718.734375, "epoch": 0.908, "grad_norm": 0.7678252230017186, "kl": 3.6015625, "learning_rate": 5.112398835550348e-07, "loss": 0.1175, "reward": 2.05078125, "reward_std": 0.22777669876813889, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 2270 }, { "clip_ratio": 0.0, "completion_length": 819.4296875, "epoch": 0.9084, "grad_norm": 0.22550386151732787, "kl": 3.16015625, "learning_rate": 5.068418424183874e-07, "loss": 0.1266, "reward": 2.091796875, "reward_std": 0.17430410161614418, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.974609375, "step": 2271 }, { "clip_ratio": 0.0, "completion_length": 930.890625, "epoch": 0.9088, "grad_norm": 0.6009996195487285, "kl": 3.4453125, "learning_rate": 5.024623086793323e-07, "loss": 0.1122, "reward": 1.9296875, "reward_std": 0.2663356475532055, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2272 }, { "clip_ratio": 0.0, "completion_length": 909.046875, "epoch": 0.9092, "grad_norm": 0.2696325952057717, "kl": 3.44140625, "learning_rate": 4.981012908759941e-07, "loss": 0.1402, "reward": 2.060546875, "reward_std": 0.18254609405994415, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2273 }, { "clip_ratio": 0.0, "completion_length": 911.3046875, "epoch": 0.9096, "grad_norm": 0.16845529930684255, "kl": 3.5625, "learning_rate": 4.937587975103997e-07, "loss": 0.1442, "reward": 1.9921875, "reward_std": 0.07011200115084648, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2274 }, { "clip_ratio": 0.0, "completion_length": 905.828125, "epoch": 0.91, "grad_norm": 0.6428399923370236, "kl": 4.0859375, "learning_rate": 4.894348370484648e-07, "loss": 0.1732, "reward": 2.1796875, "reward_std": 0.24659234285354614, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2275 }, { "clip_ratio": 0.0, "completion_length": 959.9609375, "epoch": 0.9104, "grad_norm": 0.31830672418599293, "kl": 3.91796875, "learning_rate": 4.851294179199673e-07, "loss": 0.1409, "reward": 2.03515625, "reward_std": 0.20466843992471695, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2276 }, { "clip_ratio": 0.0, "completion_length": 858.7890625, "epoch": 0.9108, "grad_norm": 0.25935029998688336, "kl": 3.5546875, "learning_rate": 4.808425485185486e-07, "loss": 0.1101, "reward": 2.052734375, "reward_std": 0.38403064757585526, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.974609375, "step": 2277 }, { "clip_ratio": 0.0, "completion_length": 920.0546875, "epoch": 0.9112, "grad_norm": 0.27877663317105084, "kl": 3.51953125, "learning_rate": 4.765742372016735e-07, "loss": 0.1354, "reward": 1.982421875, "reward_std": 0.10657265037298203, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2278 }, { "clip_ratio": 0.0, "completion_length": 955.3828125, "epoch": 0.9116, "grad_norm": 0.5102222573412104, "kl": 3.375, "learning_rate": 4.723244922906356e-07, "loss": 0.1219, "reward": 2.0546875, "reward_std": 0.2651250585913658, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96875, "step": 2279 }, { "clip_ratio": 0.0, "completion_length": 831.5390625, "epoch": 0.912, "grad_norm": 0.26329840644176056, "kl": 2.9921875, "learning_rate": 4.6809332207053083e-07, "loss": 0.1084, "reward": 1.982421875, "reward_std": 0.29792726039886475, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.974609375, "step": 2280 }, { "clip_ratio": 0.0, "completion_length": 867.4453125, "epoch": 0.9124, "grad_norm": 0.37572615554130384, "kl": 3.71875, "learning_rate": 4.638807347902408e-07, "loss": 0.156, "reward": 2.083984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2281 }, { "clip_ratio": 0.0, "completion_length": 845.15625, "epoch": 0.9128, "grad_norm": 0.940337709382221, "kl": 3.6953125, "learning_rate": 4.596867386624215e-07, "loss": 0.1322, "reward": 1.939453125, "reward_std": 0.21028198674321175, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2282 }, { "clip_ratio": 0.0, "completion_length": 793.625, "epoch": 0.9132, "grad_norm": 0.288831365641326, "kl": 4.234375, "learning_rate": 4.5551134186348045e-07, "loss": 0.1552, "reward": 2.0703125, "reward_std": 0.265147902071476, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2283 }, { "clip_ratio": 0.0, "completion_length": 946.4765625, "epoch": 0.9136, "grad_norm": 0.23493760807015968, "kl": 3.515625, "learning_rate": 4.5135455253357053e-07, "loss": 0.1211, "reward": 2.193359375, "reward_std": 0.24618664383888245, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 2284 }, { "clip_ratio": 0.0, "completion_length": 998.234375, "epoch": 0.914, "grad_norm": 0.24754928799794015, "kl": 3.80859375, "learning_rate": 4.4721637877656377e-07, "loss": 0.1633, "reward": 2.068359375, "reward_std": 0.22742006927728653, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.966796875, "step": 2285 }, { "clip_ratio": 0.0, "completion_length": 964.9609375, "epoch": 0.9144, "grad_norm": 0.1970168133896549, "kl": 3.71875, "learning_rate": 4.4309682866004124e-07, "loss": 0.1529, "reward": 1.96875, "reward_std": 0.125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 2286 }, { "clip_ratio": 0.0, "completion_length": 907.171875, "epoch": 0.9148, "grad_norm": 0.265527029697065, "kl": 3.5859375, "learning_rate": 4.3899591021527743e-07, "loss": 0.1431, "reward": 2.27734375, "reward_std": 0.34835124760866165, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 2287 }, { "clip_ratio": 0.0, "completion_length": 970.9296875, "epoch": 0.9152, "grad_norm": 0.43210645131967496, "kl": 3.8984375, "learning_rate": 4.349136314372204e-07, "loss": 0.1563, "reward": 1.982421875, "reward_std": 0.17930787056684494, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.974609375, "step": 2288 }, { "clip_ratio": 0.0, "completion_length": 1012.234375, "epoch": 0.9156, "grad_norm": 0.14493649172604378, "kl": 3.7421875, "learning_rate": 4.308500002844862e-07, "loss": 0.1536, "reward": 2.0703125, "reward_std": 0.21875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2289 }, { "clip_ratio": 0.0, "completion_length": 967.5390625, "epoch": 0.916, "grad_norm": 0.23266357514903746, "kl": 3.7890625, "learning_rate": 4.268050246793276e-07, "loss": 0.1563, "reward": 1.9296875, "reward_std": 0.23104173317551613, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2290 }, { "clip_ratio": 0.0, "completion_length": 826.1015625, "epoch": 0.9164, "grad_norm": 0.31700083449473804, "kl": 3.8515625, "learning_rate": 4.2277871250763327e-07, "loss": 0.1209, "reward": 1.95703125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 2291 }, { "clip_ratio": 0.0, "completion_length": 837.359375, "epoch": 0.9168, "grad_norm": 0.1976034468886886, "kl": 3.15234375, "learning_rate": 4.1877107161890416e-07, "loss": 0.121, "reward": 2.103515625, "reward_std": 0.23021908104419708, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2292 }, { "clip_ratio": 0.0, "completion_length": 882.8046875, "epoch": 0.9172, "grad_norm": 0.2825161255071284, "kl": 3.625, "learning_rate": 4.1478210982624055e-07, "loss": 0.1352, "reward": 2.146484375, "reward_std": 0.2515806332230568, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2293 }, { "clip_ratio": 0.0, "completion_length": 905.921875, "epoch": 0.9176, "grad_norm": 0.4975085555687797, "kl": 3.9140625, "learning_rate": 4.108118349063306e-07, "loss": 0.1459, "reward": 2.115234375, "reward_std": 0.2644001357257366, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.966796875, "step": 2294 }, { "clip_ratio": 0.0, "completion_length": 906.9765625, "epoch": 0.918, "grad_norm": 0.6601072630148933, "kl": 3.82421875, "learning_rate": 4.068602545994249e-07, "loss": 0.1602, "reward": 1.96875, "reward_std": 0.36715494096279144, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2295 }, { "clip_ratio": 0.0, "completion_length": 890.3125, "epoch": 0.9184, "grad_norm": 0.3820879403407482, "kl": 4.59375, "learning_rate": 4.0292737660933335e-07, "loss": 0.1869, "reward": 1.927734375, "reward_std": 0.24672237783670425, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.966796875, "step": 2296 }, { "clip_ratio": 0.0, "completion_length": 883.8984375, "epoch": 0.9188, "grad_norm": 0.2979746156237691, "kl": 3.6953125, "learning_rate": 3.990132086034026e-07, "loss": 0.1576, "reward": 1.9375, "reward_std": 0.25580865889787674, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2297 }, { "clip_ratio": 0.0, "completion_length": 896.34375, "epoch": 0.9192, "grad_norm": 0.39813107946421805, "kl": 3.6953125, "learning_rate": 3.9511775821250206e-07, "loss": 0.1087, "reward": 1.935546875, "reward_std": 0.23219703137874603, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.974609375, "step": 2298 }, { "clip_ratio": 0.0, "completion_length": 806.3203125, "epoch": 0.9196, "grad_norm": 0.2790903708640095, "kl": 3.98828125, "learning_rate": 3.912410330310157e-07, "loss": 0.1381, "reward": 2.12109375, "reward_std": 0.21958077698946, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 2299 }, { "clip_ratio": 0.0, "completion_length": 886.109375, "epoch": 0.92, "grad_norm": 0.14854944742921117, "kl": 3.625, "learning_rate": 3.8738304061681107e-07, "loss": 0.1542, "reward": 1.951171875, "reward_std": 0.1747364103794098, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.974609375, "step": 2300 }, { "clip_ratio": 0.0, "completion_length": 909.96875, "epoch": 0.9204, "grad_norm": 3.544625001157822, "kl": 4.75, "learning_rate": 3.835437884912474e-07, "loss": 0.1854, "reward": 2.24609375, "reward_std": 0.140625, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.98828125, "step": 2301 }, { "clip_ratio": 0.0, "completion_length": 869.6015625, "epoch": 0.9208, "grad_norm": 0.30889471944687963, "kl": 3.6953125, "learning_rate": 3.7972328413914074e-07, "loss": 0.1503, "reward": 1.974609375, "reward_std": 0.21871744096279144, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2302 }, { "clip_ratio": 0.0, "completion_length": 988.5859375, "epoch": 0.9212, "grad_norm": 0.2841672425479913, "kl": 3.69140625, "learning_rate": 3.759215350087619e-07, "loss": 0.1127, "reward": 2.064453125, "reward_std": 0.3946208208799362, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.978515625, "step": 2303 }, { "clip_ratio": 0.0, "completion_length": 934.671875, "epoch": 0.9216, "grad_norm": 0.11520257631405541, "kl": 3.671875, "learning_rate": 3.721385485118123e-07, "loss": 0.1474, "reward": 1.9296875, "reward_std": 0.21107376366853714, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2304 }, { "clip_ratio": 0.0, "completion_length": 876.65625, "epoch": 0.922, "grad_norm": 0.19640380409914834, "kl": 3.671875, "learning_rate": 3.68374332023419e-07, "loss": 0.1476, "reward": 2.08984375, "reward_std": 0.18050824850797653, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.98828125, "step": 2305 }, { "clip_ratio": 0.0, "completion_length": 888.25, "epoch": 0.9224, "grad_norm": 0.8563370162160697, "kl": 4.0859375, "learning_rate": 3.646288928821151e-07, "loss": 0.1687, "reward": 1.98046875, "reward_std": 0.07317390665411949, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.98828125, "step": 2306 }, { "clip_ratio": 0.0, "completion_length": 768.9609375, "epoch": 0.9228, "grad_norm": 0.41248348293961545, "kl": 3.546875, "learning_rate": 3.609022383898242e-07, "loss": 0.1456, "reward": 2.244140625, "reward_std": 0.129750594496727, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 2307 }, { "clip_ratio": 0.0, "completion_length": 898.9921875, "epoch": 0.9232, "grad_norm": 0.2611166247877979, "kl": 4.24609375, "learning_rate": 3.571943758118546e-07, "loss": 0.1743, "reward": 1.984375, "reward_std": 0.23317430913448334, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 2308 }, { "clip_ratio": 0.0, "completion_length": 938.2109375, "epoch": 0.9236, "grad_norm": 0.2059265574107932, "kl": 4.1171875, "learning_rate": 3.5350531237686723e-07, "loss": 0.1601, "reward": 1.955078125, "reward_std": 0.1796875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 2309 }, { "clip_ratio": 0.0, "completion_length": 846.71875, "epoch": 0.924, "grad_norm": 0.23267018106772921, "kl": 3.9609375, "learning_rate": 3.498350552768859e-07, "loss": 0.1537, "reward": 1.9609375, "reward_std": 0.21875, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 2310 }, { "clip_ratio": 0.0, "completion_length": 739.953125, "epoch": 0.9244, "grad_norm": 0.2546316502133943, "kl": 3.49609375, "learning_rate": 3.4618361166726123e-07, "loss": 0.1294, "reward": 2.078125, "reward_std": 0.18440164253115654, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2311 }, { "clip_ratio": 0.0, "completion_length": 909.15625, "epoch": 0.9248, "grad_norm": 0.13971114676719865, "kl": 3.64453125, "learning_rate": 3.4255098866667114e-07, "loss": 0.1398, "reward": 2.087890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2312 }, { "clip_ratio": 0.0, "completion_length": 858.734375, "epoch": 0.9252, "grad_norm": 0.2875045846408489, "kl": 2.95703125, "learning_rate": 3.3893719335709953e-07, "loss": 0.1128, "reward": 1.970703125, "reward_std": 0.25355130061507225, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2313 }, { "clip_ratio": 0.0, "completion_length": 959.6015625, "epoch": 0.9256, "grad_norm": 0.23419726591536574, "kl": 4.38671875, "learning_rate": 3.3534223278382405e-07, "loss": 0.1805, "reward": 1.984375, "reward_std": 0.125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2314 }, { "clip_ratio": 0.0, "completion_length": 975.5546875, "epoch": 0.926, "grad_norm": 0.3150181713691792, "kl": 4.13671875, "learning_rate": 3.3176611395540625e-07, "loss": 0.1692, "reward": 1.943359375, "reward_std": 0.3589552864432335, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 2315 }, { "clip_ratio": 0.0, "completion_length": 942.53125, "epoch": 0.9264, "grad_norm": 0.5185506205092055, "kl": 4.08984375, "learning_rate": 3.282088438436715e-07, "loss": 0.1533, "reward": 2.0078125, "reward_std": 0.335409052670002, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2316 }, { "clip_ratio": 0.0, "completion_length": 827.7265625, "epoch": 0.9268, "grad_norm": 0.3090746987684536, "kl": 3.94921875, "learning_rate": 3.246704293837011e-07, "loss": 0.1701, "reward": 2.162109375, "reward_std": 0.22324342280626297, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2317 }, { "clip_ratio": 0.0, "completion_length": 684.734375, "epoch": 0.9272, "grad_norm": 0.3273977886609215, "kl": 3.671875, "learning_rate": 3.211508774738137e-07, "loss": 0.1498, "reward": 1.99609375, "reward_std": 0.1231590062379837, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2318 }, { "clip_ratio": 0.0, "completion_length": 891.5234375, "epoch": 0.9276, "grad_norm": 0.28449666100693854, "kl": 4.5, "learning_rate": 3.1765019497555617e-07, "loss": 0.1553, "reward": 1.916015625, "reward_std": 0.33895429223775864, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 2319 }, { "clip_ratio": 0.0, "completion_length": 823.3359375, "epoch": 0.928, "grad_norm": 0.45045412596054457, "kl": 3.5234375, "learning_rate": 3.1416838871368925e-07, "loss": 0.1405, "reward": 2.09765625, "reward_std": 0.171875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2320 }, { "clip_ratio": 0.0, "completion_length": 878.59375, "epoch": 0.9284, "grad_norm": 0.2673416581920884, "kl": 3.59375, "learning_rate": 3.10705465476171e-07, "loss": 0.1585, "reward": 2.0234375, "reward_std": 0.2890164405107498, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.96875, "step": 2321 }, { "clip_ratio": 0.0, "completion_length": 899.65625, "epoch": 0.9288, "grad_norm": 0.20311309543971567, "kl": 4.0, "learning_rate": 3.072614320141487e-07, "loss": 0.166, "reward": 1.990234375, "reward_std": 0.22435061633586884, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2322 }, { "clip_ratio": 0.0, "completion_length": 816.9765625, "epoch": 0.9292, "grad_norm": 0.21850691537589878, "kl": 3.68359375, "learning_rate": 3.0383629504194047e-07, "loss": 0.1517, "reward": 2.078125, "reward_std": 0.14789125323295593, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 2323 }, { "clip_ratio": 0.0, "completion_length": 732.15625, "epoch": 0.9296, "grad_norm": 0.17327825590372026, "kl": 3.67578125, "learning_rate": 3.00430061237027e-07, "loss": 0.1494, "reward": 1.970703125, "reward_std": 0.0802573561668396, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 2324 }, { "clip_ratio": 0.0, "completion_length": 868.9609375, "epoch": 0.93, "grad_norm": 0.17626577367468482, "kl": 4.1484375, "learning_rate": 2.970427372400353e-07, "loss": 0.1675, "reward": 2.103515625, "reward_std": 0.2288404181599617, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.978515625, "step": 2325 }, { "clip_ratio": 0.0, "completion_length": 884.3359375, "epoch": 0.9304, "grad_norm": 0.4410269243247377, "kl": 3.5390625, "learning_rate": 2.936743296547273e-07, "loss": 0.138, "reward": 2.044921875, "reward_std": 0.2512141987681389, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 2326 }, { "clip_ratio": 0.0, "completion_length": 830.90625, "epoch": 0.9308, "grad_norm": 0.2187502324115811, "kl": 3.2421875, "learning_rate": 2.9032484504798454e-07, "loss": 0.1261, "reward": 2.03515625, "reward_std": 0.2663841247558594, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2327 }, { "clip_ratio": 0.0, "completion_length": 976.5703125, "epoch": 0.9312, "grad_norm": 0.25295832497769616, "kl": 3.72265625, "learning_rate": 2.8699428994980017e-07, "loss": 0.1507, "reward": 1.916015625, "reward_std": 0.30898308008909225, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 2328 }, { "clip_ratio": 0.0, "completion_length": 919.1484375, "epoch": 0.9316, "grad_norm": 0.19472012076776996, "kl": 3.375, "learning_rate": 2.836826708532603e-07, "loss": 0.1546, "reward": 1.984375, "reward_std": 0.16633247584104538, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 2329 }, { "clip_ratio": 0.0, "completion_length": 861.109375, "epoch": 0.932, "grad_norm": 3.199080503111455, "kl": 4.27734375, "learning_rate": 2.8038999421453827e-07, "loss": 0.1776, "reward": 2.072265625, "reward_std": 0.13579118251800537, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2330 }, { "clip_ratio": 0.0, "completion_length": 869.59375, "epoch": 0.9324, "grad_norm": 0.3593545489219739, "kl": 3.6171875, "learning_rate": 2.771162664528726e-07, "loss": 0.1389, "reward": 2.02734375, "reward_std": 0.1324067935347557, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2331 }, { "clip_ratio": 0.0, "completion_length": 857.2890625, "epoch": 0.9328, "grad_norm": 0.49028657577144935, "kl": 3.4375, "learning_rate": 2.7386149395056463e-07, "loss": 0.1303, "reward": 2.109375, "reward_std": 0.37416934967041016, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 2332 }, { "clip_ratio": 0.0, "completion_length": 826.953125, "epoch": 0.9332, "grad_norm": 0.682107221013048, "kl": 4.00390625, "learning_rate": 2.7062568305295967e-07, "loss": 0.1547, "reward": 1.978515625, "reward_std": 0.17592863366007805, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2333 }, { "clip_ratio": 0.0, "completion_length": 772.5625, "epoch": 0.9336, "grad_norm": 0.31882205914734185, "kl": 3.3671875, "learning_rate": 2.6740884006843826e-07, "loss": 0.1425, "reward": 2.00390625, "reward_std": 0.31086357682943344, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.97265625, "step": 2334 }, { "clip_ratio": 0.0, "completion_length": 817.7890625, "epoch": 0.934, "grad_norm": 0.6111715111208388, "kl": 3.57421875, "learning_rate": 2.6421097126839714e-07, "loss": 0.1461, "reward": 2.23046875, "reward_std": 0.12054429948329926, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2335 }, { "clip_ratio": 0.0, "completion_length": 809.640625, "epoch": 0.9344, "grad_norm": 0.1970139292689625, "kl": 3.65234375, "learning_rate": 2.6103208288724815e-07, "loss": 0.1495, "reward": 2.13671875, "reward_std": 0.2009018287062645, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2336 }, { "clip_ratio": 0.0, "completion_length": 886.453125, "epoch": 0.9348, "grad_norm": 0.264774089692905, "kl": 3.64453125, "learning_rate": 2.57872181122395e-07, "loss": 0.1455, "reward": 2.05859375, "reward_std": 0.2555532157421112, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2337 }, { "clip_ratio": 0.0, "completion_length": 838.2109375, "epoch": 0.9352, "grad_norm": 0.41290880835004057, "kl": 4.0703125, "learning_rate": 2.547312721342277e-07, "loss": 0.1694, "reward": 1.947265625, "reward_std": 0.2145351693034172, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2338 }, { "clip_ratio": 0.0, "completion_length": 852.390625, "epoch": 0.9356, "grad_norm": 0.3116935766229399, "kl": 3.453125, "learning_rate": 2.516093620461124e-07, "loss": 0.1439, "reward": 1.9453125, "reward_std": 0.17592502385377884, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2339 }, { "clip_ratio": 0.0, "completion_length": 908.703125, "epoch": 0.936, "grad_norm": 0.20198022310093178, "kl": 4.4765625, "learning_rate": 2.4850645694436736e-07, "loss": 0.1834, "reward": 2.0, "reward_std": 0.28920209407806396, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 2340 }, { "clip_ratio": 0.0, "completion_length": 742.203125, "epoch": 0.9364, "grad_norm": 0.1170536748454832, "kl": 3.71484375, "learning_rate": 2.4542256287826915e-07, "loss": 0.1542, "reward": 2.119140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2341 }, { "clip_ratio": 0.0, "completion_length": 933.65625, "epoch": 0.9368, "grad_norm": 0.22854600765556052, "kl": 4.3046875, "learning_rate": 2.423576858600252e-07, "loss": 0.1684, "reward": 1.9453125, "reward_std": 0.22767701745033264, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2342 }, { "clip_ratio": 0.0, "completion_length": 868.4765625, "epoch": 0.9372, "grad_norm": 0.15525225530276565, "kl": 3.13671875, "learning_rate": 2.3931183186477026e-07, "loss": 0.1289, "reward": 2.08203125, "reward_std": 0.1417730376124382, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2343 }, { "clip_ratio": 0.0, "completion_length": 791.109375, "epoch": 0.9376, "grad_norm": 0.3712538635115468, "kl": 3.78515625, "learning_rate": 2.3628500683055222e-07, "loss": 0.1442, "reward": 1.958984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2344 }, { "clip_ratio": 0.0, "completion_length": 866.2109375, "epoch": 0.938, "grad_norm": 1.552532223520603, "kl": 3.65625, "learning_rate": 2.332772166583208e-07, "loss": 0.1483, "reward": 2.001953125, "reward_std": 0.041806600987911224, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.994140625, "step": 2345 }, { "clip_ratio": 0.0, "completion_length": 819.3046875, "epoch": 0.9384, "grad_norm": 0.14753948542470985, "kl": 3.41015625, "learning_rate": 2.3028846721191878e-07, "loss": 0.1329, "reward": 1.958984375, "reward_std": 0.15621744096279144, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2346 }, { "clip_ratio": 0.0, "completion_length": 815.734375, "epoch": 0.9388, "grad_norm": 0.3041449357570263, "kl": 3.91015625, "learning_rate": 2.273187643180652e-07, "loss": 0.1503, "reward": 1.943359375, "reward_std": 0.21900955587625504, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 2347 }, { "clip_ratio": 0.0, "completion_length": 857.4921875, "epoch": 0.9392, "grad_norm": 0.38520372046407014, "kl": 4.03515625, "learning_rate": 2.2436811376634893e-07, "loss": 0.1693, "reward": 2.064453125, "reward_std": 0.26863158121705055, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2348 }, { "clip_ratio": 0.0, "completion_length": 913.4609375, "epoch": 0.9396, "grad_norm": 0.22640498650901486, "kl": 3.97265625, "learning_rate": 2.214365213092118e-07, "loss": 0.1583, "reward": 1.94921875, "reward_std": 0.1744270622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2349 }, { "clip_ratio": 0.0, "completion_length": 811.6328125, "epoch": 0.94, "grad_norm": 0.10881438237363558, "kl": 3.7734375, "learning_rate": 2.1852399266194312e-07, "loss": 0.1513, "reward": 2.0, "reward_std": 0.10519562661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2350 }, { "clip_ratio": 0.0, "completion_length": 734.53125, "epoch": 0.9404, "grad_norm": 0.13727667037491773, "kl": 3.296875, "learning_rate": 2.1563053350266983e-07, "loss": 0.1355, "reward": 2.251953125, "reward_std": 0.09738312661647797, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2351 }, { "clip_ratio": 0.0, "completion_length": 811.09375, "epoch": 0.9408, "grad_norm": 0.19009553593615544, "kl": 3.61328125, "learning_rate": 2.1275614947233624e-07, "loss": 0.1465, "reward": 1.974609375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 2352 }, { "clip_ratio": 0.0, "completion_length": 726.6640625, "epoch": 0.9412, "grad_norm": 0.1750023678036076, "kl": 3.34375, "learning_rate": 2.0990084617470207e-07, "loss": 0.1362, "reward": 2.009765625, "reward_std": 0.12863312661647797, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2353 }, { "clip_ratio": 0.0, "completion_length": 817.921875, "epoch": 0.9416, "grad_norm": 0.4757589579715379, "kl": 3.578125, "learning_rate": 2.0706462917632676e-07, "loss": 0.1477, "reward": 1.98828125, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.98828125, "step": 2354 }, { "clip_ratio": 0.0, "completion_length": 740.40625, "epoch": 0.942, "grad_norm": 0.24110823501079254, "kl": 3.6640625, "learning_rate": 2.0424750400655947e-07, "loss": 0.143, "reward": 2.103515625, "reward_std": 0.15734179317951202, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2355 }, { "clip_ratio": 0.0, "completion_length": 733.7734375, "epoch": 0.9424, "grad_norm": 0.7386218610650209, "kl": 3.65234375, "learning_rate": 2.014494761575314e-07, "loss": 0.1519, "reward": 2.0625, "reward_std": 0.2008708119392395, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 2356 }, { "clip_ratio": 0.0, "completion_length": 862.0859375, "epoch": 0.9428, "grad_norm": 0.5001997804609812, "kl": 3.94921875, "learning_rate": 1.9867055108414023e-07, "loss": 0.1649, "reward": 2.115234375, "reward_std": 0.314302995800972, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2357 }, { "clip_ratio": 0.0, "completion_length": 792.6953125, "epoch": 0.9432, "grad_norm": 0.15687553581380698, "kl": 3.6640625, "learning_rate": 1.9591073420404338e-07, "loss": 0.1452, "reward": 1.96484375, "reward_std": 0.1105230376124382, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 2358 }, { "clip_ratio": 0.0, "completion_length": 822.875, "epoch": 0.9436, "grad_norm": 0.07705311142615538, "kl": 3.90625, "learning_rate": 1.9317003089764365e-07, "loss": 0.1602, "reward": 2.103515625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 2359 }, { "clip_ratio": 0.0, "completion_length": 857.6953125, "epoch": 0.944, "grad_norm": 0.3910072213525739, "kl": 3.4375, "learning_rate": 1.9044844650808468e-07, "loss": 0.1465, "reward": 2.15234375, "reward_std": 0.25118163973093033, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2360 }, { "clip_ratio": 0.0, "completion_length": 836.90625, "epoch": 0.9444, "grad_norm": 0.25174434670026713, "kl": 3.765625, "learning_rate": 1.877459863412323e-07, "loss": 0.1359, "reward": 2.04296875, "reward_std": 0.2721805050969124, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2361 }, { "clip_ratio": 0.0, "completion_length": 792.5, "epoch": 0.9448, "grad_norm": 0.4067720339835842, "kl": 3.3203125, "learning_rate": 1.8506265566567095e-07, "loss": 0.1226, "reward": 2.119140625, "reward_std": 0.2913777679204941, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 2362 }, { "clip_ratio": 0.0, "completion_length": 907.40625, "epoch": 0.9452, "grad_norm": 0.3054182791510881, "kl": 3.7734375, "learning_rate": 1.8239845971269266e-07, "loss": 0.1508, "reward": 2.01171875, "reward_std": 0.3382126912474632, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 2363 }, { "clip_ratio": 0.0, "completion_length": 857.5859375, "epoch": 0.9456, "grad_norm": 0.3908610794306416, "kl": 4.04296875, "learning_rate": 1.7975340367628269e-07, "loss": 0.151, "reward": 2.033203125, "reward_std": 0.12661828845739365, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 2364 }, { "clip_ratio": 0.0, "completion_length": 971.5546875, "epoch": 0.946, "grad_norm": 0.45481132738412855, "kl": 4.0078125, "learning_rate": 1.7712749271311392e-07, "loss": 0.1658, "reward": 1.96875, "reward_std": 0.21593307703733444, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2365 }, { "clip_ratio": 0.0, "completion_length": 731.5859375, "epoch": 0.9464, "grad_norm": 0.17073277131250436, "kl": 3.34765625, "learning_rate": 1.7452073194253237e-07, "loss": 0.1384, "reward": 1.966796875, "reward_std": 0.1552298665046692, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2366 }, { "clip_ratio": 0.0, "completion_length": 838.859375, "epoch": 0.9468, "grad_norm": 0.23307081029041188, "kl": 3.3515625, "learning_rate": 1.719331264465529e-07, "loss": 0.1322, "reward": 1.994140625, "reward_std": 0.23494622111320496, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2367 }, { "clip_ratio": 0.0, "completion_length": 766.484375, "epoch": 0.9472, "grad_norm": 0.14835411050668687, "kl": 3.7578125, "learning_rate": 1.6936468126984573e-07, "loss": 0.1538, "reward": 1.9921875, "reward_std": 0.0874006412923336, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2368 }, { "clip_ratio": 0.0, "completion_length": 851.5625, "epoch": 0.9476, "grad_norm": 0.2116466281288878, "kl": 3.70703125, "learning_rate": 1.668154014197243e-07, "loss": 0.1308, "reward": 2.068359375, "reward_std": 0.20180703699588776, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 2369 }, { "clip_ratio": 0.0, "completion_length": 701.1328125, "epoch": 0.948, "grad_norm": 0.31736447494184467, "kl": 3.4921875, "learning_rate": 1.6428529186614195e-07, "loss": 0.1444, "reward": 2.109375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2370 }, { "clip_ratio": 0.0, "completion_length": 702.59375, "epoch": 0.9484, "grad_norm": 0.23732101270373884, "kl": 3.2421875, "learning_rate": 1.6177435754167413e-07, "loss": 0.1194, "reward": 1.951171875, "reward_std": 0.1365126073360443, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2371 }, { "clip_ratio": 0.0, "completion_length": 836.8203125, "epoch": 0.9488, "grad_norm": 0.5182162815951978, "kl": 4.00390625, "learning_rate": 1.5928260334151847e-07, "loss": 0.154, "reward": 2.052734375, "reward_std": 0.3543730527162552, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 2372 }, { "clip_ratio": 0.0, "completion_length": 860.640625, "epoch": 0.9492, "grad_norm": 0.2831782499640372, "kl": 3.08203125, "learning_rate": 1.5681003412347573e-07, "loss": 0.1268, "reward": 1.984375, "reward_std": 0.2066391110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2373 }, { "clip_ratio": 0.0, "completion_length": 762.1796875, "epoch": 0.9496, "grad_norm": 0.3163538922535952, "kl": 3.59765625, "learning_rate": 1.543566547079467e-07, "loss": 0.1476, "reward": 2.01171875, "reward_std": 0.19468443095684052, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2374 }, { "clip_ratio": 0.0, "completion_length": 841.2265625, "epoch": 0.95, "grad_norm": 0.26572450669854286, "kl": 3.5078125, "learning_rate": 1.519224698779198e-07, "loss": 0.142, "reward": 1.943359375, "reward_std": 0.25529052317142487, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.974609375, "step": 2375 }, { "clip_ratio": 0.0, "completion_length": 857.46875, "epoch": 0.9504, "grad_norm": 0.35275867041088765, "kl": 4.21875, "learning_rate": 1.4950748437896235e-07, "loss": 0.1782, "reward": 2.033203125, "reward_std": 0.25485505163669586, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2376 }, { "clip_ratio": 0.0, "completion_length": 769.6875, "epoch": 0.9508, "grad_norm": 0.2609616699063701, "kl": 3.6171875, "learning_rate": 1.4711170291921485e-07, "loss": 0.1417, "reward": 1.939453125, "reward_std": 0.1765594184398651, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2377 }, { "clip_ratio": 0.0, "completion_length": 763.046875, "epoch": 0.9512, "grad_norm": 0.2493724038667667, "kl": 3.3046875, "learning_rate": 1.4473513016937223e-07, "loss": 0.143, "reward": 2.0, "reward_std": 0.18414128571748734, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2378 }, { "clip_ratio": 0.0, "completion_length": 819.2890625, "epoch": 0.9516, "grad_norm": 0.3867366895803273, "kl": 3.6796875, "learning_rate": 1.4237777076268723e-07, "loss": 0.1478, "reward": 2.01953125, "reward_std": 0.08957062661647797, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.99609375, "step": 2379 }, { "clip_ratio": 0.0, "completion_length": 990.796875, "epoch": 0.952, "grad_norm": 0.5376820469412574, "kl": 4.3671875, "learning_rate": 1.400396292949513e-07, "loss": 0.1722, "reward": 1.9453125, "reward_std": 0.47923339903354645, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 2380 }, { "clip_ratio": 0.0, "completion_length": 837.7265625, "epoch": 0.9524, "grad_norm": 0.1296347827683048, "kl": 3.56640625, "learning_rate": 1.377207103244904e-07, "loss": 0.1492, "reward": 1.982421875, "reward_std": 0.06246744096279144, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.990234375, "step": 2381 }, { "clip_ratio": 0.0, "completion_length": 831.5390625, "epoch": 0.9528, "grad_norm": 0.3422846651869927, "kl": 4.3828125, "learning_rate": 1.3542101837215826e-07, "loss": 0.1805, "reward": 1.966796875, "reward_std": 0.1328125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.982421875, "step": 2382 }, { "clip_ratio": 0.0, "completion_length": 800.96875, "epoch": 0.9532, "grad_norm": 0.8397905860121281, "kl": 3.4453125, "learning_rate": 1.3314055792131964e-07, "loss": 0.1311, "reward": 2.0625, "reward_std": 0.21608919650316238, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9921875, "step": 2383 }, { "clip_ratio": 0.0, "completion_length": 856.25, "epoch": 0.9536, "grad_norm": 0.38092256211465, "kl": 3.3046875, "learning_rate": 1.308793334178493e-07, "loss": 0.136, "reward": 2.078125, "reward_std": 0.2884472645819187, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2384 }, { "clip_ratio": 0.0, "completion_length": 826.7421875, "epoch": 0.954, "grad_norm": 0.5561021049157311, "kl": 3.2578125, "learning_rate": 1.2863734927012094e-07, "loss": 0.1382, "reward": 1.958984375, "reward_std": 0.1258341670036316, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2385 }, { "clip_ratio": 0.0, "completion_length": 883.078125, "epoch": 0.9544, "grad_norm": 0.13624190971330122, "kl": 3.56640625, "learning_rate": 1.26414609848996e-07, "loss": 0.1475, "reward": 2.1015625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2386 }, { "clip_ratio": 0.0, "completion_length": 898.421875, "epoch": 0.9548, "grad_norm": 0.3974345273454261, "kl": 3.8828125, "learning_rate": 1.242111194878215e-07, "loss": 0.1619, "reward": 1.998046875, "reward_std": 0.332603394985199, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.966796875, "step": 2387 }, { "clip_ratio": 0.0, "completion_length": 763.1640625, "epoch": 0.9552, "grad_norm": 0.3883508599766611, "kl": 3.5703125, "learning_rate": 1.2202688248241113e-07, "loss": 0.087, "reward": 1.908203125, "reward_std": 0.3083875998854637, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.970703125, "step": 2388 }, { "clip_ratio": 0.0, "completion_length": 905.46875, "epoch": 0.9556, "grad_norm": 0.6813511923909908, "kl": 3.8359375, "learning_rate": 1.1986190309104861e-07, "loss": 0.1551, "reward": 2.28125, "reward_std": 0.3350592888891697, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2389 }, { "clip_ratio": 0.0, "completion_length": 969.3515625, "epoch": 0.956, "grad_norm": 0.22078292817851702, "kl": 4.37109375, "learning_rate": 1.1771618553447217e-07, "loss": 0.1781, "reward": 2.001953125, "reward_std": 0.31545013934373856, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.970703125, "step": 2390 }, { "clip_ratio": 0.0, "completion_length": 760.703125, "epoch": 0.9564, "grad_norm": 0.2566377302527935, "kl": 3.27734375, "learning_rate": 1.1558973399586671e-07, "loss": 0.1343, "reward": 2.11328125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2391 }, { "clip_ratio": 0.0, "completion_length": 770.0625, "epoch": 0.9568, "grad_norm": 0.4737363239817313, "kl": 3.69921875, "learning_rate": 1.134825526208605e-07, "loss": 0.1573, "reward": 2.150390625, "reward_std": 0.15786828845739365, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 2392 }, { "clip_ratio": 0.0, "completion_length": 810.7734375, "epoch": 0.9572, "grad_norm": 0.14550909739876355, "kl": 3.9921875, "learning_rate": 1.1139464551750857e-07, "loss": 0.1692, "reward": 2.2109375, "reward_std": 0.14265555515885353, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2393 }, { "clip_ratio": 0.0, "completion_length": 784.8359375, "epoch": 0.9576, "grad_norm": 0.2758226740136849, "kl": 3.51953125, "learning_rate": 1.0932601675629595e-07, "loss": 0.1475, "reward": 2.091796875, "reward_std": 0.1547744981944561, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.990234375, "step": 2394 }, { "clip_ratio": 0.0, "completion_length": 755.8828125, "epoch": 0.958, "grad_norm": 0.23779632061659398, "kl": 3.671875, "learning_rate": 1.0727667037011668e-07, "loss": 0.1593, "reward": 2.0, "reward_std": 0.16769562661647797, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2395 }, { "clip_ratio": 0.0, "completion_length": 799.6796875, "epoch": 0.9584, "grad_norm": 0.11456934189349542, "kl": 3.453125, "learning_rate": 1.052466103542793e-07, "loss": 0.136, "reward": 2.041015625, "reward_std": 0.11873093992471695, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2396 }, { "clip_ratio": 0.0, "completion_length": 833.3515625, "epoch": 0.9588, "grad_norm": 0.2063086679817483, "kl": 3.23828125, "learning_rate": 1.0323584066648795e-07, "loss": 0.1302, "reward": 2.0625, "reward_std": 0.23224534839391708, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2397 }, { "clip_ratio": 0.0, "completion_length": 900.5859375, "epoch": 0.9592, "grad_norm": 0.4201423503089152, "kl": 4.078125, "learning_rate": 1.0124436522684244e-07, "loss": 0.1723, "reward": 2.125, "reward_std": 0.429037481546402, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9609375, "step": 2398 }, { "clip_ratio": 0.0, "completion_length": 866.0078125, "epoch": 0.9596, "grad_norm": 0.35885149532337335, "kl": 3.73828125, "learning_rate": 9.9272187917826e-08, "loss": 0.1464, "reward": 2.06640625, "reward_std": 0.234375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.98046875, "step": 2399 }, { "clip_ratio": 0.0, "completion_length": 830.078125, "epoch": 0.96, "grad_norm": 0.29414683679721265, "kl": 3.9765625, "learning_rate": 9.731931258429638e-08, "loss": 0.1609, "reward": 1.974609375, "reward_std": 0.20675812661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2400 }, { "clip_ratio": 0.0, "completion_length": 832.3203125, "epoch": 0.9604, "grad_norm": 0.3624867989780257, "kl": 3.515625, "learning_rate": 9.538574303348813e-08, "loss": 0.1362, "reward": 1.921875, "reward_std": 0.28006206452846527, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2401 }, { "clip_ratio": 0.0, "completion_length": 870.40625, "epoch": 0.9608, "grad_norm": 0.6150978909080456, "kl": 3.88671875, "learning_rate": 9.347148303499143e-08, "loss": 0.1665, "reward": 1.962890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2402 }, { "clip_ratio": 0.0, "completion_length": 903.3515625, "epoch": 0.9612, "grad_norm": 0.694310431732144, "kl": 3.97265625, "learning_rate": 9.157653632075435e-08, "loss": 0.1363, "reward": 1.935546875, "reward_std": 0.257522389292717, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.974609375, "step": 2403 }, { "clip_ratio": 0.0, "completion_length": 820.2109375, "epoch": 0.9616, "grad_norm": 0.3564267649380876, "kl": 3.91796875, "learning_rate": 8.970090658507291e-08, "loss": 0.1577, "reward": 2.09375, "reward_std": 0.20949089527130127, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2404 }, { "clip_ratio": 0.0, "completion_length": 800.375, "epoch": 0.962, "grad_norm": 0.4021421876069585, "kl": 3.71484375, "learning_rate": 8.784459748458318e-08, "loss": 0.1361, "reward": 1.912109375, "reward_std": 0.25632865726947784, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2405 }, { "clip_ratio": 0.0, "completion_length": 905.5546875, "epoch": 0.9624, "grad_norm": 0.980346709825462, "kl": 3.8671875, "learning_rate": 8.600761263825475e-08, "loss": 0.1578, "reward": 1.97265625, "reward_std": 0.1669239066541195, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98046875, "step": 2406 }, { "clip_ratio": 0.0, "completion_length": 794.7109375, "epoch": 0.9628, "grad_norm": 0.17405108574754566, "kl": 3.44921875, "learning_rate": 8.418995562738286e-08, "loss": 0.1421, "reward": 2.072265625, "reward_std": 0.13663385808467865, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2407 }, { "clip_ratio": 0.0, "completion_length": 851.609375, "epoch": 0.9632, "grad_norm": 0.5376754912398211, "kl": 4.0546875, "learning_rate": 8.239162999558403e-08, "loss": 0.164, "reward": 2.19921875, "reward_std": 0.203125, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2408 }, { "clip_ratio": 0.0, "completion_length": 821.421875, "epoch": 0.9636, "grad_norm": 0.24348950072644396, "kl": 3.5859375, "learning_rate": 8.061263924878604e-08, "loss": 0.1454, "reward": 2.0078125, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2409 }, { "clip_ratio": 0.0, "completion_length": 766.625, "epoch": 0.964, "grad_norm": 0.3853425642561558, "kl": 3.37890625, "learning_rate": 7.885298685522235e-08, "loss": 0.1444, "reward": 2.197265625, "reward_std": 0.2109375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2410 }, { "clip_ratio": 0.0, "completion_length": 889.4375, "epoch": 0.9644, "grad_norm": 0.1771250914413782, "kl": 3.625, "learning_rate": 7.71126762454233e-08, "loss": 0.1494, "reward": 1.9921875, "reward_std": 0.2688393145799637, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.96875, "step": 2411 }, { "clip_ratio": 0.0, "completion_length": 763.625, "epoch": 0.9648, "grad_norm": 0.24967355411682807, "kl": 3.390625, "learning_rate": 7.539171081221597e-08, "loss": 0.1399, "reward": 2.125, "reward_std": 0.10519562661647797, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2412 }, { "clip_ratio": 0.0, "completion_length": 774.640625, "epoch": 0.9652, "grad_norm": 0.5791216718425062, "kl": 3.3828125, "learning_rate": 7.369009391070992e-08, "loss": 0.1393, "reward": 2.23046875, "reward_std": 0.13995973765850067, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2413 }, { "clip_ratio": 0.0, "completion_length": 882.453125, "epoch": 0.9656, "grad_norm": 0.19523145733632516, "kl": 3.45703125, "learning_rate": 7.200782885829482e-08, "loss": 0.1448, "reward": 1.9609375, "reward_std": 0.18910211324691772, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 2414 }, { "clip_ratio": 0.0, "completion_length": 717.5390625, "epoch": 0.966, "grad_norm": 0.19626982419780825, "kl": 3.5, "learning_rate": 7.034491893463059e-08, "loss": 0.1418, "reward": 2.115234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 2415 }, { "clip_ratio": 0.0, "completion_length": 839.9296875, "epoch": 0.9664, "grad_norm": 0.13158513358090268, "kl": 3.6953125, "learning_rate": 6.870136738164612e-08, "loss": 0.1625, "reward": 2.005859375, "reward_std": 0.12551628798246384, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2416 }, { "clip_ratio": 0.0, "completion_length": 926.6328125, "epoch": 0.9668, "grad_norm": 0.6374737458890841, "kl": 4.13671875, "learning_rate": 6.707717740353059e-08, "loss": 0.1636, "reward": 1.869140625, "reward_std": 0.43246573954820633, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 2417 }, { "clip_ratio": 0.0, "completion_length": 832.9375, "epoch": 0.9672, "grad_norm": 0.38594330402774857, "kl": 3.93359375, "learning_rate": 6.547235216672443e-08, "loss": 0.1455, "reward": 2.0625, "reward_std": 0.19579888880252838, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.984375, "step": 2418 }, { "clip_ratio": 0.0, "completion_length": 892.125, "epoch": 0.9676, "grad_norm": 0.24699674589636866, "kl": 3.921875, "learning_rate": 6.388689479991606e-08, "loss": 0.1627, "reward": 2.140625, "reward_std": 0.16928968206048012, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 2419 }, { "clip_ratio": 0.0, "completion_length": 847.2734375, "epoch": 0.968, "grad_norm": 0.22584088059304078, "kl": 3.796875, "learning_rate": 6.232080839403631e-08, "loss": 0.1535, "reward": 2.080078125, "reward_std": 0.1509895622730255, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 2420 }, { "clip_ratio": 0.0, "completion_length": 720.0078125, "epoch": 0.9684, "grad_norm": 0.44830426603066936, "kl": 3.47265625, "learning_rate": 6.07740960022507e-08, "loss": 0.1415, "reward": 1.96875, "reward_std": 0.19326548278331757, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 2421 }, { "clip_ratio": 0.0, "completion_length": 758.1171875, "epoch": 0.9688, "grad_norm": 0.2023230063817668, "kl": 3.48828125, "learning_rate": 5.9246760639953824e-08, "loss": 0.1403, "reward": 2.109375, "reward_std": 0.18683473765850067, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2422 }, { "clip_ratio": 0.0, "completion_length": 774.8203125, "epoch": 0.9692, "grad_norm": 0.3987089715546846, "kl": 3.91015625, "learning_rate": 5.7738805284764945e-08, "loss": 0.1582, "reward": 2.048828125, "reward_std": 0.18173722177743912, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 2423 }, { "clip_ratio": 0.0, "completion_length": 834.6484375, "epoch": 0.9696, "grad_norm": 0.5681252044107009, "kl": 3.76953125, "learning_rate": 5.625023287652021e-08, "loss": 0.1292, "reward": 1.93359375, "reward_std": 0.2712097689509392, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2424 }, { "clip_ratio": 0.0, "completion_length": 837.875, "epoch": 0.97, "grad_norm": 0.31893002371371526, "kl": 3.5703125, "learning_rate": 5.4781046317267103e-08, "loss": 0.1746, "reward": 1.9296875, "reward_std": 0.34375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2425 }, { "clip_ratio": 0.0, "completion_length": 746.9375, "epoch": 0.9704, "grad_norm": 0.24491498741093473, "kl": 3.53125, "learning_rate": 5.3331248471258926e-08, "loss": 0.1457, "reward": 1.95703125, "reward_std": 0.1974448561668396, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2426 }, { "clip_ratio": 0.0, "completion_length": 818.7421875, "epoch": 0.9708, "grad_norm": 0.10807360850692564, "kl": 3.41796875, "learning_rate": 5.190084216495361e-08, "loss": 0.1464, "reward": 1.982421875, "reward_std": 0.0703125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.990234375, "step": 2427 }, { "clip_ratio": 0.0, "completion_length": 880.3515625, "epoch": 0.9712, "grad_norm": 0.2638303192457103, "kl": 3.71875, "learning_rate": 5.048983018699827e-08, "loss": 0.1535, "reward": 2.07421875, "reward_std": 0.20822912454605103, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.98046875, "step": 2428 }, { "clip_ratio": 0.0, "completion_length": 830.46875, "epoch": 0.9716, "grad_norm": 0.5072577769086344, "kl": 3.421875, "learning_rate": 4.9098215288235776e-08, "loss": 0.1383, "reward": 1.98828125, "reward_std": 0.131154403090477, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2429 }, { "clip_ratio": 0.0, "completion_length": 830.890625, "epoch": 0.972, "grad_norm": 0.16787895289152385, "kl": 3.28125, "learning_rate": 4.772600018168816e-08, "loss": 0.1256, "reward": 2.09765625, "reward_std": 0.12483987957239151, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 2430 }, { "clip_ratio": 0.0, "completion_length": 819.640625, "epoch": 0.9724, "grad_norm": 0.38587628206414826, "kl": 3.6875, "learning_rate": 4.6373187542561036e-08, "loss": 0.1527, "reward": 2.111328125, "reward_std": 0.2800048664212227, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 2431 }, { "clip_ratio": 0.0, "completion_length": 741.7109375, "epoch": 0.9728, "grad_norm": 0.5919040603000876, "kl": 3.4140625, "learning_rate": 4.503978000823028e-08, "loss": 0.1261, "reward": 1.986328125, "reward_std": 0.1752607226371765, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 2432 }, { "clip_ratio": 0.0, "completion_length": 696.09375, "epoch": 0.9732, "grad_norm": 0.28912908361580547, "kl": 3.3203125, "learning_rate": 4.3725780178243135e-08, "loss": 0.1307, "reward": 1.98828125, "reward_std": 0.046875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2433 }, { "clip_ratio": 0.0, "completion_length": 879.9609375, "epoch": 0.9736, "grad_norm": 0.3561032884939646, "kl": 3.8203125, "learning_rate": 4.2431190614309334e-08, "loss": 0.163, "reward": 2.033203125, "reward_std": 0.2421875, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 2434 }, { "clip_ratio": 0.0, "completion_length": 883.4765625, "epoch": 0.974, "grad_norm": 0.3458337891135655, "kl": 3.890625, "learning_rate": 4.115601384029666e-08, "loss": 0.1533, "reward": 1.96875, "reward_std": 0.3598533719778061, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2435 }, { "clip_ratio": 0.0, "completion_length": 861.640625, "epoch": 0.9744, "grad_norm": 1.7928435789852954, "kl": 3.38671875, "learning_rate": 3.990025234222872e-08, "loss": 0.146, "reward": 2.1328125, "reward_std": 0.14965169876813889, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2436 }, { "clip_ratio": 0.0, "completion_length": 922.75, "epoch": 0.9748, "grad_norm": 0.49296697592525307, "kl": 3.6328125, "learning_rate": 3.866390856827495e-08, "loss": 0.1535, "reward": 2.07421875, "reward_std": 0.22332343086600304, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2437 }, { "clip_ratio": 0.0, "completion_length": 901.9375, "epoch": 0.9752, "grad_norm": 0.7095984740541652, "kl": 3.85546875, "learning_rate": 3.7446984928753984e-08, "loss": 0.1591, "reward": 2.0703125, "reward_std": 0.1818198561668396, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2438 }, { "clip_ratio": 0.0, "completion_length": 761.5859375, "epoch": 0.9756, "grad_norm": 0.753347521604869, "kl": 3.24609375, "learning_rate": 3.6249483796116924e-08, "loss": 0.1233, "reward": 2.14453125, "reward_std": 0.20642951875925064, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 2439 }, { "clip_ratio": 0.0, "completion_length": 880.7265625, "epoch": 0.976, "grad_norm": 0.3451992872591287, "kl": 3.703125, "learning_rate": 3.50714075049563e-08, "loss": 0.155, "reward": 1.9453125, "reward_std": 0.1900520622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2440 }, { "clip_ratio": 0.0, "completion_length": 845.8828125, "epoch": 0.9764, "grad_norm": 0.5168598783395286, "kl": 3.7421875, "learning_rate": 3.391275835199159e-08, "loss": 0.1574, "reward": 2.00390625, "reward_std": 0.1285141110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.98828125, "step": 2441 }, { "clip_ratio": 0.0, "completion_length": 769.0390625, "epoch": 0.9768, "grad_norm": 0.36225706139589164, "kl": 3.48828125, "learning_rate": 3.2773538596068134e-08, "loss": 0.1332, "reward": 2.154296875, "reward_std": 0.19736222177743912, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 2442 }, { "clip_ratio": 0.0, "completion_length": 713.90625, "epoch": 0.9772, "grad_norm": 0.36585526001294144, "kl": 3.171875, "learning_rate": 3.165375045815266e-08, "loss": 0.1183, "reward": 2.19140625, "reward_std": 0.153188094496727, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 2443 }, { "clip_ratio": 0.0, "completion_length": 820.0703125, "epoch": 0.9776, "grad_norm": 0.1642449622435921, "kl": 3.33203125, "learning_rate": 3.0553396121330015e-08, "loss": 0.1367, "reward": 2.0078125, "reward_std": 0.1886480376124382, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2444 }, { "clip_ratio": 0.0, "completion_length": 843.4375, "epoch": 0.978, "grad_norm": 1.4410501695584281, "kl": 3.47265625, "learning_rate": 2.947247773079753e-08, "loss": 0.141, "reward": 1.984375, "reward_std": 0.21508973836898804, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 2445 }, { "clip_ratio": 0.0, "completion_length": 650.046875, "epoch": 0.9784, "grad_norm": 0.3274996882588126, "kl": 3.2578125, "learning_rate": 2.8410997393860663e-08, "loss": 0.1389, "reward": 2.240234375, "reward_std": 0.11231021583080292, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2446 }, { "clip_ratio": 0.0, "completion_length": 889.3671875, "epoch": 0.9788, "grad_norm": 0.3376601977199013, "kl": 4.4296875, "learning_rate": 2.7368957179929602e-08, "loss": 0.1993, "reward": 2.0859375, "reward_std": 0.22949771583080292, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2447 }, { "clip_ratio": 0.0, "completion_length": 711.65625, "epoch": 0.9792, "grad_norm": 0.28527370709913186, "kl": 3.39453125, "learning_rate": 2.6346359120514863e-08, "loss": 0.1401, "reward": 2.009765625, "reward_std": 0.15216783434152603, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2448 }, { "clip_ratio": 0.0, "completion_length": 900.078125, "epoch": 0.9796, "grad_norm": 0.23431575172502933, "kl": 3.796875, "learning_rate": 2.5343205209225062e-08, "loss": 0.1599, "reward": 1.939453125, "reward_std": 0.2421875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.970703125, "step": 2449 }, { "clip_ratio": 0.0, "completion_length": 851.71875, "epoch": 0.98, "grad_norm": 0.31969578609636407, "kl": 3.26171875, "learning_rate": 2.4359497401758026e-08, "loss": 0.1229, "reward": 1.931640625, "reward_std": 0.22109829634428024, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 2450 }, { "clip_ratio": 0.0, "completion_length": 901.7421875, "epoch": 0.9804, "grad_norm": 0.21310912930173403, "kl": 4.1171875, "learning_rate": 2.339523761590301e-08, "loss": 0.1699, "reward": 2.31640625, "reward_std": 0.14876245707273483, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2451 }, { "clip_ratio": 0.0, "completion_length": 851.4375, "epoch": 0.9808, "grad_norm": 0.1649107770504175, "kl": 3.80859375, "learning_rate": 2.2450427731534052e-08, "loss": 0.1474, "reward": 1.947265625, "reward_std": 0.18020814657211304, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2452 }, { "clip_ratio": 0.0, "completion_length": 868.578125, "epoch": 0.9812, "grad_norm": 0.19120739024947633, "kl": 4.03125, "learning_rate": 2.152506959060774e-08, "loss": 0.163, "reward": 2.021484375, "reward_std": 0.3362576812505722, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.974609375, "step": 2453 }, { "clip_ratio": 0.0, "completion_length": 862.5, "epoch": 0.9816, "grad_norm": 0.48574868054583653, "kl": 3.9453125, "learning_rate": 2.061916499715544e-08, "loss": 0.17, "reward": 1.96875, "reward_std": 0.21444028615951538, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 2454 }, { "clip_ratio": 0.0, "completion_length": 876.375, "epoch": 0.982, "grad_norm": 0.39292988994026495, "kl": 4.03125, "learning_rate": 1.973271571728441e-08, "loss": 0.1677, "reward": 1.94140625, "reward_std": 0.22531834617257118, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 2455 }, { "clip_ratio": 0.0, "completion_length": 916.84375, "epoch": 0.9824, "grad_norm": 0.21134182983963426, "kl": 4.22265625, "learning_rate": 1.886572347917337e-08, "loss": 0.1783, "reward": 1.9765625, "reward_std": 0.21053429320454597, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.9765625, "step": 2456 }, { "clip_ratio": 0.0, "completion_length": 836.59375, "epoch": 0.9828, "grad_norm": 1.890672125709416, "kl": 3.12109375, "learning_rate": 1.8018189973069144e-08, "loss": 0.1118, "reward": 2.08203125, "reward_std": 0.10277669876813889, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2457 }, { "clip_ratio": 0.0, "completion_length": 913.109375, "epoch": 0.9832, "grad_norm": 0.2143098537951093, "kl": 4.62109375, "learning_rate": 1.7190116851280024e-08, "loss": 0.1872, "reward": 1.919921875, "reward_std": 0.28565485030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 2458 }, { "clip_ratio": 0.0, "completion_length": 863.421875, "epoch": 0.9836, "grad_norm": 0.363990400776322, "kl": 3.8671875, "learning_rate": 1.6381505728176872e-08, "loss": 0.1439, "reward": 2.041015625, "reward_std": 0.25939860939979553, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 2459 }, { "clip_ratio": 0.0, "completion_length": 861.3984375, "epoch": 0.984, "grad_norm": 0.3373935286108729, "kl": 4.12109375, "learning_rate": 1.5592358180189782e-08, "loss": 0.1622, "reward": 1.953125, "reward_std": 0.22796630859375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2460 }, { "clip_ratio": 0.0, "completion_length": 893.3671875, "epoch": 0.9844, "grad_norm": 0.1609628920858326, "kl": 3.765625, "learning_rate": 1.482267574580143e-08, "loss": 0.1386, "reward": 2.0390625, "reward_std": 0.2835279181599617, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96875, "step": 2461 }, { "clip_ratio": 0.0, "completion_length": 767.3203125, "epoch": 0.9848, "grad_norm": 0.2812954130416999, "kl": 3.171875, "learning_rate": 1.4072459925548176e-08, "loss": 0.1158, "reward": 1.974609375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2462 }, { "clip_ratio": 0.0, "completion_length": 717.8671875, "epoch": 0.9852, "grad_norm": 0.15035627226717005, "kl": 3.38671875, "learning_rate": 1.3341712182012301e-08, "loss": 0.1385, "reward": 1.982421875, "reward_std": 0.1328125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 2463 }, { "clip_ratio": 0.0, "completion_length": 720.8828125, "epoch": 0.9856, "grad_norm": 0.2253770143841002, "kl": 3.265625, "learning_rate": 1.2630433939825326e-08, "loss": 0.1362, "reward": 2.1015625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2464 }, { "clip_ratio": 0.0, "completion_length": 731.2734375, "epoch": 0.986, "grad_norm": 0.3489059891792058, "kl": 2.84375, "learning_rate": 1.1938626585660252e-08, "loss": 0.1116, "reward": 1.970703125, "reward_std": 0.1975904181599617, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 2465 }, { "clip_ratio": 0.0, "completion_length": 838.3125, "epoch": 0.9864, "grad_norm": 0.3025021332258777, "kl": 3.3828125, "learning_rate": 1.126629146822933e-08, "loss": 0.1338, "reward": 1.947265625, "reward_std": 0.16581370681524277, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2466 }, { "clip_ratio": 0.0, "completion_length": 876.625, "epoch": 0.9868, "grad_norm": 0.2502501081670409, "kl": 3.625, "learning_rate": 1.0613429898287397e-08, "loss": 0.1423, "reward": 2.060546875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2467 }, { "clip_ratio": 0.0, "completion_length": 771.296875, "epoch": 0.9872, "grad_norm": 0.9092824524240551, "kl": 3.90625, "learning_rate": 9.980043148619668e-09, "loss": 0.1626, "reward": 1.994140625, "reward_std": 0.1780479997396469, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2468 }, { "clip_ratio": 0.0, "completion_length": 828.8984375, "epoch": 0.9876, "grad_norm": 0.2814177530937229, "kl": 3.7890625, "learning_rate": 9.366132454046162e-09, "loss": 0.1588, "reward": 2.05859375, "reward_std": 0.30117058008909225, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2469 }, { "clip_ratio": 0.0, "completion_length": 704.5546875, "epoch": 0.988, "grad_norm": 0.24434848177630014, "kl": 3.62109375, "learning_rate": 8.771699011416169e-09, "loss": 0.1296, "reward": 2.05078125, "reward_std": 0.22777669876813889, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 2470 }, { "clip_ratio": 0.0, "completion_length": 931.078125, "epoch": 0.9884, "grad_norm": 0.4943556930221092, "kl": 4.5625, "learning_rate": 8.196743979610455e-09, "loss": 0.1983, "reward": 2.208984375, "reward_std": 0.24344199895858765, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2471 }, { "clip_ratio": 0.0, "completion_length": 819.9453125, "epoch": 0.9888, "grad_norm": 0.3735935399196857, "kl": 3.30859375, "learning_rate": 7.641268479531283e-09, "loss": 0.11, "reward": 2.056640625, "reward_std": 0.19917110353708267, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.994140625, "step": 2472 }, { "clip_ratio": 0.0, "completion_length": 802.1875, "epoch": 0.9892, "grad_norm": 0.17461405971933233, "kl": 3.19140625, "learning_rate": 7.105273594107953e-09, "loss": 0.1305, "reward": 1.947265625, "reward_std": 0.1443743333220482, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2473 }, { "clip_ratio": 0.0, "completion_length": 778.4296875, "epoch": 0.9896, "grad_norm": 0.21546498041313103, "kl": 3.6328125, "learning_rate": 6.588760368287928e-09, "loss": 0.1494, "reward": 2.08984375, "reward_std": 0.19527994096279144, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2474 }, { "clip_ratio": 0.0, "completion_length": 836.8984375, "epoch": 0.99, "grad_norm": 0.2919931715568975, "kl": 3.68359375, "learning_rate": 6.091729809042379e-09, "loss": 0.1429, "reward": 1.95703125, "reward_std": 0.21029773354530334, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.98046875, "step": 2475 }, { "clip_ratio": 0.0, "completion_length": 736.015625, "epoch": 0.9904, "grad_norm": 0.2465825392817082, "kl": 3.6171875, "learning_rate": 5.614182885357311e-09, "loss": 0.1502, "reward": 2.09765625, "reward_std": 0.07471735030412674, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2476 }, { "clip_ratio": 0.0, "completion_length": 808.9453125, "epoch": 0.9908, "grad_norm": 0.5884441795741666, "kl": 3.5, "learning_rate": 5.156120528233555e-09, "loss": 0.1416, "reward": 1.974609375, "reward_std": 0.16346796974539757, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2477 }, { "clip_ratio": 0.0, "completion_length": 789.90625, "epoch": 0.9912, "grad_norm": 0.43565775027069015, "kl": 3.9296875, "learning_rate": 4.717543630688992e-09, "loss": 0.1574, "reward": 1.966796875, "reward_std": 0.17414497584104538, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2478 }, { "clip_ratio": 0.0, "completion_length": 870.96875, "epoch": 0.9916, "grad_norm": 0.10297522120332009, "kl": 3.77734375, "learning_rate": 4.298453047749674e-09, "loss": 0.1726, "reward": 2.1328125, "reward_std": 0.14634962752461433, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.984375, "step": 2479 }, { "clip_ratio": 0.0, "completion_length": 909.8203125, "epoch": 0.992, "grad_norm": 0.1731520454605475, "kl": 4.421875, "learning_rate": 3.898849596456477e-09, "loss": 0.1813, "reward": 1.970703125, "reward_std": 0.19624444097280502, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 2480 }, { "clip_ratio": 0.0, "completion_length": 868.75, "epoch": 0.9924, "grad_norm": 0.6220229854793868, "kl": 3.65625, "learning_rate": 3.518734055855122e-09, "loss": 0.1288, "reward": 2.091796875, "reward_std": 0.3331584334373474, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2481 }, { "clip_ratio": 0.0, "completion_length": 828.2421875, "epoch": 0.9928, "grad_norm": 0.30982267402158403, "kl": 3.8125, "learning_rate": 3.1581071670006013e-09, "loss": 0.1551, "reward": 1.984375, "reward_std": 0.3123912587761879, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 2482 }, { "clip_ratio": 0.0, "completion_length": 741.8359375, "epoch": 0.9932, "grad_norm": 0.30554294063890963, "kl": 3.9296875, "learning_rate": 2.8169696329527484e-09, "loss": 0.1609, "reward": 2.1171875, "reward_std": 0.18572436273097992, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2483 }, { "clip_ratio": 0.0, "completion_length": 843.671875, "epoch": 0.9936, "grad_norm": 0.40493221667842144, "kl": 3.93359375, "learning_rate": 2.495322118778454e-09, "loss": 0.1374, "reward": 2.056640625, "reward_std": 0.2447395622730255, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.978515625, "step": 2484 }, { "clip_ratio": 0.0, "completion_length": 808.59375, "epoch": 0.994, "grad_norm": 18.75432179875533, "kl": 3.34765625, "learning_rate": 2.193165251545004e-09, "loss": 0.1349, "reward": 1.951171875, "reward_std": 0.1666145622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2485 }, { "clip_ratio": 0.0, "completion_length": 796.75, "epoch": 0.9944, "grad_norm": 0.20843181943477615, "kl": 3.36328125, "learning_rate": 1.910499620322304e-09, "loss": 0.1484, "reward": 1.955078125, "reward_std": 0.1796875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.978515625, "step": 2486 }, { "clip_ratio": 0.0, "completion_length": 803.546875, "epoch": 0.9948, "grad_norm": 0.15309450389605322, "kl": 3.609375, "learning_rate": 1.647325776182873e-09, "loss": 0.1493, "reward": 2.109375, "reward_std": 0.125, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2487 }, { "clip_ratio": 0.0, "completion_length": 632.4765625, "epoch": 0.9952, "grad_norm": 0.32152131026765446, "kl": 3.39453125, "learning_rate": 1.4036442321962995e-09, "loss": 0.103, "reward": 2.00390625, "reward_std": 0.24890750646591187, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.98046875, "step": 2488 }, { "clip_ratio": 0.0, "completion_length": 906.5546875, "epoch": 0.9956, "grad_norm": 0.3335756791467518, "kl": 3.765625, "learning_rate": 1.1794554634314558e-09, "loss": 0.1542, "reward": 1.9453125, "reward_std": 0.3041067197918892, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2489 }, { "clip_ratio": 0.0, "completion_length": 885.078125, "epoch": 0.996, "grad_norm": 0.5404521587893301, "kl": 3.671875, "learning_rate": 9.74759906957612e-10, "loss": 0.1456, "reward": 1.923828125, "reward_std": 0.243075430393219, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 2490 }, { "clip_ratio": 0.0, "completion_length": 825.6796875, "epoch": 0.9964, "grad_norm": 0.47601905537375183, "kl": 3.4453125, "learning_rate": 7.895579618388827e-10, "loss": 0.1438, "reward": 1.955078125, "reward_std": 0.2421875, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2491 }, { "clip_ratio": 0.0, "completion_length": 875.1328125, "epoch": 0.9968, "grad_norm": 0.221284627515779, "kl": 3.7265625, "learning_rate": 6.238499891353389e-10, "loss": 0.1546, "reward": 2.05078125, "reward_std": 0.2289465367794037, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 2492 }, { "clip_ratio": 0.0, "completion_length": 743.8984375, "epoch": 0.9972, "grad_norm": 0.292283557941262, "kl": 3.734375, "learning_rate": 4.77636311903007e-10, "loss": 0.1658, "reward": 2.046875, "reward_std": 0.24137328565120697, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2493 }, { "clip_ratio": 0.0, "completion_length": 853.234375, "epoch": 0.9976, "grad_norm": 2.756172306544377, "kl": 3.58984375, "learning_rate": 3.509172151938689e-10, "loss": 0.1376, "reward": 2.197265625, "reward_std": 0.25246506929397583, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2494 }, { "clip_ratio": 0.0, "completion_length": 864.6953125, "epoch": 0.998, "grad_norm": 0.18451970655257313, "kl": 3.671875, "learning_rate": 2.436929460525317e-10, "loss": 0.1476, "reward": 1.951171875, "reward_std": 0.18266618251800537, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 2495 }, { "clip_ratio": 0.0, "completion_length": 887.71875, "epoch": 0.9984, "grad_norm": 0.22274887887208566, "kl": 3.94140625, "learning_rate": 1.559637135173375e-10, "loss": 0.1615, "reward": 2.095703125, "reward_std": 0.2907291576266289, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2496 }, { "clip_ratio": 0.0, "completion_length": 828.171875, "epoch": 0.9988, "grad_norm": 0.3331646546142689, "kl": 3.80859375, "learning_rate": 8.772968862369447e-11, "loss": 0.1417, "reward": 2.05859375, "reward_std": 0.2369270622730255, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.98046875, "step": 2497 }, { "clip_ratio": 0.0, "completion_length": 858.90625, "epoch": 0.9992, "grad_norm": 0.9729682356442256, "kl": 3.5703125, "learning_rate": 3.899100439408443e-11, "loss": 0.1473, "reward": 1.947265625, "reward_std": 0.14844822883605957, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2498 }, { "clip_ratio": 0.0, "completion_length": 899.7578125, "epoch": 0.9996, "grad_norm": 0.2814091618478754, "kl": 4.17578125, "learning_rate": 9.74775584916543e-12, "loss": 0.1771, "reward": 2.03125, "reward_std": 0.34034235030412674, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 2499 }, { "clip_ratio": 0.0, "completion_length": 726.484375, "epoch": 1.0, "grad_norm": 7.83306314613933, "kl": 3.4765625, "learning_rate": 0.0, "loss": 0.1315, "reward": 2.048828125, "reward_std": 0.14998093992471695, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 2500 }, { "epoch": 1.0, "step": 2500, "total_flos": 0.0, "train_loss": 2591852.7904117624, "train_runtime": 66018.0639, "train_samples_per_second": 0.303, "train_steps_per_second": 0.038 } ], "logging_steps": 1, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }